当前位置：首页 > news >正文

VGG16猫狗二分类

news 2026/7/22 3:33:15

VGG模型

VGG是由牛津大学视觉几何组（Visual Geometry Group）提出的深度卷积神经网络模型，以其简洁的架构和优异的性能著称。VGG-16和VGG-19是两种常用变体，分别包含16层和19层权重层。VGG的核心特点是使用多个连续的3×3卷积核代替大尺寸卷积核，在减少参数量的同时增强了非线性表达能力。

数据集准备

猫狗二分类任务通常使用Kaggle的"Dogs vs. Cats"数据集，包含25,000张训练图片（12,500狗/12,500猫）和12,500张测试图片。数据预处理包括：

统一调整为224×224像素（VGG输入尺寸）
数据增强：随机旋转、水平翻转、缩放等
像素值归一化：ToTensor 先将像素值缩放到[0,1]范围，下文代码再通过 Normalize 将其映射到[-1,1]
划分训练集/验证集（如80%/20%比例）

模型构建

使用VGG16作为基础模型

"""Train a VGG16 network from scratch for cat-vs-dog binary classification."""
import os

import torch
import torch.nn as nn
import torch.optim as optim  # optimizers
from PIL import Image
from torch.utils.data import DataLoader  # batching and shuffling
from torchvision import transforms  # image preprocessing
from torchvision.datasets import ImageFolder  # one sub-folder per class

# -------------------------- 1. Hyperparameters --------------------------
batch_size = 32  # lower this (16/8/4) if GPU memory runs out
learning_rate = 0.001  # step size of each parameter update
num_epochs = 10  # number of passes over the training set
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")

# Directory that receives the saved checkpoints.
os.makedirs("model", exist_ok=True)

# -------------------------- 2. Data loading and preprocessing --------------------------
# Preprocessing pipeline, applied in order:
#   Resize    - the classifier below expects a 512*7*7 feature map, which
#               five 2x2 poolings only yield for 224x224 inputs.
#   ToTensor  - converts a PIL image into a float tensor scaled to [0, 1].
#   Normalize - with mean 0.5 and std 0.5 per channel, maps [0, 1] to [-1, 1].
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Load the datasets (change the paths to your own dataset location).
train_dataset = ImageFolder(root="D:/PyCharm/Py_Projects/VGG/train", transform=transform)
test_dataset = ImageFolder(root="D:/PyCharm/Py_Projects/VGG/test", transform=transform)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

# Show the folder-name -> label-index mapping chosen by ImageFolder.
print(f"类别映射: {train_dataset.class_to_idx}")


# -------------------------- 3. VGG16 model --------------------------
class VGG16(nn.Module):
    """VGG16: 13 convolutional layers in 5 blocks plus 3 fully connected layers.

    The final layer has 2 outputs (one logit per class).
    """

    def __init__(self):
        super().__init__()
        # Feature extractor: five conv blocks, each ending in a 2x2 max-pool.
        self.features = nn.Sequential(
            # Block 1
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # Block 2
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # Block 3
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # Block 4
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # Block 5
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        # Classifier: three fully connected layers.
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),  # regularization against overfitting
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(4096, 2),  # two-class output
        )

    def forward(self, x):
        """Return the class logits for a batch of images."""
        x = self.features(x)
        x = torch.flatten(x, 1)  # keep the batch dimension, flatten the rest
        return self.classifier(x)


model = VGG16().to(device)

# -------------------------- 4. Loss function and optimizer --------------------------
criterion = nn.CrossEntropyLoss()
# SGD with momentum; optim.Adam(model.parameters(), lr=0.001) is an alternative.
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

# -------------------------- 5. Training --------------------------
print("开始训练...")
total_step = len(train_loader)
for epoch in range(num_epochs):
    model.train()  # enables Dropout (this network has no BatchNorm layers)
    running_loss = 0.0
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass.
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Report progress every 100 steps.
        if (i + 1) % 100 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{total_step}], Loss: {loss.item():.4f}")

    # Save a checkpoint at the end of every epoch.
    torch.save(model.state_dict(), f"model/vgg16_epoch_{epoch+1}.pth")
    print(f"Epoch {epoch+1} 训练完成，平均损失: {running_loss/total_step:.4f}")

# Save the final weights.
torch.save(model.state_dict(), "model/vgg16_final.pth")
print("训练完成，最终模型已保存")

# -------------------------- 6. Evaluation --------------------------
print("开始测试...")
model.eval()  # disables Dropout
correct = 0
total = 0
with torch.no_grad():  # no gradients are needed for inference
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        predicted = model(images).argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(f"测试集准确率: {(correct / total) * 100:.2f}%")


# -------------------------- 7. Single-image prediction (optional) --------------------------
def predict_image(image_path):
    """Classify one image file with the trained model.

    Args:
        image_path: Path of the image to classify.

    Returns:
        The label from ``class_names`` at the predicted class index.
    """
    model.eval()
    image = Image.open(image_path).convert("RGB")
    image = transform(image).unsqueeze(0).to(device)  # add the batch dimension
    with torch.no_grad():
        predicted = model(image).argmax(dim=1)
    # NOTE(review): assumes label index 0 is cat and 1 is dog - confirm
    # against the class mapping printed above.
    class_names = ["猫", "狗"]
    return class_names[predicted.item()]


# Example (change the path to your own image):
# print(predict_image("test1.jpg"))

模型改良（迁移学习）

利用在 ImageNet 上预训练好的 VGG16 模型（完整 ImageNet 约有 1400 万张图片；PyTorch 官方提供的预训练权重使用其中的 ImageNet-1K 子集，约 128 万张训练图片、1000 个类别），加载这些预训练权重，冻结卷积特征提取层，用自己的本地猫狗数据集只训练全连接分类器（其中最后一层被替换为 2 分类输出）。（预训练模型学到的是通用视觉特征，可以迁移到几乎任何图像分类任务）

"""Fine-tune an ImageNet-pretrained VGG16 for cat-vs-dog classification."""
import os

import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torchvision.models import vgg16, VGG16_Weights  # pretrained weights

# -------------------------- 1. Hyperparameters (fine-tuning) --------------------------
batch_size = 64  # raise to 128 with enough GPU memory, lower to 32/16 otherwise
learning_rate = 0.0001  # 10x smaller than the from-scratch learning rate
num_epochs = 5  # only the classifier is trained, so a few epochs are used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")

# Directory that receives the saved checkpoints.
os.makedirs("model", exist_ok=True)

# -------------------------- 2. Data preprocessing --------------------------
# The pretrained weights expect the ImageNet normalization statistics,
# not the (0.5, 0.5, 0.5) values used when training from scratch.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],  # ImageNet channel means
        std=[0.229, 0.224, 0.225],  # ImageNet channel standard deviations
    ),
])

# Load the datasets (same directory layout as before).
train_dataset = ImageFolder(root="D:/PyCharm/Py_Projects/VGG/train", transform=transform)
test_dataset = ImageFolder(root="D:/PyCharm/Py_Projects/VGG/test", transform=transform)

use_pinned_memory = torch.cuda.is_available()
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,  # keep 0 on Windows; e.g. 4 can speed things up on Linux/macOS
    pin_memory=use_pinned_memory,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    pin_memory=use_pinned_memory,
)

# Optional: print(f"类别映射: {train_dataset.class_to_idx}")
print(f"训练集大小: {len(train_dataset)} 张图片")
print(f"测试集大小: {len(test_dataset)} 张图片")

# -------------------------- 3. Pretrained VGG16 with a new head --------------------------
# The first run downloads the pretrained weights and caches them locally.
model = vgg16(weights=VGG16_Weights.DEFAULT)

# Freeze the convolutional feature extractor. Its parameters receive no
# gradients, so only the fully connected classifier is updated in training.
for param in model.features.parameters():
    param.requires_grad = False

# torchvision's VGG16 classifier is
#   Linear(25088->4096) -> ReLU -> Dropout ->
#   Linear(4096->4096) -> ReLU -> Dropout -> Linear(4096->1000)
# so index 6 is the final layer; replace it with a 2-class (cat/dog) output.
model.classifier[6] = nn.Linear(in_features=4096, out_features=2)

model = model.to(device)

# Parameter counts. The trainable ones are all three classifier layers,
# not just the replaced last layer.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
# Optional: print(f"总参数: {total_params / 1e6:.2f}M，可训练参数: {trainable_params / 1e6:.2f}M")

# -------------------------- 4. Loss function and optimizer --------------------------
criterion = nn.CrossEntropyLoss()
# Only the classifier parameters are optimized; the conv layers are frozen.
# optim.Adam(model.classifier.parameters(), lr=learning_rate) is an alternative.
optimizer = optim.SGD(model.classifier.parameters(), lr=learning_rate, momentum=0.9)

# -------------------------- 5. Training --------------------------
print("\n开始微调训练...")
best_acc = 0.0  # best accuracy seen so far; decides which checkpoint is kept
for epoch in range(num_epochs):
    model.train()  # enables Dropout
    running_loss = 0.0
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass.
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Report every 50 steps (the larger batch size means fewer steps).
        if (i + 1) % 50 == 0:
            print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}")

    # Evaluate after every epoch.
    # NOTE(review): the "best" checkpoint is selected on the test split, so
    # the reported best accuracy is optimistic; a separate validation split
    # would avoid this.
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            predicted = model(images).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    epoch_acc = correct / total * 100
    avg_loss = running_loss / len(train_loader)
    print(f"\nEpoch {epoch + 1} 完成")
    print(f"平均训练损失: {avg_loss:.4f}")
    print(f"测试集准确率: {epoch_acc:.2f}%\n")

    # Keep the checkpoint with the highest accuracy so far.
    if epoch_acc > best_acc:
        best_acc = epoch_acc
        torch.save(model.state_dict(), "model/vgg16_pretrained_best.pth")
        print(f"保存最佳模型，当前最高准确率: {best_acc:.2f}%")

# Save the final weights.
torch.save(model.state_dict(), "model/vgg16_pretrained_final.pth")
print(f"训练完成！最高测试准确率: {best_acc:.2f}%")


# -------------------------- 6. Single-image prediction --------------------------
def predict_image(image_path):
    """Classify one image file with the best saved checkpoint.

    Args:
        image_path: Path of the image to classify.

    Returns:
        A message containing the predicted label and the softmax
        probability of that label as a percentage.
    """
    # Reload the best checkpoint so the prediction does not depend on
    # whatever weights the global model currently holds.
    model.load_state_dict(torch.load("model/vgg16_pretrained_best.pth", map_location=device))
    model.eval()

    image = Image.open(image_path).convert("RGB")
    image_tensor = transform(image).unsqueeze(0).to(device)  # shape [1, 3, 224, 224]

    with torch.no_grad():
        output = model(image_tensor)
        # Softmax turns the logits into probabilities, so the result also
        # tells how confident the model is in its prediction.
        probabilities = nn.functional.softmax(output, dim=1)
        confidence, predicted = torch.max(probabilities, 1)

    # NOTE(review): assumes label index 0 is cat and 1 is dog - confirm
    # against train_dataset.class_to_idx.
    class_names = ["猫", "狗"]
    return f"预测结果: {class_names[predicted.item()]}，置信度: {confidence.item() * 100:.2f}%"


# -------------------------- 7. Try a single image --------------------------
# Replace the path with your own image. The call is skipped when the file is
# missing so that a finished training run does not end in FileNotFoundError.
sample_image = "test1.jpg"
if os.path.exists(sample_image):
    print(predict_image(sample_image))
else:
    print(f"未找到示例图片 {sample_image}，跳过单张图片预测")

模型评估

可使用混淆矩阵、ROC曲线和AUC值评估模型性能：

# Evaluate the trained PyTorch model on the test set.
# Reuses `model`, `test_loader`, `test_dataset` and `device` from the
# training script above.
import torch
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

model.eval()
y_true = []   # ground-truth label indices
y_pred = []   # predicted label indices
y_score = []  # predicted probability of label index 1, used for the AUC
with torch.no_grad():
    for images, labels in test_loader:
        logits = model(images.to(device))
        probs = torch.softmax(logits, dim=1)
        y_true.extend(labels.tolist())
        y_pred.extend(probs.argmax(dim=1).cpu().tolist())
        y_score.extend(probs[:, 1].cpu().tolist())

print(classification_report(y_true, y_pred, target_names=test_dataset.classes))
print(confusion_matrix(y_true, y_pred))
print(f"AUC: {roc_auc_score(y_true, y_score):.4f}")