VGG16猫狗二分类
VGG模型
VGG是由牛津大学视觉几何组(Visual Geometry Group)提出的深度卷积神经网络模型,以其简洁的架构和优异的性能著称。VGG-16和VGG-19是两种常用变体,分别包含16层和19层权重层。VGG的核心特点是使用多个连续的3×3卷积核代替大尺寸卷积核,在减少参数量的同时增强了非线性表达能力。
数据集准备
猫狗二分类任务通常使用Kaggle的"Dogs vs. Cats"数据集,包含25,000张训练图片(12,500狗/12,500猫)和12,500张测试图片。数据预处理包括:
统一调整为224×224像素(VGG输入尺寸)
数据增强:随机旋转、水平翻转、缩放等
像素值先缩放到[0,1]范围,再按通道均值和标准差做标准化(下文从头训练的代码将其映射到[-1,1],迁移学习的代码使用ImageNet的均值和标准差)
划分训练集/验证集(如80%/20%比例)
模型构建
使用VGG16作为基础模型
import os

import torch
import torch.nn as nn
import torch.optim as optim  # optimizer module
from PIL import Image
from torch.utils.data import DataLoader  # batching and shuffling of a dataset
from torchvision import transforms  # image preprocessing utilities
from torchvision.datasets import ImageFolder  # loader for classification datasets

# -------------------------- 1. Hyperparameters --------------------------
batch_size = 32  # lower this (16/8/4) when GPU memory runs out
learning_rate = 0.001  # step size of each parameter update
num_epochs = 10  # number of training epochs; tune to the observed results
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"使用设备: {device}")

# Checkpoints are written into ./model, so create it up front.
os.makedirs("model", exist_ok=True)

# -------------------------- 2. Data loading and preprocessing --------------------------
# The pipeline applies these steps in order:
#   Resize    - scale every image to 224x224, the fixed input size of VGG16.
#   ToTensor  - PIL image (values 0-255, shape H x W x C) -> tensor
#               (values 0-1, shape C x H x W).
#   Normalize - mean and std of 0.5 on each RGB channel map 0-1 onto -1..1;
#               the original author notes this speeds up convergence.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]
transform = transforms.Compose(preprocessing_steps)

# Datasets (change the paths to where your data lives).
train_dataset = ImageFolder(
    root="D:/PyCharm/Py_Projects/VGG/train",
    transform=transform,
)
test_dataset = ImageFolder(
    root="D:/PyCharm/Py_Projects/VGG/test",
    transform=transform,
)

# Only the training data is shuffled; both loaders load in the main process.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
)

# Show the class-name -> index mapping (expected: cat:0, dog:1).
print(f"类别映射: {train_dataset.class_to_idx}")

# -------------------------- 3.
# Build the VGG16 model --------------------------
# Five convolutional blocks (feature extraction) followed by three fully
# connected layers (classification): 13 conv + 3 linear = 16 weight layers.
class VGG16(nn.Module):
    """VGG16 without batch normalisation, for 3-channel 224x224 inputs.

    The five 2x2 max-pools reduce a 224x224 image to a 7x7 map with 512
    channels, which is the size the first linear layer is built for.

    Args:
        num_classes: Number of output logits. Defaults to 2 (cat / dog), so
            ``VGG16()`` builds exactly the network this script always used.
    """

    # Output channels of each 3x3 convolution, in order; "M" closes a block
    # with a 2x2 max-pool.
    _LAYOUT = (
        64, 64, "M",
        128, 128, "M",
        256, 256, 256, "M",
        512, 512, 512, "M",
        512, 512, 512, "M",
    )

    def __init__(self, num_classes: int = 2) -> None:
        super().__init__()

        # Feature extractor. Layers are appended in the same order as the
        # hand-written listing, so indices inside ``features`` (and therefore
        # state-dict keys of saved checkpoints) are unchanged.
        layers = []
        in_channels = 3
        for spec in self._LAYOUT:
            if spec == "M":
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
                continue
            layers.append(nn.Conv2d(in_channels, spec, kernel_size=3, padding=1))
            layers.append(nn.ReLU(inplace=True))
            in_channels = spec
        self.features = nn.Sequential(*layers)

        # Classifier: three fully connected layers with dropout in between.
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),  # regularisation against overfitting
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return class logits for a batch of images."""
        x = self.features(x)
        x = torch.flatten(x, 1)  # keep the batch dimension, flatten the rest
        return self.classifier(x)


# Instantiate the model and move it to the selected device.
model = VGG16().to(device)

# -------------------------- 4.
# Define the loss function and optimizer --------------------------
criterion = nn.CrossEntropyLoss()  # standard loss for classification
# Stochastic gradient descent with momentum over every model parameter.
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# -------------------------- 5. Train the model --------------------------
print("开始训练...")
total_step = len(train_loader)
for epoch in range(num_epochs):
    epoch_no = epoch + 1
    # Training mode: enables Dropout (this VGG16 has no BatchNorm layers).
    model.train()
    running_loss = 0.0

    for step, (images, labels) in enumerate(train_loader, start=1):
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass.
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update.
        optimizer.zero_grad()  # clear gradients left over from the last step
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss

        # Progress line every 100 steps.
        if step % 100 == 0:
            print(f"Epoch [{epoch_no}/{num_epochs}], Step [{step}/{total_step}], Loss: {batch_loss:.4f}")

    # Write a checkpoint at the end of every epoch.
    torch.save(model.state_dict(), f"model/vgg16_epoch_{epoch_no}.pth")
    print(f"Epoch {epoch_no} 训练完成,平均损失: {running_loss/total_step:.4f}")

# Save the final weights.
torch.save(model.state_dict(), "model/vgg16_final.pth")
print("训练完成,最终模型已保存")

# -------------------------- 6. Test the model --------------------------
print("开始测试...")
model.eval()  # evaluation mode: disables Dropout
correct = 0
total = 0
with torch.no_grad():  # no gradients are needed for inference
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        predicted = model(images).argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = (correct / total) * 100
print(f"测试集准确率: {accuracy:.2f}%")


# -------------------------- 7. Single-image prediction (optional) --------------------------
def predict_image(image_path):
    """Classify one image file with the module-level ``model``.

    The image is opened, converted to RGB, run through the module-level
    ``transform`` and evaluated as a batch of one on ``device``.

    Args:
        image_path: Path of the image file to classify.

    Returns:
        "猫" when the predicted class index is 0, "狗" when it is 1.
    """
    labels_by_index = ["猫", "狗"]
    model.eval()
    rgb_image = Image.open(image_path).convert("RGB")
    batch = transform(rgb_image).unsqueeze(0).to(device)  # add the batch dimension
    with torch.no_grad():
        class_index = model(batch).argmax(dim=1).item()
    return labels_by_index[class_index]


# Try a single image (change the path to your own picture).
# print(predict_image("test1.jpg"))

# Next section of the article: 模型改良(迁移学习)
加载 PyTorch(torchvision)官方提供的、在 ImageNet(1400 万张图片)上预训练好的 VGG16 权重,冻结卷积特征提取层,把分类器的最后一层替换为 2 分类输出,再用自己的本地猫狗数据集只训练全连接分类器部分。(预训练模型学到的是通用视觉特征,可以迁移到几乎任何图像分类任务)
import os

import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torchvision.models import vgg16, VGG16_Weights  # pretrained weights

# -------------------------- 1. Hyperparameters (fine-tuning) --------------------------
batch_size = 64  # 128 if GPU memory allows; drop to 32/16 if it does not
learning_rate = 0.0001  # fine-tuning uses a 10x smaller rate than training from scratch
# Only the classifier head is trained; the original author reports that 3-5
# epochs are enough and that more tends to overfit.
num_epochs = 5
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(f"使用设备: {device}")

# Checkpoints are written into ./model, so create it up front.
os.makedirs("model", exist_ok=True)

# -------------------------- 2. Data preprocessing --------------------------
# Use the normalisation statistics the ImageNet-pretrained VGG16 was trained
# with, not the (0.5, 0.5, 0.5) used for the from-scratch model.
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
])

# Datasets (same directory layout as before).
train_dataset = ImageFolder(root="D:/PyCharm/Py_Projects/VGG/train", transform=transform)
test_dataset = ImageFolder(root="D:/PyCharm/Py_Projects/VGG/test", transform=transform)

# Options shared by both loaders. num_workers stays 0 on Windows; the
# original author suggests 4 on Linux/macOS. Pinned memory is only requested
# when CUDA is available.
loader_options = {
    "batch_size": batch_size,
    "num_workers": 0,
    "pin_memory": use_cuda,
}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

# print(f"类别映射: {train_dataset.class_to_idx}")
print(f"训练集大小: {len(train_dataset)} 张图片")
print(f"测试集大小: {len(test_dataset)} 张图片")

# -------------------------- 3.
# Load the pretrained VGG16 and replace its classification head --------------------------
# VGG16 pretrained on ImageNet. Per the original author, the weights are
# downloaded on the first run and cached locally afterwards.
model = vgg16(weights=VGG16_Weights.DEFAULT)

# Freeze the convolutional feature extractor (model.features, the 13 conv
# layers) so its pretrained parameters are not updated during training.
# Everything in model.classifier stays trainable.
model.features.requires_grad_(False)

# Swap the last fully connected layer (classifier index 6, 4096 -> 1000
# ImageNet classes) for a fresh 4096 -> 2 layer (cat / dog).
model.classifier[6] = nn.Linear(in_features=4096, out_features=2)

# Move the model to the GPU/CPU selected earlier.
model = model.to(device)

# Parameter counts. The trainable ones are the parameters of the whole
# classifier, not only of the replaced last layer.
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
# print(f"总参数: {total_params / 1e6:.2f}M,可训练参数: {trainable_params / 1e6:.2f}M")

# -------------------------- 4. Loss function and optimizer --------------------------
criterion = nn.CrossEntropyLoss()
# Only the classifier parameters are handed to the optimizer, because the
# convolutional layers are frozen.
optimizer = optim.SGD(
    model.classifier.parameters(),
    lr=learning_rate,
    momentum=0.9,
)
# Adam is an alternative the original author suggests for faster convergence:
# optimizer = optim.Adam(model.classifier.parameters(), lr=learning_rate)

# -------------------------- 5.
# Train the model --------------------------
print("\n开始微调训练...")
best_acc = 0.0  # best accuracy seen so far; decides when a checkpoint is kept
steps_per_epoch = len(train_loader)

for epoch in range(num_epochs):
    epoch_no = epoch + 1

    # ---- training pass ----
    model.train()  # training mode (enables Dropout)
    running_loss = 0.0
    for step, (images, labels) in enumerate(train_loader, start=1):
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass.
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss

        # Progress line every 50 steps (fewer steps per epoch than before
        # because the batch size is larger).
        if step % 50 == 0:
            print(f"Epoch [{epoch_no}/{num_epochs}], Step [{step}/{steps_per_epoch}], Loss: {batch_loss:.4f}")

    # ---- evaluation pass on the test set after every epoch ----
    # NOTE(review): the best checkpoint is selected with this same test set,
    # so the reported best accuracy is an optimistic estimate.
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            predicted = model(images).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    epoch_acc = correct / total * 100
    avg_loss = running_loss / steps_per_epoch

    print(f"\nEpoch {epoch_no} 完成")
    print(f"平均训练损失: {avg_loss:.4f}")
    print(f"测试集准确率: {epoch_acc:.2f}%\n")

    # Keep the weights only when this epoch beats the best accuracy so far.
    if epoch_acc > best_acc:
        best_acc = epoch_acc
        torch.save(model.state_dict(), "model/vgg16_pretrained_best.pth")
        print(f"保存最佳模型,当前最高准确率: {best_acc:.2f}%")

# Save the weights of the last epoch as well.
torch.save(model.state_dict(), "model/vgg16_pretrained_final.pth")
print(f"训练完成!最高测试准确率: {best_acc:.2f}%")

# -------------------------- 6.
# Single-image prediction --------------------------
def predict_image(image_path):
    """Classify one image with the best fine-tuned checkpoint.

    The weights in "model/vgg16_pretrained_best.pth" are loaded into the
    module-level ``model`` on every call, so the result does not depend on
    whatever weights ``model`` held before. The image is converted to RGB,
    run through the module-level ``transform`` and evaluated as a batch of
    one on ``device``.

    Args:
        image_path: Path of the image file to classify.

    Returns:
        A string with the predicted class name ("猫" for index 0, "狗" for
        index 1) and the softmax probability of that class as a percentage.
    """
    # Restore the best checkpoint written by the training loop.
    model.load_state_dict(torch.load("model/vgg16_pretrained_best.pth", map_location=device))
    model.eval()

    # Preprocess the picture; the file is closed as soon as it is converted.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")
    image_tensor = transform(image).unsqueeze(0)  # add batch dimension: [1, 3, 224, 224]
    image_tensor = image_tensor.to(device)

    with torch.no_grad():
        output = model(image_tensor)
        # Softmax turns the logits into probabilities, so the caller also
        # learns how confident the model is, not just which class it chose.
        probabilities = nn.functional.softmax(output, dim=1)
        confidence, predicted = torch.max(probabilities, 1)

    class_names = ["猫", "狗"]
    return f"预测结果: {class_names[predicted.item()]},置信度: {confidence.item() * 100:.2f}%"


# -------------------------- 7. Try a single image --------------------------
# Replace the path with your own picture. The demo is skipped when the file
# is missing, so a finished training run no longer ends in a crash.
sample_image_path = "test1.jpg"
if os.path.isfile(sample_image_path):
    print(predict_image(sample_image_path))
else:
    print(f"未找到图片 {sample_image_path},已跳过单张图片预测")
# print(predict_image("test_cat.jpg"))

# Next section of the article: 模型评估
可使用混淆矩阵、ROC曲线和AUC值评估模型性能:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Evaluate the PyTorch model from the scripts above on the test set.
# Collects the true labels, the predicted class indices and the predicted
# probability of class index 1 for every test image.
model.eval()
y_true, y_pred, y_score = [], [], []
with torch.no_grad():
    for images, labels in test_loader:
        probabilities = torch.softmax(model(images.to(device)), dim=1)
        y_true.extend(labels.tolist())
        y_pred.extend(probabilities.argmax(dim=1).tolist())
        y_score.extend(probabilities[:, 1].tolist())

print(classification_report(y_true, y_pred))
print(confusion_matrix(y_true, y_pred))
# AUC is computed from the class-1 probabilities rather than the hard labels.
print(f"AUC: {roc_auc_score(y_true, y_score):.4f}")

# Next section of the article: 常见问题解决
过拟合:增加Dropout层、数据增强、权重正则化
欠拟合:减少Dropout率、增加全连接层神经元数量
训练慢:使用预训练权重、冻结部分层、在显存允许时适当增大批量大小(减小批量大小主要用于缓解显存不足,通常不会加快训练)
内存不足:降低图像分辨率、减小批量大小、使用生成器
实际应用中可根据具体需求调整模型深度和训练策略。
