当前位置: 首页 > news >正文

Day40 早停策略和模型权重的保存

@浙大疏锦行

作业:对信贷数据集进行训练后保存权重,然后继续训练50次,采取早停策略

# Day 40 exercise: train a classifier, save the weights, then resume training
# for up to 50 more epochs under the same early-stopping policy.
import os
import time
import warnings

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

warnings.filterwarnings("ignore")

# Prefer GPU; change to e.g. "cuda:1" to pin a specific card.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")
if torch.cuda.is_available():
    print(f"GPU名称: {torch.cuda.get_device_name(0)}")
    torch.cuda.empty_cache()  # drop cached allocations from earlier runs

# NOTE(review): the write-up says "credit dataset", but the code actually
# loads iris (4 features / 3 classes). Swap in the credit data and resize
# the MLP layers accordingly.
iris = load_iris()
X = iris.data    # feature matrix, shape (150, 4)
y = iris.target  # integer class labels 0..2

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, to avoid test-set leakage.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full-batch training: move everything to the device once, up front.
X_train = torch.FloatTensor(X_train).to(device)
y_train = torch.LongTensor(y_train).to(device)
X_test = torch.FloatTensor(X_test).to(device)
y_test = torch.LongTensor(y_test).to(device)


class MLP(nn.Module):
    """4 -> 10 (ReLU) -> 3 classifier; resize the layers for other datasets."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(4, 10)   # input layer
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(10, 3)   # output layer (raw logits)

    def forward(self, x):
        # CrossEntropyLoss expects logits, so no softmax here.
        return self.fc2(self.relu(self.fc1(x)))


model = MLP().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# ---------------- first training run ----------------
first_train_epochs = 20000
train_losses = []  # sampled every 200 epochs
test_losses = []
epochs = []

# Early stopping (same policy reused by the resumed run). `patience` counts
# *evaluation points* — one per 200 epochs here — not raw epochs.
best_test_loss = float('inf')
best_epoch = 0
patience = 50
counter = 0
early_stopped = False

print("\n===== 开始首次训练 =====")
start_time = time.time()

with tqdm(total=first_train_epochs, desc="首次训练进度", unit="epoch") as pbar:
    for epoch in range(first_train_epochs):
        model.train()
        outputs = model(X_train)
        train_loss = criterion(outputs, y_train)

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # Evaluate (and run the early-stop check) every 200 epochs.
        if (epoch + 1) % 200 == 0:
            model.eval()
            with torch.no_grad():
                test_outputs = model(X_test)
                test_loss = criterion(test_outputs, y_test)
            train_losses.append(train_loss.item())
            test_losses.append(test_loss.item())
            epochs.append(epoch + 1)
            pbar.set_postfix({'Train Loss': f'{train_loss.item():.4f}',
                              'Test Loss': f'{test_loss.item():.4f}'})

            if test_loss.item() < best_test_loss:
                best_test_loss = test_loss.item()
                best_epoch = epoch + 1
                counter = 0
                # Keep the best weights seen so far.
                torch.save(model.state_dict(), 'best_model.pth')
            else:
                counter += 1
                if counter >= patience:
                    print(f"\n首次训练早停触发!在第{epoch+1}轮,测试集损失已有{patience}轮未改善。")
                    print(f"最佳测试集损失出现在第{best_epoch}轮,损失值为{best_test_loss:.4f}")
                    early_stopped = True
                    break

        # Advance the bar in coarse steps to keep tqdm overhead low.
        if (epoch + 1) % 1000 == 0:
            pbar.update(1000)

    # Fix: only pad the bar to 100% when the run actually completed; the
    # original padded it even after an early stop, misreporting epochs run.
    if not early_stopped and pbar.n < first_train_epochs:
        pbar.update(first_train_epochs - pbar.n)

# Save a resumable checkpoint (weights + optimizer state + bookkeeping).
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'epoch': epoch + 1,
    'best_loss': best_test_loss,
}, 'trained_model.pth')
print(f"\n首次训练完成,权重已保存至 trained_model.pth")
print(f"首次训练总耗时: {time.time() - start_time:.2f} 秒")

# ---------------- resume training ----------------
print("\n===== 加载权重并开始继续训练 =====")
checkpoint = torch.load('trained_model.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
print(f"加载了首次训练至第{checkpoint['epoch']}轮的权重,最佳损失: {checkpoint['best_loss']:.4f}")

# Fresh optimizer for the resumed run; uncomment the load below instead to
# carry over SGD state (e.g. momentum buffers) from the checkpoint.
optimizer = optim.SGD(model.parameters(), lr=0.01)
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

continue_train_epochs = 50  # target: 50 additional epochs
continue_train_losses = []
continue_test_losses = []
continue_epochs = []

# The resumed run's early stopping starts from the checkpoint's best loss,
# so it only saves when it genuinely improves on the first run.
continue_best_loss = checkpoint['best_loss']
continue_counter = 0
continue_early_stop = False

start_continue_time = time.time()

with tqdm(total=continue_train_epochs, desc="继续训练进度", unit="epoch") as pbar:
    for epoch in range(continue_train_epochs):
        model.train()
        outputs = model(X_train)
        train_loss = criterion(outputs, y_train)

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # Few epochs here, so evaluate every epoch.
        model.eval()
        with torch.no_grad():
            test_outputs = model(X_test)
            test_loss = criterion(test_outputs, y_test)
        continue_train_losses.append(train_loss.item())
        continue_test_losses.append(test_loss.item())
        continue_epochs.append(epoch + 1)

        pbar.set_postfix({'Train Loss': f'{train_loss.item():.4f}',
                          'Test Loss': f'{test_loss.item():.4f}'})
        pbar.update(1)

        if test_loss.item() < continue_best_loss:
            continue_best_loss = test_loss.item()
            continue_counter = 0
            torch.save(model.state_dict(), 'continue_best_model.pth')
        else:
            continue_counter += 1
            if continue_counter >= patience:
                print(f"\n继续训练早停触发!在第{epoch+1}轮,测试集损失已有{patience}轮未改善。")
                print(f"继续训练最佳损失: {continue_best_loss:.4f}")
                continue_early_stop = True
                break

print(f"继续训练完成,总耗时: {time.time() - start_continue_time:.2f} 秒")
print(f"继续训练实际轮数: {len(continue_epochs)} 轮(早停触发则少于50轮)")

# ---------------- final evaluation ----------------
print("\n===== 最终模型评估 =====")
# Fix: 'continue_best_model.pth' is only written when the resumed run improved
# on the checkpoint's best loss; the original crashed with FileNotFoundError
# otherwise. Fall back to the first run's best weights in that case.
# (Delete stale .pth files from previous runs before relying on this check.)
final_weights = ('continue_best_model.pth'
                 if os.path.exists('continue_best_model.pth')
                 else 'best_model.pth')
model.load_state_dict(torch.load(final_weights, map_location=device))
model.eval()
with torch.no_grad():
    outputs = model(X_test)
    _, predicted = torch.max(outputs, 1)
    correct = (predicted == y_test).sum().item()
    accuracy = correct / y_test.size(0)
    print(f'测试集最终准确率: {accuracy * 100:.2f}%')

# ---------------- visualization ----------------
# NOTE(review): the Chinese plot titles need a CJK-capable matplotlib font
# (e.g. plt.rcParams['font.sans-serif'] = ['SimHei']) to render correctly.
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.plot(epochs, train_losses, label='Train Loss')
plt.plot(epochs, test_losses, label='Test Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('首次训练损失曲线')
plt.legend()
plt.grid(True)

plt.subplot(1, 2, 2)
plt.plot(continue_epochs, continue_train_losses, label='Train Loss')
plt.plot(continue_epochs, continue_test_losses, label='Test Loss')
plt.xlabel('Continue Epoch')
plt.ylabel('Loss')
plt.title('继续训练50轮损失曲线')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()
http://www.jsqmd.com/news/282567/

相关文章:

  • AI说话人拆分实战:基于Speech Seaco的多角色语音处理
  • 如何验证MinerU安装成功?test.pdf运行结果查看指南
  • BERT填空AI生产环境落地:稳定性与兼容性实测报告
  • 从零部署DeepSeek OCR模型|WebUI镜像简化流程,支持单卡推理
  • GPEN教育场景应用:学生证件照自动美化系统搭建
  • 3步搞定Llama3部署:Open-WebUI可视化界面教程
  • YOLO26镜像功能全测评:目标检测新标杆
  • 为什么要学数字滤波器与C语言实现
  • Z-Image-Turbo推理延迟高?9步生成优化技巧实战分享
  • 创建型模式:简单工厂模式(C语言实现)
  • 语音社交App创新:用SenseVoiceSmall增加情感互动反馈
  • Glyph启动失败?常见错误代码排查步骤详解教程
  • 对比实测:自己搭环境 vs 使用预置镜像微调效率差异
  • 语音标注预处理:FSMN-VAD辅助人工标注实战案例
  • 效果展示:Qwen3-Reranker-4B打造的智能文档排序案例
  • Z-Image-Turbo生成动漫角色全过程分享
  • 实时性要求高的场景:FSMN-VAD流式处理可能性分析
  • NewBie-image-Exp0.1内存泄漏?长时运行稳定性优化指南
  • MinerU vs 其他PDF提取工具:多模态模型性能实战对比评测
  • 科哥定制FunASR镜像实战|轻松实现语音识别与标点恢复
  • 从零部署高性能OCR:DeepSeek-OCR-WEBUI镜像快速上手
  • SenseVoiceSmall情感标签解析:HAPPY/ANGRY识别后处理代码实例
  • 零配置体验阿里Qwen-Image-2512,开箱即用真省心
  • 一看就会的verl教程:无需深度学习背景
  • AI团队部署必看:Llama3生产环境最佳实践指南
  • Qwen3-4B函数调用不稳定?工具使用优化部署教程
  • Z-Image-Turbo显存管理技巧:generator手动设seed避坑法
  • Kubernetes 高频部署 CI/CD 架构实战指南
  • bert-base-chinese功能全测评:中文文本分类真实表现
  • UI-TARS-desktop开箱体验:一键启动的多模态AI工作台