【深度学习代码流程】李宏毅机器学习HW-1:预测美国COVID-19阳性率
eg:美国新冠预测
参考:【2022版李宏毅机器学习作业讲解】-HW1_哔哩哔哩_bilibili
一、引入需要的库
import math import numpy as np #矩阵处理 import pandas as pd #读取excel import os import csv from tqdm import tqdm #进度条 import torch import torch.nn as nn from torch.utils.data import DataLoader,Dataset,random_split from torch.utils.tensorboard import SummaryWriter二、相关函数与参数准备
1.设置随机种子
# Fix every random seed so that experiment results can be reproduced
# (this helper can be reused as a template).
def same_seed(seed):
    """Seed NumPy and PyTorch (CPU and, when available, CUDA) with *seed*.

    cuDNN is also switched to deterministic mode with benchmarking disabled.
    """
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    np.random.seed(seed)
    torch.manual_seed(seed)

    # Nothing more to seed when no CUDA device is present.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


# 2. Prepare the data
# Split the data set into a training part and a validation part.
def train_valid_split(data_set, valid_radio, seed):
    """Randomly split *data_set* into training and validation arrays.

    Args:
        data_set: array-like of samples, one sample per row.
        valid_radio: fraction of the rows that goes to the validation set
            (the row count is truncated with ``int``).
        seed: seed for the generator that drives the random split.

    Returns:
        A ``(train_data, valid_data)`` tuple of NumPy arrays.
    """
    valid_data_size = int(len(data_set) * valid_radio)
    train_data_size = len(data_set) - valid_data_size
    train_subset, valid_subset = random_split(
        data_set,
        [train_data_size, valid_data_size],
        generator=torch.Generator().manual_seed(seed),
    )

    # Gather the rows with one vectorised indexing operation instead of letting
    # NumPy walk the Subset element by element.  This also keeps the column
    # dimension when a split is empty, so callers can still slice `[:, -1]`.
    rows = np.asarray(data_set)
    train_idx = np.asarray(train_subset.indices, dtype=np.intp)
    valid_idx = np.asarray(valid_subset.indices, dtype=np.intp)
    return rows[train_idx], rows[valid_idx]


# 3. Select the features
# Feature selection
def select_feat(train_data, valid_data, test_data, select_all=True):
    """Separate labels from features and keep only the chosen feature columns.

    The last column of *train_data* and *valid_data* is used as the label;
    *test_data* is used as features only.

    Args:
        train_data: 2-D array whose last column is the label.
        valid_data: 2-D array whose last column is the label.
        test_data: 2-D array of features.
        select_all: when true, keep every feature column; otherwise keep the
            columns 0 through 4.

    Returns:
        ``(x_train, x_valid, x_test, y_train, y_valid)``.
    """
    # Labels: the final column.
    labels_train = train_data[:, -1]
    labels_valid = valid_data[:, -1]

    # Features: everything except the final column (test data has no label).
    features_train = train_data[:, :-1]
    features_valid = valid_data[:, :-1]
    features_test = test_data

    columns = list(range(features_train.shape[1])) if select_all else [0, 1, 2, 3, 4]

    return (
        features_train[:, columns],
        features_valid[:, columns],
        features_test[:, columns],
        labels_train,
        labels_valid,
    )


# 4. Build the dataset
# Dataset
class COVID19Dataset(Dataset):
    """Dataset wrapping feature rows and, optionally, their targets.

    When *targets* is ``None`` indexing yields only the feature tensor;
    otherwise it yields a ``(features, target)`` pair.
    """

    def __init__(self, features, targets=None):
        # Inputs are converted to float tensors; a missing target stays None.
        self.targets = None if targets is None else torch.FloatTensor(targets)
        self.features = torch.FloatTensor(features)

    def __getitem__(self, idx):
        sample = self.features[idx]
        if self.targets is None:
            return sample
        return sample, self.targets[idx]

    def __len__(self):
        return len(self.features)


# 5. Build the network
# Neural network
class My_Model(nn.Module):
    """Fully connected regressor: input_dim -> 16 -> 8 -> 1 with ReLU between layers."""

    def __init__(self, input_dim):
        super().__init__()
        stack = [
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 1),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        # Drop dimension 1 of the network output (size 1 after the last layer).
        return self.layers(x).squeeze(1)


# 6. Hyper-parameter settings
# Settings
# Run on the GPU when one is available, otherwise on the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

config = {
    'seed': 1122408,                    # seed for the RNGs and the data split
    'select_all': True,                 # use every feature column
    'valid_radio': 0.2,                 # fraction of training rows held out for validation
    'n_epochs': 3000,                   # upper bound on training epochs
    'batch_size': 256,
    'learning_rate': 1e-5,
    'early_stop': 400,                  # stop after this many epochs without improvement
    'save_path': './models/model.ckpt'  # where the best model is saved
}


# 7. Training loop
# Training loop
def trainer(train_loader, valid_loader, model, config, device):
    """Train *model* with SGD on an MSE loss, validating after every epoch.

    The model's state dict is saved to ``config['save_path']`` every time the
    mean validation loss improves.  Training stops early once the validation
    loss has failed to improve for ``config['early_stop']`` consecutive epochs.

    Args:
        train_loader: iterable of ``(x, y)`` training batches.
        valid_loader: iterable of ``(x, y)`` validation batches.
        model: the network to optimise (already on *device*).
        config: dict providing ``learning_rate``, ``n_epochs``,
            ``early_stop`` and ``save_path``.
        device: device the batches are moved to.
    """
    criterion = nn.MSELoss(reduction='mean')
    optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=0.9)

    # Create the directory the checkpoint is written to, derived from the
    # configured path rather than a hard-coded folder.
    save_dir = os.path.dirname(config['save_path'])
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    n_epochs = config['n_epochs']
    best_loss = math.inf
    step = 0
    early_stop_count = 0

    writer = SummaryWriter()  # TensorBoard logging (optional)
    try:
        for epoch in range(n_epochs):
            # ---- training pass ----
            model.train()
            loss_record = []
            train_pbar = tqdm(train_loader, position=0, leave=True)  # progress bar

            for x, y in train_pbar:
                optimizer.zero_grad()
                x, y = x.to(device), y.to(device)
                pred = model(x)
                loss = criterion(pred, y)
                loss.backward()
                optimizer.step()
                step += 1
                batch_loss = loss.detach().item()
                loss_record.append(batch_loss)

                # Show progress for the current epoch.
                train_pbar.set_description(f'Epoch[{epoch+1}/{n_epochs}]')
                train_pbar.set_postfix({'loss': batch_loss})

            mean_train_loss = sum(loss_record) / len(loss_record)
            writer.add_scalar('Loss/train', mean_train_loss, step)

            # ---- validation pass ----
            model.eval()
            loss_record = []
            with torch.no_grad():
                for x, y in valid_loader:
                    x, y = x.to(device), y.to(device)
                    pred = model(x)
                    loss = criterion(pred, y)
                    loss_record.append(loss.item())

            mean_valid_loss = sum(loss_record) / len(loss_record)
            print(f'Epoch[{epoch+1}/{n_epochs}]: Train loss:{mean_train_loss:.4f},Valid loss:{mean_valid_loss:.4f}')
            writer.add_scalar('Loss/valid', mean_valid_loss, step)

            if mean_valid_loss < best_loss:
                best_loss = mean_valid_loss
                torch.save(model.state_dict(), config['save_path'])
                print('Saving model with loss {:.3}.'.format(best_loss))
                early_stop_count = 0
            else:
                early_stop_count += 1

            if early_stop_count >= config['early_stop']:
                print('\n Model is not improvinng, so we halt train session.')
                return
    finally:
        # Flush and release the TensorBoard writer on every exit path,
        # including the early-stop return and exceptions.
        writer.close()


# Part 3: Preparation before training
# Preparation

# 1. Fix the random seeds
same_seed(config['seed'])

# 2. Load the data with pandas
train_data = pd.read_csv('./covid.train_new.csv').values
test_data = pd.read_csv('./covid.test_un.csv').values

# Split the training file into training and validation sets
train_data, valid_data = train_valid_split(train_data, config['valid_radio'], config['seed'])
print(f'train_data size:{train_data.shape},valid_data size:{valid_data.shape},test_data size:{test_data.shape}')

# 3. Select the features
x_train, x_valid, x_test, y_train, y_valid = select_feat(train_data, valid_data, test_data, config['select_all'])
print(f'the number of feature: {x_train.shape[1]}')

# 4. Build the datasets
train_dataset = COVID19Dataset(x_train, y_train)
valid_dataset = COVID19Dataset(x_valid, y_valid)
test_dataset = COVID19Dataset(x_test)

# 5. Wrap the datasets in data loaders.
# Only the training loader is shuffled.  The validation loss is averaged over
# per-batch means, so shuffling validation data would change which samples
# fall into the (smaller) last batch and make the metric vary between epochs
# for reasons unrelated to the model.
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)


# Part 4: Start training
# Start training
model = My_Model(input_dim=x_train.shape[1]).to(device)
trainer(train_loader, valid_loader, model, config, device)


# Predict on the test set
def predict(test_loader, model, device):
    """Run *model* over every batch of *test_loader* and return the predictions.

    Args:
        test_loader: iterable of feature batches (no targets).
        model: trained network, already on *device*.
        device: device the batches are moved to.

    Returns:
        A NumPy array with the predictions of all batches concatenated
        along dimension 0.
    """
    model.eval()
    preds = []
    for x in tqdm(test_loader):
        x = x.to(device)
        with torch.no_grad():
            pred = model(x)
            preds.append(pred.detach().cpu())
    preds = torch.cat(preds, dim=0).numpy()  # convert to a NumPy array
    return preds


def save_pred(preds, file):
    """Write *preds* to the CSV file *file* with an ``id,tested_positive`` header.

    Each row holds the zero-based position of the prediction and its value.
    """
    # newline='' lets the csv module control line endings; without it every
    # row is followed by a blank line on Windows.
    with open(file, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        for i, p in enumerate(preds):
            writer.writerow([i, p])


# Reload the best checkpoint, predict and save the result
model = My_Model(input_dim=x_train.shape[1]).to(device)
# map_location makes the checkpoint load onto the device in use, whatever
# device it was saved from.
model.load_state_dict(torch.load(config['save_path'], map_location=device))
preds = predict(test_loader, model, device)
save_pred(preds, 'pred.csv')