
【回归任务】新冠感染人数预测
新冠感染人数预测模型,基础回归任务
·
数据来源:ML2021Spring-hw1 | Kaggle
1.数据预处理类
# 数据集中“州”以"独热编码"形式呈现
import matplotlib.pyplot as plt
import torch
import numpy as np
import csv
import pandas as pd
from torch.utils.data import DataLoader, Dataset
# Dataset类,需要提供file; 提供的函数:init吃file path, getitem取数据,根据下标给X,Y, len求长度
# DataLoader函数,分批次取数据集
import torch.nn as nn
# nn.Module类下定义了init和forward函数
from torch import optim # 优化器
import time
# 1.数据预处理
class CovidDataset(Dataset):
    """COVID-19 regression dataset.

    Reads the raw CSV (header row and id column are dropped), splits the
    rows into train/val/test by index, and z-score-normalizes the
    features column-wise.
    """

    def __init__(self, file_path, mode="train"):
        # Load the CSV as strings, then drop the header row and the id
        # column and convert the rest to float.
        with open(file_path, "r") as f:
            rows = list(csv.reader(f))
        matrix = np.array(rows[1:])[:, 1:].astype(float)

        # Row selection: every 5th row goes to validation, the other
        # rows to training; the test file is used in full (no labels).
        if mode in ("train", "val"):
            keep = [i for i in range(len(matrix))
                    if (i % 5 == 0) == (mode == "val")]
            features = torch.tensor(matrix[keep, :-1])
            self.y = torch.tensor(matrix[keep, -1])
        else:
            features = torch.tensor(matrix)

        self.mode = mode
        # Column-wise standardization; keepdim preserves the 2-D shape
        # for broadcasting. NOTE(review): each split is normalized with
        # its own mean/std instead of the training statistics — confirm
        # this is intended (it gives val/test a different scale).
        mu = features.mean(dim=0, keepdim=True)
        sigma = features.std(dim=0, keepdim=True)
        self.data = (features - mu) / sigma

    def __getitem__(self, idx):
        # Labels exist for train/val only; cast to float32 for the model.
        if self.mode == "test":
            return self.data[idx].float()
        return self.data[idx].float(), self.y[idx].float()

    def __len__(self):
        return len(self.data)
上述类实现读取原数据文件,转换数据形式,对数据进行归一化处理,且将两个文件的数据分为“训练集”、“验证集”、“测试集”三部分。
train_file = "covid.train.csv"
test_file = "covid.test.csv"
# file = pd.read_csv(train_file)
# print(file.head())
# Build the three dataset splits (train and val come from the same file).
train_dataset = CovidDataset(train_file, "train")  # yields (X, y) pairs
val_dataset = CovidDataset(train_file, "val")
test_dataset = CovidDataset(test_file, "test")
# for data in train_dataset:
#     print(data)  # each item: 93 feature columns plus the label y
# Batch the data so each epoch trains over mini-batches.
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size, shuffle=True)  # shuffle training batches each epoch
val_loader = DataLoader(val_dataset, batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)  # keep test order for the submission ids
2.训练模型
class MyModel(nn.Module):
    """Two-layer MLP regressor: inDim -> 64 -> ReLU -> 1."""

    def __init__(self, inDim):
        super().__init__()
        self.fc1 = nn.Linear(inDim, 64)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        # Hidden layer with ReLU, then the scalar output head.
        hidden = self.relu1(self.fc1(x))
        out = self.fc2(hidden)
        # Batched input yields shape (batch, 1); drop the trailing
        # singleton so predictions align with the 1-D label tensor.
        return out.squeeze(1) if out.dim() > 1 else out
本次回归任务的训练模型为两个全连接层和一个Relu激活函数相结合,可自行调整修改优化。
3.训练函数
def train_val(train_loader, val_loader, device, model, loss, optimizer, epochs, save_path):
    """Train for `epochs` epochs, validating after each one.

    The whole model is saved to `save_path` whenever the validation loss
    improves; per-epoch mean train/val losses are plotted at the end.

    Args:
        train_loader / val_loader: DataLoaders yielding (x, y) batches.
        device: "cuda" or "cpu".
        model: network to optimize.
        loss: loss function called as loss(pred, target).
        optimizer: optimizer over model.parameters().
        epochs: number of training epochs.
        save_path: checkpoint path for the best model (directory must exist).
    """
    model = model.to(device)
    plt_train_loss = []  # mean training loss per epoch
    plt_val_loss = []    # mean validation loss per epoch
    min_val_loss = float("inf")  # best val loss so far (was a magic number)

    for epoch in range(epochs):
        train_loss = 0.0
        val_loss = 0.0
        start_time = time.time()

        model.train()  # training mode
        for batch_x, batch_y in train_loader:
            x, target = batch_x.to(device), batch_y.to(device)
            pred = model(x)
            train_bat_loss = loss(pred, target)
            # .item() extracts the scalar on the CPU; accumulate batch
            # losses so the epoch mean can be computed below.
            train_loss += train_bat_loss.cpu().item()
            train_bat_loss.backward()   # backpropagate
            optimizer.step()            # update parameters
            optimizer.zero_grad()       # clear grads for the next batch
        plt_train_loss.append(train_loss / len(train_loader))

        # Validation with the parameters from this epoch: eval mode and
        # no gradient tracking.
        model.eval()
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                x, target = batch_x.to(device), batch_y.to(device)
                pred = model(x)
                val_bat_loss = loss(pred, target)
                val_loss += val_bat_loss.cpu().item()
        plt_val_loss.append(val_loss / len(val_loader))

        # Keep the checkpoint with the lowest total validation loss.
        if val_loss < min_val_loss:
            torch.save(model, save_path)
            min_val_loss = val_loss

        # Per-epoch progress report.
        print("[%03d/%03d] %2.2fsec(s) Train_loss: %.6f | Val_loss: %.6f" %
              (epoch, epochs, time.time() - start_time,
               plt_train_loss[-1], plt_val_loss[-1]))

    # All epochs done: visualize both loss curves.
    # (Fixed the "Loos图" typo; a CJK title also fails to render with
    # matplotlib's default fonts.)
    plt.plot(plt_train_loss)
    plt.plot(plt_val_loss)
    plt.title("Loss")
    plt.legend(["train", "val"])
    plt.show()
将真正的训练过程封装为上述函数。
训练模式中使用“2.训练模型”获取预估值,根据loss和梯度回传不断优化模型内参数,且保存训练过程的loss值。
验证阶段无需梯度回传,将模型切换为eval模式可固定Dropout、BatchNorm等层的行为,保证验证结果的一致性,并记录验证过程的模型loss值。
最后将整个训练过程和验证过程的loss值进行可视化展现。
训练过程使用MSE为损失函数,通过SGD优化参数,训练过程超参数设置如下:
# Hyperparameter settings:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
config = {
    "lr": 0.001,
    "epochs": 20,
    "momentum": 0.9,  # SGD momentum
    "save_path": "model_save/best_model.pth",  # best-checkpoint location (directory must already exist)
    "rel_path": "model_save/pred.csv"  # prediction CSV output path
}
model = MyModel(inDim=93).to(device)  # 93 input features per sample
loss = nn.MSELoss()  # (pred_y - y)^2
optimizer = optim.SGD(model.parameters(), lr=config["lr"], momentum=config["momentum"])  # SGD optimizer with momentum
# Start training
train_val(train_loader, val_loader, device, model, loss, optimizer, config["epochs"], config["save_path"])
损失函数等可进一步优化,下列给出通过L2正则化优化损失函数,避免模型过拟合等问题。
def mseLoss_with_reg(pred, target, model):
    """MSE loss plus an L2 penalty over all model parameters.

    Returns mean squared error between pred and target, plus
    0.00075 * sum of squared parameter values (weights and biases),
    to discourage overfitting.
    """
    criterion = nn.MSELoss(reduction='mean')
    # Squared L2 norm accumulated over every parameter tensor.
    # (An L1 variant would sum absolute values instead.)
    l2_penalty = sum(torch.sum(p ** 2) for p in model.parameters())
    return criterion(pred, target) + 0.00075 * l2_penalty
4.评估函数
def evaluate(save_path, test_loader, device, rel_path):
    """Load the best saved model, predict on the test set, and write a
    Kaggle-style submission CSV with columns (id, tested_positive).

    Args:
        save_path: checkpoint path written by train_val (a fully pickled model).
        test_loader: DataLoader yielding unlabeled feature tensors
            (batch_size=1, so .item() extracts each scalar prediction).
        device: "cuda" or "cpu".
        rel_path: output CSV path.
    """
    # The checkpoint is a fully pickled module, so weights_only=False is
    # required on torch>=2.6; map_location lets a GPU-trained checkpoint
    # load on a CPU-only machine. The original bare torch.load failed in
    # both situations.
    model = torch.load(save_path, map_location=device, weights_only=False).to(device)
    model.eval()  # inference mode: fixes dropout/batchnorm behavior
    rel = []
    with torch.no_grad():
        for x in test_loader:
            pred = model(x.to(device))
            rel.append(pred.cpu().item())
    print(rel)
    with open(rel_path, "w", newline='') as f:
        csvWriter = csv.writer(f)
        csvWriter.writerow(["id", "tested_positive"])
        for i, value in enumerate(rel):
            csvWriter.writerow([str(i), str(value)])
    # Bug fix: the original `print("...{}", format(rel_path))` printed the
    # literal braces and the path as a separate argument.
    print("文件已经保存到{}".format(rel_path))
利用训练过程中按最优验证loss保存的模型,对测试文件内的数据进行结果预估,注意结果写入文件时的格式要求。
# Test: produce predictions with the best saved checkpoint
evaluate(config["save_path"], test_loader, device, config["rel_path"])
5.部分结果展示
6.完整代码
# 数据集中“州”以"独热编码"形式呈现
import matplotlib.pyplot as plt
import torch
import numpy as np
import csv
import pandas as pd
from torch.utils.data import DataLoader, Dataset
# Dataset类,需要提供file; 提供的函数:init吃file path, getitem取数据,根据下标给X,Y, len求长度
# DataLoader函数,分批次取数据集
import torch.nn as nn
# nn.Module类下定义了init和forward函数
from torch import optim # 优化器
import time
# 1.数据预处理
class CovidDataset(Dataset):
    """COVID-19 regression dataset.

    Reads the raw CSV (header row and id column are dropped), splits the
    rows into train/val/test by index, and z-score-normalizes the
    features column-wise.
    """

    def __init__(self, file_path, mode="train"):
        # Load the CSV as strings, then drop the header row and the id
        # column and convert the rest to float.
        with open(file_path, "r") as f:
            rows = list(csv.reader(f))
        matrix = np.array(rows[1:])[:, 1:].astype(float)

        # Row selection: every 5th row goes to validation, the other
        # rows to training; the test file is used in full (no labels).
        if mode in ("train", "val"):
            keep = [i for i in range(len(matrix))
                    if (i % 5 == 0) == (mode == "val")]
            features = torch.tensor(matrix[keep, :-1])
            self.y = torch.tensor(matrix[keep, -1])
        else:
            features = torch.tensor(matrix)

        self.mode = mode
        # Column-wise standardization; keepdim preserves the 2-D shape
        # for broadcasting. NOTE(review): each split is normalized with
        # its own mean/std instead of the training statistics — confirm
        # this is intended (it gives val/test a different scale).
        mu = features.mean(dim=0, keepdim=True)
        sigma = features.std(dim=0, keepdim=True)
        self.data = (features - mu) / sigma

    def __getitem__(self, idx):
        # Labels exist for train/val only; cast to float32 for the model.
        if self.mode == "test":
            return self.data[idx].float()
        return self.data[idx].float(), self.y[idx].float()

    def __len__(self):
        return len(self.data)
# 2.训练模型
class MyModel(nn.Module):
    """Two-layer MLP regressor: inDim -> 64 -> ReLU -> 1."""

    def __init__(self, inDim):
        super().__init__()
        self.fc1 = nn.Linear(inDim, 64)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        # Hidden layer with ReLU, then the scalar output head.
        hidden = self.relu1(self.fc1(x))
        out = self.fc2(hidden)
        # Batched input yields shape (batch, 1); drop the trailing
        # singleton so predictions align with the 1-D label tensor.
        return out.squeeze(1) if out.dim() > 1 else out
# 3.训练函数
def train_val(train_loader, val_loader, device, model, loss, optimizer, epochs, save_path):
    """Train for `epochs` epochs, validating after each one.

    The whole model is saved to `save_path` whenever the validation loss
    improves; per-epoch mean train/val losses are plotted at the end.

    Args:
        train_loader / val_loader: DataLoaders yielding (x, y) batches.
        device: "cuda" or "cpu".
        model: network to optimize.
        loss: loss function called as loss(pred, target).
        optimizer: optimizer over model.parameters().
        epochs: number of training epochs.
        save_path: checkpoint path for the best model (directory must exist).
    """
    model = model.to(device)
    plt_train_loss = []  # mean training loss per epoch
    plt_val_loss = []    # mean validation loss per epoch
    min_val_loss = float("inf")  # best val loss so far (was a magic number)

    for epoch in range(epochs):
        train_loss = 0.0
        val_loss = 0.0
        start_time = time.time()

        model.train()  # training mode
        for batch_x, batch_y in train_loader:
            x, target = batch_x.to(device), batch_y.to(device)
            pred = model(x)
            train_bat_loss = loss(pred, target)
            # .item() extracts the scalar on the CPU; accumulate batch
            # losses so the epoch mean can be computed below.
            train_loss += train_bat_loss.cpu().item()
            train_bat_loss.backward()   # backpropagate
            optimizer.step()            # update parameters
            optimizer.zero_grad()       # clear grads for the next batch
        plt_train_loss.append(train_loss / len(train_loader))

        # Validation with the parameters from this epoch: eval mode and
        # no gradient tracking.
        model.eval()
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                x, target = batch_x.to(device), batch_y.to(device)
                pred = model(x)
                val_bat_loss = loss(pred, target)
                val_loss += val_bat_loss.cpu().item()
        plt_val_loss.append(val_loss / len(val_loader))

        # Keep the checkpoint with the lowest total validation loss.
        if val_loss < min_val_loss:
            torch.save(model, save_path)
            min_val_loss = val_loss

        # Per-epoch progress report.
        print("[%03d/%03d] %2.2fsec(s) Train_loss: %.6f | Val_loss: %.6f" %
              (epoch, epochs, time.time() - start_time,
               plt_train_loss[-1], plt_val_loss[-1]))

    # All epochs done: visualize both loss curves.
    # (Fixed the "Loos图" typo; a CJK title also fails to render with
    # matplotlib's default fonts.)
    plt.plot(plt_train_loss)
    plt.plot(plt_val_loss)
    plt.title("Loss")
    plt.legend(["train", "val"])
    plt.show()
# 4.评估函数
def evaluate(save_path, test_loader, device, rel_path):
    """Load the best saved model, predict on the test set, and write a
    Kaggle-style submission CSV with columns (id, tested_positive).

    Args:
        save_path: checkpoint path written by train_val (a fully pickled model).
        test_loader: DataLoader yielding unlabeled feature tensors
            (batch_size=1, so .item() extracts each scalar prediction).
        device: "cuda" or "cpu".
        rel_path: output CSV path.
    """
    # The checkpoint is a fully pickled module, so weights_only=False is
    # required on torch>=2.6; map_location lets a GPU-trained checkpoint
    # load on a CPU-only machine. The original bare torch.load failed in
    # both situations.
    model = torch.load(save_path, map_location=device, weights_only=False).to(device)
    model.eval()  # inference mode: fixes dropout/batchnorm behavior
    rel = []
    with torch.no_grad():
        for x in test_loader:
            pred = model(x.to(device))
            rel.append(pred.cpu().item())
    print(rel)
    with open(rel_path, "w", newline='') as f:
        csvWriter = csv.writer(f)
        csvWriter.writerow(["id", "tested_positive"])
        for i, value in enumerate(rel):
            csvWriter.writerow([str(i), str(value)])
    # Bug fix: the original `print("...{}", format(rel_path))` printed the
    # literal braces and the path as a separate argument.
    print("文件已经保存到{}".format(rel_path))
# def mseLoss_with_reg(pred, target, model):
#     loss = nn.MSELoss(reduction='mean')
#     ''' Calculate loss '''
#     regularization_loss = 0  # regularization term
#     for param in model.parameters():
#         # TODO: you may implement L1/L2 regularization here
#         # L2 regularization:
#         # regularization_loss += torch.sum(abs(param))
#         regularization_loss += torch.sum(param ** 2)  # sum of squared parameters
#     return loss(pred, target) + 0.00075 * regularization_loss
train_file = "covid.train.csv"
test_file = "covid.test.csv"
# file = pd.read_csv(train_file)
# print(file.head())
train_dataset = CovidDataset(train_file, "train")  # yields (X, y) pairs
val_dataset = CovidDataset(train_file, "val")
test_dataset = CovidDataset(test_file, "test")
# for data in train_dataset:
#     print(data)  # each item: 93 feature columns plus the label y
# SGD: each step updates weights/biases on one randomly drawn mini-batch
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size, shuffle=True)  # shuffle training batches each epoch
val_loader = DataLoader(val_dataset, batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)  # keep test order for the submission ids
# for batch_x, batch_y in train_loader:
#     print(batch_x, "\n", batch_y)
# model = MyModel(inDim=93)
# pred_y = model(batch_x)  # calling an nn.Module runs its forward()
# Hyperparameter settings:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
config = {
    "lr": 0.001,
    "epochs": 20,
    "momentum": 0.9,  # SGD momentum
    "save_path": "model_save/best_model.pth",  # best-checkpoint location (directory must already exist)
    "rel_path": "model_save/pred.csv"  # prediction CSV output path
}
model = MyModel(inDim=93).to(device)  # 93 input features per sample
loss = nn.MSELoss()  # (pred_y - y)^2
optimizer = optim.SGD(model.parameters(), lr=config["lr"], momentum=config["momentum"])  # SGD optimizer with momentum
# Start training
train_val(train_loader, val_loader, device, model, loss, optimizer, config["epochs"], config["save_path"])
# Test: produce predictions with the best saved checkpoint
evaluate(config["save_path"], test_loader, device, config["rel_path"])
7.小白学习巩固
7.1 from torch.utils.data import DataLoader, Dataset Dataset类,需要提供file; 提供的函数:init吃file path, getitem取数据,根据下标给X,Y, 和len。 DataLoader函数,分批次取数据集,可直接调用。
7.2 import torch.nn as nn nn.Module类下定义了init和forward函数,如class MyModel(nn.Module)这样进行直接继承使用,编写好训练模型后,给数据直接调用即可,自动进入forward函数进行训练。
7.3 读取文件过程,数据格式转换 with open(file_path, "r") as f: ori_data = list(csv.reader(f)) # 将Exel表转为list列表,此时的数据皆是“字符串”形式 csv_data = np.array(ori_data[1:])[:, 1:].astype(float) # .astype(float)转为浮点型
7.4 输出结果保存,赋予文件的写权限,使用writerow进行每行写入 with open(rel_path, "w", newline='') as f: csvWriter = csv.writer(f) csvWriter.writerow(["id", "tested_positive"]) for i, value in enumerate(rel): csvWriter.writerow([str(i), str(value)])
更多推荐
所有评论(0)