1. Regularization: loss = loss + w*w. The penalty keeps the model curve smoother, so the model is less prone to overfitting.
def mseLoss_with_reg(pred, target, model):
    ''' Calculate the loss: MSE plus an L2 regularization term '''
    loss = nn.MSELoss(reduction='mean')  # mean squared error, averaged over the batch
    regularization_loss = 0  # regularization term
    for param in model.parameters():
        # TODO: you may implement L1/L2 regularization here
        # L1 would be: regularization_loss += torch.sum(abs(param))
        regularization_loss += torch.sum(param ** 2)  # L2: sum of all squared parameters
    return loss(pred, target) + 0.00075 * regularization_loss  # total loss
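For plain SGD, this hand-written L2 term has the same effect as PyTorch's built-in weight_decay argument, up to the factor of 2 from d(w^2)/dw = 2w. A minimal sketch, not part of the original script; the coefficient 0.0015 = 2 x 0.00075 is chosen to match the term above:

optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.85,
                      weight_decay=0.0015)  # SGD adds weight_decay * param to every gradient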
2. Correlation scores: the raw data may contain irrelevant columns that disturb the prediction. Use a correlation score to find the columns most strongly related to the known labels, and use only those columns for training, validation, and prediction.
def get_feature_importance(feature_data, label_data, k=4, column=None):
    """
    feature_data and label_data may come in as strings (e.g. read from a CSV).
    k is the number of features to select.
    Pass the column names via `column` if you want them printed.
    The code that builds feature_data and label_data is omitted here;
    for a CSV file the features and labels can be obtained with read_csv().
    The goal of this function is to find the k most useful features among
    all the features and print the names of those columns.
    """
    model = SelectKBest(chi2, k=k)  # selector that keeps the k best features
    feature_data = np.array(feature_data, dtype=np.float64)
    X_new = model.fit_transform(feature_data, label_data)  # select the k best features
    # feature_data is the feature matrix, label_data the labels
    print('x_new', X_new)
    scores = model.scores_  # score of each column against the label
    # sort by importance and pick the k most important columns
    indices = np.argsort(scores)[::-1]  # [::-1] reverses the order, so the highest score comes first
    # argsort returns the indices that would sort the array;
    # after the reversal, indices[0] is the index of the largest score.
    if column:  # print the names of the selected columns if column names were given
        k_best_features = [column[i] for i in indices[0:k].tolist()]  # pick those columns
        print('k best features are: ', k_best_features)
    return X_new, indices[0:k]  # selected features and their column indices
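A minimal usage sketch of this function on made-up toy data (note that sklearn's chi2 requires non-negative feature values):

import numpy as np
X = np.abs(np.random.randn(100, 6))  # 6 non-negative feature columns (toy data)
y = np.random.randint(0, 2, size=100)  # toy binary labels
cols = ["f%d" % i for i in range(6)]  # made-up column names
X_new, top_idx = get_feature_importance(X, y, k=4, column=cols)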
3. Principal component analysis (PCA): reduces the dimensionality across all columns at once, whereas the correlation approach above keeps a few whole columns.
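A minimal PCA sketch with scikit-learn, for contrast; n_components=4 is an illustrative choice, and feature_data stands for the numeric feature matrix from above:

from sklearn.decomposition import PCA
pca = PCA(n_components=4)  # project the data onto 4 principal components
X_reduced = pca.fit_transform(feature_data)  # each new column mixes all original columns
print(pca.explained_variance_ratio_)  # fraction of the variance each component keeps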
Full code:
import time
import torch
import numpy as np
import csv
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.feature_selection import SelectKBest, chi2
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from torch import optim
def get_feature_importance(feature_data, label_data, k=4, column=None):
    """
    feature_data and label_data may come in as strings (e.g. read from a CSV).
    k is the number of features to select.
    Pass the column names via `column` if you want them printed.
    The code that builds feature_data and label_data is omitted here;
    for a CSV file the features and labels can be obtained with read_csv().
    The goal of this function is to find the k most useful features among
    all the features and print the names of those columns.
    """
    model = SelectKBest(chi2, k=k)  # selector that keeps the k best features
    feature_data = np.array(feature_data, dtype=np.float64)
    X_new = model.fit_transform(feature_data, label_data)  # select the k best features
    # feature_data is the feature matrix, label_data the labels
    print('x_new', X_new)
    scores = model.scores_  # score of each column against the label
    # sort by importance and pick the k most important columns
    indices = np.argsort(scores)[::-1]  # [::-1] reverses the order, so the highest score comes first
    # argsort returns the indices that would sort the array;
    # after the reversal, indices[0] is the index of the largest score.
    if column:  # print the names of the selected columns if column names were given
        k_best_features = [column[i] for i in indices[0:k].tolist()]  # pick those columns
        print('k best features are: ', k_best_features)
    return X_new, indices[0:k]  # selected features and their column indices
# Data section
class CovidDataset(Dataset):
    def __init__(self, file_path, mode="train", all_feature=True, feature_dim=4):  # mode selects the train/val/test split, default "train"
        with open(file_path, "r") as f:  # open the file at file_path read-only and bind it to f
            ori_data = list(csv.reader(f))
        column = ori_data[0]  # header row
        csv_data = np.array(ori_data[1:])[:, 1:].astype(float)  # drop the header row and the id column, convert to float
        feature = np.array(ori_data[1:])[:, 1:-1]
        label_data = np.array(ori_data[1:])[:, -1]
        if all_feature:
            col = np.array([i for i in range(0, 93)])  # keep all 93 feature columns
        else:
            _, col = get_feature_importance(feature, label_data, k=feature_dim)  # keep the feature_dim most relevant columns
            col = col.tolist()  # convert the selected column indices from an array to a list
        if mode == "train":  # training set
            indices = [i for i in range(len(csv_data)) if i % 5 != 0]  # keep 4 rows out of every 5
            data = torch.tensor(csv_data[indices, :-1])  # x: every column except the last
            self.y = torch.tensor(csv_data[indices, -1])  # y: the last column
        elif mode == "val":  # validation set
            indices = [i for i in range(len(csv_data)) if i % 5 == 0]  # every 5th row
            data = torch.tensor(csv_data[indices, :-1])  # x
            self.y = torch.tensor(csv_data[indices, -1])  # y
        else:  # test set: all rows (no split), and the test file has no y column
            indices = [i for i in range(len(csv_data))]
            data = torch.tensor(csv_data[indices])  # x only
        data = data[:, col]  # keep only the selected feature columns
        self.data = (data - data.mean(dim=0, keepdim=True)) / data.std(dim=0, keepdim=True)  # column-wise standardization (note: each split uses its own statistics here)
        self.mode = mode  # stored on self so __getitem__ can see it

    def __getitem__(self, idx):  # fetch one sample
        if self.mode != "test":
            return self.data[idx].float(), self.y[idx].float()
        else:
            return self.data[idx].float()

    def __len__(self):  # number of samples
        return len(self.data)
# Model
class MyModel(nn.Module):
    def __init__(self, inDim):
        super(MyModel, self).__init__()
        self.fc1 = nn.Linear(inDim, 64)
        self.relu1 = nn.ReLU()  # activation function
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):  # forward pass of the model
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        if len(x.size()) > 1:  # x.size() is the shape; len gives the number of dimensions
            return x.squeeze(1)  # (batch, 1) -> (batch,), matching the target shape
        return x
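A quick sanity check of the forward shape on a toy input (not part of the original script):

m = MyModel(inDim=4)
out = m(torch.randn(8, 4))  # fc2 produces shape (8, 1); squeeze(1) turns it into (8,)
print(out.shape)  # torch.Size([8])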
def train_val(model, train_loader, val_loader, device, epochs, optimizer, loss, save_path):
    model = model.to(device)
    plt_train_loss = []  # training loss of every epoch
    plt_val_loss = []  # validation loss of every epoch
    min_val_loss = 99999
    for epoch in range(epochs):
        train_loss = 0.0  # accumulated loss for this epoch
        val_loss = 0.0
        start_time = time.time()  # epoch start time
        model.train()  # switch the model to training mode
        for batch_x, batch_y in train_loader:
            x, target = batch_x.to(device), batch_y.to(device)
            pred = model(x)
            train_bat_loss = loss(pred, target, model)  # loss of this batch
            train_bat_loss.backward()  # backpropagate the gradients
            optimizer.step()  # update the model
            optimizer.zero_grad()  # reset the gradients to zero
            train_loss += train_bat_loss.cpu().item()
        plt_train_loss.append(train_loss / len(train_loader))
        model.eval()  # switch to evaluation mode for the validation set
        with torch.no_grad():  # no gradients on the validation set
            for batch_x, batch_y in val_loader:
                x, target = batch_x.to(device), batch_y.to(device)
                pred = model(x)
                val_bat_loss = loss(pred, target, model)  # loss of this batch
                val_loss += val_bat_loss.cpu().item()
        plt_val_loss.append(val_loss / len(val_loader))
        if val_loss < min_val_loss:
            torch.save(model, save_path)  # save the best model so far
            min_val_loss = val_loss
        print("[%03d/%03d] %2.2f sec(s) TrainLoss: %.6f | valLoss: %.6f" % \
              (epoch, epochs, time.time() - start_time, plt_train_loss[-1], plt_val_loss[-1]))  # \ continues the statement on the next line
    plt.plot(plt_train_loss)
    plt.plot(plt_val_loss)
    plt.title("loss")
    plt.legend(["train", "val"])
    plt.show()
def evaluate(save_path, test_loader, device, rel_path):  # write the test-set predictions to a file
    model = torch.load(save_path).to(device)  # load the saved model
    model.eval()  # evaluation mode
    rel = []  # list that collects the predictions
    with torch.no_grad():  # no gradients at test time
        for x in test_loader:  # fetch the data
            pred = model(x.to(device))  # x.to(device) keeps the data and the model on the same device
            rel.append(pred.cpu().item())  # store the prediction (works because batch_size is 1)
    print(rel)
    with open(rel_path, "w", newline='') as f:  # write the results file
        csvWriter = csv.writer(f)
        csvWriter.writerow(["id", "tested_positive"])  # header row
        for i, value in enumerate(rel):  # index and predicted value
            csvWriter.writerow([str(i), str(value)])  # remaining rows
    print("Results saved to " + rel_path)
all_feature = False  # whether to use all 93 features or only the selected ones
if all_feature:
    feature_dim = 93
else:
    feature_dim = 4

train_file = "covid.train.csv"  # training data
test_file = "covid.test.csv"  # test data
train_dataset = CovidDataset(train_file, "train", all_feature=all_feature, feature_dim=feature_dim)  # training-set instance
val_dataset = CovidDataset(train_file, "val", all_feature=all_feature, feature_dim=feature_dim)  # validation-set instance
test_dataset = CovidDataset(test_file, "test", all_feature=all_feature, feature_dim=feature_dim)  # test-set instance

batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
# Hyperparameters
device = "cuda" if torch.cuda.is_available() else "cpu"  # use the GPU if CUDA is available, otherwise the CPU
print(device)
config = {  # keeping the hyperparameters in a dict makes them easy to look up
    "lr": 0.0001,
    "epoch": 20,  # number of epochs
    "momentum": 0.85,  # momentum
    "save_path": "model_save/best_model.pth",  # where the best model is saved
    "rel_path": "pred.csv"  # where the predictions are written
}
def mseLoss_with_reg(pred, target, model):
    ''' Calculate the loss: MSE plus an L2 regularization term '''
    loss = nn.MSELoss(reduction='mean')  # mean squared error, averaged over the batch
    regularization_loss = 0  # regularization term
    for param in model.parameters():
        # TODO: you may implement L1/L2 regularization here
        # L1 would be: regularization_loss += torch.sum(abs(param))
        regularization_loss += torch.sum(param ** 2)  # L2: sum of all squared parameters
    return loss(pred, target) + 0.00075 * regularization_loss  # total loss
model = MyModel(inDim=feature_dim).to(device)  # put the model on the device
# loss = nn.MSELoss()  # the plain official loss: the mean squared error
loss = mseLoss_with_reg
optimizer = optim.SGD(model.parameters(), lr=config["lr"], momentum=config["momentum"])  # SGD is an official optimizer, used directly
train_val(model, train_loader, val_loader, device, config["epoch"], optimizer, loss, config["save_path"])  # training
evaluate(config["save_path"], test_loader, device, config["rel_path"])  # prediction on the test set