1.添加独热编码(one-hot encoding)
pd.get_dummies(data.salary)
2.Dataset
dataset进行重构
- PyTorch有一个抽象的Dataset类。Dataset可以是任何具有__len__函数和__getitem__方法(用于索引)的对象。
- PyTorch的TensorDataset 是一个包装张量的Dataset。通过定义索引的长度和方式,这也为我们提供了沿张量的第一维进行迭代,索引和切片的方法。这将使我们在训练的同一行中更容易访问自变量和因变量。
from torch.utils.data import TensorDataset
HRdataset = TensorDataset(X, Y)
3.DataLoader
Pytorch DataLoader负责管理批次。
DataLoader从Dataset创建。
DataLoader使遍历批次变得更容易。DataLoader会自动为我们提供每个小批量。
无需使用切片处理 HRdataset[i * batch: i * batch + batch]
from torch.utils.data import DataLoader
HR_ds = TensorDataset(X, Y)
HR_dl = DataLoader(HR_ds, batch_size=batch)
4.添加验证
应把数据分为训练+验证的方式,以识别您是否过度拟合。
训练数据的乱序(shuffle)对于防止批次与过度拟合之间的相关性很重要。另一方面,无论我们是否乱序验证集,验证损失都是相同的。由于shuffle需要额外的开销,因此shuffle验证数据没有任何意义。
我们将为验证集使用批大小,该批大小是训练集的两倍。这是因为验证集不需要反向传播,因此占用的内存更少(不需要存储梯度)。我们利用这一优势来使用更大的批量,并更快地计算损失。
在jupyter notebook中采用 !pip install sklearn -i https://pypi.douban.com/simple/ 来安装该库。
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(X_data, Y_data)
实例
#!/usr/bin/env python
# coding: utf-8
# In[4]:
import torch
# In[5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# In[6]:
# Load the HR attrition dataset (one row per employee; `left` is the binary target).
data = pd.read_csv('./dataset/HR.csv')
# In[7]:
data.head()
# In[8]:
data.info()
# In[9]:
data.part.unique()
# In[10]:
data.salary.unique()
# In[11]:
# One-hot encode the two categorical columns and append the dummy columns.
data = data.join(pd.get_dummies(data.part)).join(pd.get_dummies(data.salary))
# In[12]:
data.head()
# In[13]:
# Drop the original categorical columns now that they are one-hot encoded.
data.drop(columns=['part', 'salary'], inplace=True)
# In[14]:
data.left.value_counts()
# In[56]:
# Baseline accuracy of always predicting the majority class (stayed).
11428/(11428+3571)
# In[15]:
# Target as an (N, 1) column vector, matching the model's output shape.
Y_data = data.left.values.reshape(-1, 1)
# In[16]:
Y = torch.from_numpy(Y_data).type(torch.FloatTensor)
# In[17]:
Y
# In[18]:
Y.shape
# In[19]:
[c for c in data.columns if c != 'left']
# In[20]:
# Features: every column except the target.
X_data = data[[c for c in data.columns if c != 'left']].values
# In[23]:
X = torch.from_numpy(X_data).type(torch.FloatTensor)
# In[24]:
X.shape
# # 创建模型
# In[25]:
from torch import nn
# In[26]:
class Logistic(nn.Module):
    """Feed-forward binary classifier: 20 input features -> 64 -> 64 -> 1.

    Hidden layers use ReLU; the head is squashed through a sigmoid so the
    output is a probability in (0, 1), suitable for BCELoss.
    """

    def __init__(self):
        super().__init__()
        self.lin_1 = nn.Linear(20, 64)
        self.lin_2 = nn.Linear(64, 64)
        self.lin_3 = nn.Linear(64, 1)
        self.activate = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input):
        # Two ReLU hidden layers, then a sigmoid-activated scalar output.
        hidden = self.activate(self.lin_1(input))
        hidden = self.activate(self.lin_2(hidden))
        return self.sigmoid(self.lin_3(hidden))
# ### 我们将定义一个小函数来创建我们的模型和优化器,以便将来可以重用。
# In[27]:
# Learning rate shared by every optimizer created below.
lr = 0.0001
# In[28]:
def get_model():
    """Instantiate a fresh Logistic model together with its Adam optimizer."""
    net = Logistic()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    return net, optimizer
# ## 定义损失函数
# In[29]:
# Binary cross-entropy; expects sigmoid probabilities, not raw logits.
loss_fn = nn.BCELoss()
# In[30]:
model, opt = get_model()
# In[31]:
len(data)
# In[32]:
# Mini-batch size and the number of full batches per epoch
# (the trailing partial batch is dropped by the integer division).
batch = 64
no_of_batches = len(data)//batch
epochs = 100
# In[33]:
# Training loop with manual slicing of the full tensors into mini-batches.
for epoch in range(epochs):
    for i in range(no_of_batches):
        start = i*batch
        end = start + batch
        x = X[start: end]
        y = Y[start: end]
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        opt.zero_grad()
        loss.backward()
        opt.step()
    # Fix: evaluate the epoch loss without building an autograd graph
    # (the original tracked gradients over the whole dataset just to print),
    # and print a plain float via .item() instead of a tensor with grad_fn.
    with torch.no_grad():
        print('epoch:', epoch, ' ', 'loss:', loss_fn(model(X), Y).item())
# # 使用dataset进行重构
# PyTorch有一个抽象的Dataset类。Dataset可以是任何具有__len__函数和__getitem__作为对其进行索引的方法的函数。 本教程将通过示例将自定义HRDataset类创建为的Dataset的子类。
# PyTorch的TensorDataset 是一个包装张量的Dataset。通过定义索引的长度和方式,这也为我们提供了沿张量的第一维进行迭代,索引和切片的方法。这将使我们在训练的同一行中更容易访问自变量和因变量。
# In[34]:
from torch.utils.data import TensorDataset
# In[35]:
# TensorDataset pairs X and Y so that HRdataset[i] returns (X[i], Y[i]);
# slicing it yields aligned slices of both tensors in one operation.
HRdataset = TensorDataset(X, Y)
# In[36]:
model, opt = get_model()
# In[37]:
# Same training as before, but each mini-batch (x, y) is pulled out of the
# TensorDataset with a single slice instead of slicing X and Y separately.
for epoch in range(epochs):
    for i in range(no_of_batches):
        offset = i * batch
        x, y = HRdataset[offset: offset + batch]
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        opt.zero_grad()
        loss.backward()
        opt.step()
    print('epoch:', epoch, ' ', 'loss:', loss_fn(model(X), Y))
# # 使用DataLoader进行重构
# Pytorch DataLoader负责管理批次。
#
# DataLoader从Dataset创建。
#
# DataLoader使遍历批次变得更容易。DataLoader会自动为我们提供每个小批量。
#
# 无需使用 HRdataset[i * batch: i * batch + batch]
# In[38]:
from torch.utils.data import DataLoader
# A DataLoader iterates the dataset in mini-batches of size `batch`,
# so no manual index arithmetic is needed in the training loop.
HR_ds = TensorDataset(X, Y)
HR_dl = DataLoader(HR_ds, batch_size=batch)
# 现在,我们的循环更加简洁了,因为 (x, y) 是从数据加载器自动加载的:
#     for x, y in HR_dl:
#         pred = model(x)
# In[39]:
model, opt = get_model()
# In[40]:
# The DataLoader now drives batching: each iteration yields one (x, y) pair.
for epoch in range(epochs):
    for x, y in HR_dl:
        prediction = model(x)
        batch_loss = loss_fn(prediction, y)
        opt.zero_grad()
        batch_loss.backward()
        opt.step()
    print('epoch:', epoch, ' ', 'loss:', loss_fn(model(X), Y))
# # 添加验证
# 前面我们只是试图建立一个合理的训练循环以用于我们的训练数据。实际上,您始终还应该具有一个验证集,以识别您是否过度拟合。
#
# 训练数据的乱序(shuffle)对于防止批次与过度拟合之间的相关性很重要。另一方面,无论我们是否乱序验证集,验证损失都是相同的。由于shuffle需要额外的开销,因此shuffle验证数据没有任何意义。
#
# 我们将为验证集使用批大小,该批大小是训练集的两倍。这是因为验证集不需要反向传播,因此占用的内存更少(不需要存储梯度)。我们利用这一优势来使用更大的批量,并更快地计算损失。
# !pip install sklearn -i https://pypi.douban.com/simple/
# In[42]:
from sklearn.model_selection import train_test_split
# In[43]:
# Default split: 75% train / 25% test, shuffled.
train_x, test_x, train_y, test_y = train_test_split(X_data, Y_data)
# In[44]:
train_x.shape, train_y.shape, test_x.shape, test_y.shape
# In[45]:
# Convert the numpy splits to float32 tensors for the model and BCELoss.
train_x = torch.from_numpy(train_x).type(torch.FloatTensor)
test_x = torch.from_numpy(test_x).type(torch.FloatTensor)
train_y = torch.from_numpy(train_y).type(torch.FloatTensor)
test_y = torch.from_numpy(test_y).type(torch.FloatTensor)
# In[46]:
# Shuffle only the training loader; validation uses double the batch size,
# since evaluation stores no gradients and can afford larger batches.
train_ds = TensorDataset(train_x, train_y)
train_dl = DataLoader(train_ds, batch_size=batch, shuffle=True)
valid_ds = TensorDataset(test_x, test_y)
valid_dl = DataLoader(valid_ds, batch_size=batch * 2)
# # 定义计算正确率函数
# In[49]:
def accuracy(out, yb):
    """Return the fraction of predictions matching the labels.

    Probabilities in `out` are thresholded at 0.5 (strictly greater) to
    produce hard 0/1 predictions, then compared element-wise against `yb`.
    """
    predicted = (out>0.5).type(torch.IntTensor)
    correct = (predicted == yb).float()
    return correct.mean()
# model.train()在训练之前调用代表训练模式
#
# model.eval() 推理之前进行调用代表推理模式
#
# 不同的模式仅会在使用nn.BatchNorm2d ,nn.Dropout等层时以确保这些不同阶段的行为正确。
# In[54]:
epochs = 500
# In[55]:
model, opt = get_model()
for epoch in range(epochs+1):
    model.train()  # training mode (matters for Dropout/BatchNorm layers)
    for xb, yb in train_dl:
        pred = model(xb)
        loss = loss_fn(pred, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()
    # Every 50 epochs, evaluate on the validation loader without gradients.
    if epoch%50==0:
        model.eval()
        with torch.no_grad():
            # NOTE(review): this averages per-batch mean losses, so a smaller
            # final batch is slightly over-weighted; fit() below weights by
            # batch size, which is the correct average.
            valid_loss = sum(loss_fn(model(xb), yb) for xb, yb in valid_dl)
            acc_mean = np.mean([accuracy(model(xb), yb) for xb, yb in valid_dl])
            print(epoch, valid_loss / len(valid_dl), acc_mean)
# # 优化
# In[57]:
class Logistic(nn.Module):
    """Deeper variant of the classifier: 20 -> 64 -> 64 -> 64 -> 1.

    Three ReLU hidden layers followed by a sigmoid head that yields a
    probability in (0, 1) for use with BCELoss.
    """

    def __init__(self):
        super().__init__()
        self.lin_1 = nn.Linear(20, 64)
        self.lin_2 = nn.Linear(64, 64)
        self.lin_3 = nn.Linear(64, 64)
        self.lin_4 = nn.Linear(64, 1)
        self.activate = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input):
        # Apply each hidden layer followed by ReLU, then the sigmoid head.
        out = input
        for layer in (self.lin_1, self.lin_2, self.lin_3):
            out = self.activate(layer(out))
        return self.sigmoid(self.lin_4(out))
# In[67]:
model, opt = get_model()
# Accuracy history for the validation and training splits, sampled every 50 epochs.
acc_val = []
acc_train = []
for epoch in range(epochs+1):
    model.train()  # training mode (matters for Dropout/BatchNorm layers)
    for xb, yb in train_dl:
        pred = model(xb)
        loss = loss_fn(pred, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()
    # Every 50 epochs, evaluate both splits without tracking gradients.
    if epoch%50==0:
        model.eval()
        with torch.no_grad():
            valid_loss = sum(loss_fn(model(xb), yb) for xb, yb in valid_dl)
            # NOTE(review): unweighted mean over batches — a smaller final
            # batch is slightly over-weighted; fit() below weights correctly.
            acc_mean_train = np.mean([accuracy(model(xb), yb) for xb, yb in train_dl])
            acc_mean_val = np.mean([accuracy(model(xb), yb) for xb, yb in valid_dl])
            acc_train.append(acc_mean_train)
            acc_val.append(acc_mean_val)
            print(epoch, valid_loss / len(valid_dl), acc_mean_train, acc_mean_val)
# # 创建fit()和get_data()
# In[72]:
def loss_batch(model, loss_func, xb, yb, opt=None):
    """Compute the loss for one batch; if an optimizer is given, also step it.

    Returns (loss_value, batch_size) so callers can form a size-weighted
    average across batches of unequal length.
    """
    batch_loss = loss_func(model(xb), yb)
    if opt is not None:
        # Training path: backprop, update weights, reset gradients.
        batch_loss.backward()
        opt.step()
        opt.zero_grad()
    return batch_loss.item(), len(xb)
# In[73]:
import numpy as np
def fit(epochs, model, loss_func, opt, train_dl, valid_dl):
    """Train for `epochs` epochs, printing the weighted validation loss each epoch.

    The validation loss is averaged with per-batch sizes as weights, so a
    smaller final batch does not skew the result.
    """
    for epoch in range(epochs):
        model.train()  # training mode (matters for Dropout/BatchNorm layers)
        for xb, yb in train_dl:
            loss_batch(model, loss_func, xb, yb, opt)
        model.eval()
        with torch.no_grad():
            results = [loss_batch(model, loss_func, xb, yb) for xb, yb in valid_dl]
            losses, nums = zip(*results)
        val_loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)
        print(epoch, val_loss)
# In[74]:
def get_data(train_ds, valid_ds, bs):
    """Build the training and validation DataLoaders from the given datasets.

    Training shuffles each epoch; validation uses double the batch size since
    evaluation needs no gradient storage.
    """
    train_loader = DataLoader(train_ds, batch_size=bs, shuffle=True)
    valid_loader = DataLoader(valid_ds, batch_size=bs * 2)
    return train_loader, valid_loader
# ### 现在,我们获取数据加载器和拟合模型的整个过程可以在3行代码中运行:
# In[77]:
# The whole pipeline now reduces to three lines: build loaders, build model, fit.
train_dl, valid_dl = get_data(train_ds, valid_ds, batch)
model, opt = get_model()
fit(epochs, model, loss_fn, opt, train_dl, valid_dl)
# In[ ]: