Dataset and DataLoader
传送门:https://www.bilibili.com/video/BV1Y7411d7Ys?p=8
数据集
常见定义:
Epoch:全部训练样本通过一次前向传播和反向传播
Batch-Size:通过一次前向传播和反向传播的训练样本数量
Iteration:每个Epoch中Batch的个数(即完成一个Epoch所需的迭代次数)
公式:$\text{Epoch} = \text{Batch-Size} \times \text{Iteration}$(即一个Epoch的样本总数 = Batch-Size × Iteration)
多Batch训练

1、Dataset 是抽象类,不能直接实例化对象,需要继承它来构造我们自己的数据集
2、DataLoader 需要获取Dataset提供的索引[i]和len;用来帮助我们加载数据,比如说做shuffle(提高数据集的随机性),batch_size,能拿出Mini-Batch进行训练。它帮我们自动完成这些工作。DataLoader可实例化对象。DataLoader is a class that helps us load data in PyTorch.
3、__getitem__目的是为支持下标(索引)操作
代码:
1、需要mini_batch 就需要import DataSet和DataLoader数据集方案
2、继承Dataset的类需要重写__init__、__getitem__、__len__魔法函数。分别是为了加载数据集,按索引获取数据,获取数据总量。
3、DataLoader对数据集先打乱(shuffle),然后划分成mini_batch。
4、len函数的返回值 除以 batch_size 的结果(向上取整)就是每一轮epoch中需要迭代的次数。
5、inputs, labels = data中的inputs的shape是[32,8],labels 的shape是[32,1]。也就是说,mini_batch就体现在这里(最后一个batch的样本数可能不足32)。
6、diabetes.csv数据集第七讲已给下载地址,该数据集需和源代码放在同一个文件夹内(下面的代码读取的文件名是 diabetes.csv.gz)。
import numpy as np
import torch
from torch.utils.data import Dataset #Dataset 抽象类
from torch.utils.data import DataLoader #dataloader 可实例化
# 1. Prepare the dataset
class DiabetesDataset(Dataset):
    """Map-style dataset backed by a comma-delimited numeric text file.

    Every column except the last is treated as a feature and the last column
    as the target. The whole file is loaded into memory on construction.
    """

    def __init__(self, filepath):
        """Load ``filepath`` and split it into feature and target tensors.

        Args:
            filepath: Path to a comma-delimited file readable by
                ``np.loadtxt``.
        """
        # ndmin=2 keeps a one-row file two-dimensional so the column slicing
        # below still works.
        xy = np.loadtxt(filepath, delimiter=',', dtype=np.float32, ndmin=2)
        self.len = xy.shape[0]
        self.x_data = torch.from_numpy(xy[:, :-1])
        # Indexing with a list keeps the target as a column (trailing dim 1).
        self.y_data = torch.from_numpy(xy[:, [-1]])

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of rows loaded from the file."""
        return self.len
# Load the data. The file is expected next to this script; change the path
# otherwise.
dataset = DiabetesDataset('diabetes.csv.gz')

# Loader settings: mini-batches of 32 samples, shuffling enabled, two worker
# processes for loading.
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=2,
)
# 2. Design the model
class Model(torch.nn.Module):
    """Fully connected network with layer sizes 8 -> 6 -> 4 -> 1.

    ReLU follows the first two linear layers and a sigmoid follows the last,
    so every output value lies in [0, 1].
    """

    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(8, 6)
        self.linear2 = torch.nn.Linear(6, 4)
        self.linear3 = torch.nn.Linear(4, 1)
        self.activate = torch.nn.ReLU()  # hidden-layer activation
        self.sigmoid = torch.nn.Sigmoid()  # output activation

    def forward(self, x):
        """Run ``x`` (last dimension 8) through the three layers.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of 1.
        """
        x = self.activate(self.linear1(x))
        x = self.activate(self.linear2(x))
        x = self.sigmoid(self.linear3(x))
        return x
model = Model()

# 3. Build the loss function and the optimizer
# `size_average` is deprecated in torch.nn losses; `reduction='mean'` is the
# equivalent setting (average the loss over the batch).
criterion = torch.nn.BCELoss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
# 4. Training cycle
# The guard keeps the loop from running when this module is imported.
if __name__ == "__main__":
    for epoch in range(100):
        for i, data in enumerate(train_loader, 0):
            # 1. Prepare data: each batch unpacks into (inputs, labels).
            inputs, labels = data
            # 2. Forward pass
            y_pred = model(inputs)
            loss = criterion(y_pred, labels)
            print(epoch, i, loss.item())
            # 3. Backward pass; clear the gradients accumulated by the
            # previous step before computing new ones.
            optimizer.zero_grad()
            loss.backward()
            # 4. Update the parameters
            optimizer.step()
Titanic作业
数据集下载 链接:https://pan.baidu.com/s/1_cTzeBt6BLUo9xxkQ9Ce0w
提取码:n9ph
#Titanic Dataset training
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
# Prepare dataset
# Read the training file and discard the columns that are not fed to the model.
train_data = pd.read_csv("train.csv").drop(
    columns=['PassengerId', 'Cabin', 'Name', 'Ticket']
)

# Missing ages become the mean age; missing embarkation values become the
# most frequent one.
train_data['Age'] = train_data['Age'].fillna(train_data["Age"].mean())
ans = train_data['Embarked'].value_counts()
fillstr = ans.idxmax()
train_data['Embarked'] = train_data['Embarked'].fillna(fillstr)

# Encode the string categories as numbers, then make the whole frame float32.
train_data = train_data.replace(["female", "male"], [0, 1])
train_data = train_data.replace(["Q", "C", "S"], [0, 1, 2])
train_data = train_data.astype(np.float32)

# Inspect the data
# train_data.info()
# print(train_data.head())
# print(train_data.describe())

# Column 0 is used as the target, columns 1..7 as the input features.
train_data = np.array(train_data)
x_data = torch.from_numpy(train_data[:, 1:8])
y_data = torch.from_numpy(train_data[:, [0]])
# Design Model
class Model(torch.nn.Module):
    """Fully connected network with layer sizes 7 -> 4 -> 2 -> 1.

    ReLU follows the first two linear layers and a sigmoid follows the last,
    so every output value lies in [0, 1].
    """

    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(7, 4)
        self.linear2 = torch.nn.Linear(4, 2)
        self.linear3 = torch.nn.Linear(2, 1)
        self.activate = torch.nn.ReLU()  # hidden-layer activation
        self.sigmoid = torch.nn.Sigmoid()  # output activation

    def forward(self, x):
        """Run ``x`` (last dimension 7) through the three layers.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of 1.
        """
        x = self.activate(self.linear1(x))
        x = self.activate(self.linear2(x))
        x = self.sigmoid(self.linear3(x))
        return x
model = Model()

# Loss and optimizer
# The deprecated `size_average` argument is dropped; `reduction='mean'` alone
# already averages the loss over the batch.
criterion = torch.nn.BCELoss(reduction='mean')
optim = torch.optim.Adam(model.parameters(), lr=0.01)

# Per-epoch history used for the loss curve.
epoch_list = []
loss_list = []
# ------------ training cycle -----------------
for epoch in range(15000):
    # Full-batch training: the entire training tensor goes through each pass.
    y_pred = model(x_data)
    loss = criterion(y_pred, y_data)
    optim.zero_grad()
    loss.backward()
    optim.step()
    if (epoch % 5000) == 0:
        print("Epoch = " + str(epoch) + '\t' + 'loss = ' + str(format(loss.item(), '.4f')))
    # NOTE(review): the original indentation was lost; the history is assumed
    # to be recorded every epoch (not only on the printed ones) - confirm.
    epoch_list.append(epoch)
    loss_list.append(loss.item())

# Only the predictions of the final forward pass are used afterwards, so they
# are extracted once here instead of on every epoch. These come from the pass
# made before the last optimizer step.
res = y_pred.detach().numpy()
# Plot the results
import os

# NOTE(review): presumably works around a duplicate OpenMP runtime error when
# plotting alongside torch - confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
plt.plot(epoch_list, loss_list)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title('result of predicting of Titanic Dataset')
plt.show()

# Threshold the predicted probabilities at 0.5 to obtain 0/1 labels (in place).
res[res >= 0.5] = 1
res[res < 0.5] = 0
train_size = x_data.shape[0]
print(train_size)

# Count matching predictions in one vectorised comparison. Keeping the count a
# plain int makes `accuracy` a Python float rather than a one-element tensor.
cnt = int((res == y_data.numpy()).sum())
accuracy = cnt / train_size
print("精确率为:", accuracy)
# -------------- test ---------------
test_data = pd.read_csv('test.csv')
# PassengerId is kept here because it is written to the output file below.
test_data = test_data.drop(columns=['Cabin', 'Name', 'Ticket'])

# Missing ages become the median age; missing embarkation values become the
# most frequent one.
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].median())
ans = test_data['Embarked'].value_counts()
fillstr = ans.idxmax()
test_data['Embarked'] = test_data['Embarked'].fillna(fillstr)

# Encode the string categories as numbers, then make the whole frame float32.
test_data = test_data.replace(['female', 'male'], [0, 1])
test_data = test_data.replace(["Q", "C", "S"], [0, 1, 2])
test_data = test_data.astype(np.float32)

# Any value still missing would become a NaN prediction, which matches neither
# threshold below and cannot be exported as an integer. Fill remaining gaps
# with the column median.
test_data = test_data.fillna(test_data.median())
print(test_data.head())

test = np.array(test_data)
# Columns 1..7 are the model inputs.
# NOTE(review): assumes PassengerId is column 0 of test.csv - confirm.
x_test = torch.from_numpy(test[:, 1:8])
# Inference only: no gradient tracking is needed.
with torch.no_grad():
    y_test = model(x_test)
y = y_test.numpy()
y[y >= 0.5] = 1
y[y < 0.5] = 0

# Export the results
pIds = pd.DataFrame(data=test_data['PassengerId'].astype(int), columns=['PassengerId'], dtype=int)
preds = pd.DataFrame(data=y, columns=['Survived'], dtype=int)
result = pd.concat([pIds, preds], axis=1)
result.to_csv('tantic_prediction.csv', index=False)

9万+

被折叠的 条评论
为什么被折叠?



