- 我这个的损失最后是500多,正确率百分之62,总的来说不值得借鉴,这是我入门pytorch后写的第一个象征性的分类器,后面更深入学习后再来优化。我的这个倒是还是能走完全流程的,对于只想试试水,学个流程的可以参考。
- 我参考的博主链接
- 上代码。总的来说可以分为五步:数据准备、数据读入与处理、构建模型、构建损失函数与优化器、训练。
# Integer codes for the two categorical columns. The identifiers (including
# the "Embarkded" spelling) are referenced by name elsewhere in this file,
# so they are kept exactly as they are.
MAP_Embarkded = dict(C=0, Q=1, S=2)
MAP_Sex = dict(male=1, female=0)
# Feature columns used below:
# ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
def uniformization(x, m):
    """Scale ``x`` by ``m`` when ``m`` is positive.

    Args:
        x: value to scale (anything that supports ``/``).
        m: divisor, e.g. a column maximum.

    Returns:
        ``x / m`` if ``m > 0``; otherwise ``x`` unchanged, which avoids
        dividing by zero or flipping the sign with a negative divisor.
    """
    if m > 0:
        return x / m
    return x
# data
class Dataset_my(Dataset):
    """Titanic-style tabular dataset loaded from CSV files.

    ``read_data()`` must be called once before the instance is indexed or
    its length is taken; the constructor itself performs no I/O.

    Args:
        data_dir: directory that contains ``train.csv``, ``test.csv`` and
            ``gender_submission.csv``. Defaults to the location that was
            previously hard-coded.
    """

    # Feature columns fed to the model, in this order.
    FEATURE_COLUMNS = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    # Constant used for missing test fares (value chosen by the original
    # author; presumably close to a typical fare -- TODO confirm).
    TEST_FARE_FILL = 14.4

    def __init__(self, data_dir='D:/01_MyLab/DeepLearning'):
        super().__init__()
        self.data_dir = data_dir
        self.data_train = None
        self.data_test = None
        self.train_label = None
        self.test_label = None

    def _csv(self, name):
        """Return the path of ``name`` inside ``data_dir``."""
        return '{}/{}'.format(str(self.data_dir).rstrip('/\\'), name)

    def read_data(self):
        """Load, clean, encode and min-max scale the train and test tables.

        Returns:
            ``(data_train, train_label, data_test, test_label)`` as
            DataFrames. Features are scaled with the *training* minimum and
            range, so both tables share one scale.
        """
        data_train = pd.read_csv(self._csv('train.csv'))
        data_test = pd.read_csv(self._csv('test.csv'))
        test_label = pd.read_csv(self._csv('gender_submission.csv'))

        # Fill missing numeric values: column mean for Age, constant for Fare.
        data_train['Age'] = data_train['Age'].fillna(data_train['Age'].mean())
        data_test['Fare'] = data_test['Fare'].fillna(self.TEST_FARE_FILL)
        data_test['Age'] = data_test['Age'].fillna(data_test['Age'].mean())

        # Work on copies so the column assignments below never write into a
        # view of the raw frames.
        columns = self.FEATURE_COLUMNS
        self.data_train = data_train[columns].copy()
        self.data_test = data_test[columns].copy()
        self.train_label = data_train[['Survived']].copy()
        self.test_label = test_label[['Survived']].copy()

        self.manage_data()

        # Min-max scaling: (x - min) / (max - min). The subtraction must be
        # parenthesised; without it only ``min`` is divided by the range.
        lowest = self.data_train.min()
        span = self.data_train.max() - lowest
        span = span.where(span > 0, 1)  # constant column: avoid dividing by 0
        self.data_train = (self.data_train - lowest) / span
        self.data_test = (self.data_test - lowest) / span
        return self.data_train, self.train_label, self.data_test, self.test_label

    def manage_data(self):
        """Encode the ``Sex`` and ``Embarked`` string columns as numbers.

        Each frame is encoded from its *own* columns. Values that are missing
        or absent from the map become NaN after ``map``; for ``Embarked``
        they are replaced by the most frequent training code so that no NaN
        reaches the model. Calling this twice on the same frames would map
        already-encoded numbers and produce NaN.

        Returns:
            ``0`` (kept for backward compatibility).
        """
        for frame in (self.data_train, self.data_test):
            frame['Sex'] = frame['Sex'].map(MAP_Sex)
            frame['Embarked'] = frame['Embarked'].map(MAP_Embarkded)

        most_common = self.data_train['Embarked'].mode()
        if not most_common.empty:
            fallback = most_common.iloc[0]
            for frame in (self.data_train, self.data_test):
                frame['Embarked'] = frame['Embarked'].fillna(fallback)
        return 0

    def _require_loaded(self):
        """Raise a clear error when ``read_data()`` has not been called."""
        if self.data_train is None or self.train_label is None:
            raise RuntimeError('call read_data() before using the dataset')

    def __getitem__(self, index):
        """Return ``(features, label)`` float tensors for training row ``index``."""
        self._require_loaded()
        # ``.iloc`` selects by row position; plain ``frame[index]`` would be a
        # column lookup on a DataFrame.
        features = torch.tensor(self.data_train.iloc[index].values, dtype=torch.float)
        label = torch.tensor(self.train_label.iloc[index].values, dtype=torch.float)
        return features, label

    def __len__(self):
        """Return the number of training rows."""
        self._require_loaded()
        return len(self.data_train)
# data_loader=DataLoader(dataset=dataset,shuffle=True,batch_size=50,num_workers=4)
# Model definition
class Model(torch.nn.Module):
    """Small fully connected binary classifier with sigmoid activations.

    Three linear layers, each followed by a sigmoid, so the output is one
    value in (0, 1) per sample and can be fed directly to ``BCELoss``.

    Args:
        in_features: number of input features per sample (default 7).
        hidden: width of the two hidden layers (default 4).
    """

    def __init__(self, in_features=7, hidden=4):
        super().__init__()
        self.linear1 = torch.nn.Linear(in_features, hidden)
        self.linear2 = torch.nn.Linear(hidden, hidden)
        self.linear3 = torch.nn.Linear(hidden, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        """Map a ``(..., in_features)`` float tensor to ``(..., 1)`` outputs."""
        for layer in (self.linear1, self.linear2, self.linear3):
            x = self.sigmoid(layer(x))
        return x
my_model = Model()
# Loss for binary classification: binary cross-entropy, summed over all
# samples (so its magnitude grows with the number of rows).
criterion = torch.nn.BCELoss(reduction='sum')
optimizer = torch.optim.SGD(my_model.parameters(), lr=0.0001)

# Load the data and convert DataFrame -> numpy array -> float tensor.
dataset = Dataset_my()
train_data, label1, test_data, label2 = dataset.read_data()
train_data = torch.tensor(np.array(train_data), dtype=torch.float)
test_data = torch.tensor(np.array(test_data), dtype=torch.float)
label1 = torch.tensor(np.array(label1), dtype=torch.float)
label2 = torch.tensor(np.array(label2), dtype=torch.float)

# Training: full-batch gradient descent.
EPOCHS = 5000
LOG_EVERY = 500
num = len(train_data)
R = 0  # number of correctly classified training samples in the latest epoch
loss = 0
for epoch in range(EPOCHS):
    y_pred = my_model(train_data)
    loss = criterion(y_pred, label1)

    # A prediction counts as correct when it lies within 0.5 of its label.
    # The comparison is done on floats: casting a sigmoid output to an
    # integer type first would truncate almost every prediction to 0.
    with torch.no_grad():
        R = int(((y_pred - label1).abs() < 0.5).sum().item())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % LOG_EVERY == 0 or epoch == EPOCHS - 1:
        rate = (R / num) * 100 if num else 0.0
        print("epoch", epoch, "loss:", loss.item(), "Correction rate:", rate)
注意事项
- 文件读入 pandas的read_csv函数
- pandas 在读取csv文件后,可以利用 file["Property"] 来直接获得某一列的属性值
- pandas在读取csv文件后,也会自动将列属性作为读入的文件的一个属性,file.Property直接进行调用和操作
- 利用fillna()函数可以将缺失的数据补齐,补齐方式有常数,均值,中位数等
- 如果想要对获取的内容进行操作,需要加一步,将读入的内容变为DataFrame()类型
train_label=pd.DataFrame(train_label)
train_label[3]
- 利用map()函数将字符类型的数据映射为数值类型
MAP_SEX={"male":1,"female":0}
data_train['Sex']=data_train['Sex'].map(MAP_SEX)
- !!! pandas里直接返回的DataFrame类型,在进行训练之前需要进行转换。首先转换为numpy.array()类型,然后再转换为torch.Tensor类型,并且必须设置tensor的dtype属性
train_data=np.array(train_data)
train_data=torch.tensor(train_data,dtype=torch.float)