1.题目链接
Titanic: Machine Learning from Disaster
2.参考资料
3.线上成绩
截至2019年2月24日
排名:413/9909 前4%
线上成绩:0.82296
4.流程及代码
4.1 载入数据
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# Read the training and test CSV files from the local ./data directory.
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')
# train_x is an alias of the full training frame (the label column is kept);
# train_y is the label column taken from it.
train_x = train_data
train_y = train_x["Survived"]
# Stack the training rows on top of the test rows and renumber the combined
# frame 0..n-1 so later steps can address rows by a single running index.
all_data = pd.concat([train_x, test_data], ignore_index=True)
4.2 缺失值填充
# Fill Embarked: within a passenger class, compare the passenger's fare with
# the median fare per embarkation port and pick the port whose median is the
# closest.
P1 = all_data[all_data['Pclass'] == 1][['Fare', 'Embarked']]
# Inspection only (notebook-style output): the result is not stored.
P1.groupby('Embarked')['Fare'].median()
# Hard-coded row labels of the combined frame; presumably the two rows with a
# missing Embarked value -- TODO confirm against the data.
all_data.loc[829, 'Embarked'] = 'C'
all_data.loc[61, 'Embarked'] = 'C'

P3 = all_data[all_data['Pclass'] == 3][['Fare', 'Embarked']]
# Inspection only (notebook-style output): the result is not stored.
P3.groupby('Embarked')['Fare'].median()
# Hard-coded row label and value; presumably the row with a missing Fare,
# filled from the 3rd-class medians above -- TODO confirm against the data.
all_data.loc[1043, 'Fare'] = 8.05

# Family = the part of Name before the first comma.
all_data_Name = all_data['Name']
all_data_Family = all_data_Name.str.split(',', expand=True)[0]
all_data['Family'] = all_data_Family

# Passengers with a missing Age who have siblings/spouse aboard (SibSp > 0)
# but no parents/children (Parch == 0). The three conditions are combined into
# one mask built on all_data, instead of indexing an already-filtered frame
# with a full-length mask (which relies on implicit reindexing and raises a
# pandas UserWarning). The selection is computed once; the original repeated
# these two statements verbatim.
only_have_Sibsp = all_data[
    all_data['Age'].isnull()
    & (all_data['SibSp'] > 0)
    & (all_data['Parch'] == 0)
]
# Distinct ticket values among those passengers.
Sibsp_ticket = only_have_Sibsp['Ticket'].unique()
for f in Sibsp_ticket:
tmp = all_data[all_data['Ticket'] =&#