1. Loading the data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
train_df = pd.read_csv("tatannic/train.csv")
test_df = pd.read_csv("tatannic/test.csv")
2. Feature type analysis
print(train_df.shape)
train_df.head()
print(test_df.shape)
test_df.head()
- Note that the test set lacks the 'Survived' column that the training set has; of course, that is exactly the target we want to predict.
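A quick way to confirm the column difference between the two sets (a minimal sketch):
# Columns present in train but absent from test -- should be only the target.
print(set(train_df.columns) - set(test_df.columns))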
train_df.describe()
sns.countplot(x='Survived', data=train_df)
train_df['Survived'].value_counts()
train_df.info()
- There are 12 features in total: 7 numeric and 5 categorical. 'Age' and 'Cabin' have a lot of missing values, while 'Embarked' is only missing a few.
train_df.describe(include=['object'])
3. Dropping irrelevant features
Categorical features
- Name
- We are predicting survival, and the raw 'Name' string has no direct bearing on it, so we drop this feature here.
train_df.drop('Name', axis=1, inplace=True)
test_df.drop('Name', axis=1, inplace=True)
- Ticket
train_df['Ticket'].value_counts()
ticket_count = train_df['Ticket'].value_counts()
ticket_count = ticket_count[ticket_count>=4]
ticket_count_df = train_df[train_df['Ticket'].isin(ticket_count.index)]
ticket_count_df['Ticket'].value_counts()
fig, ax = plt.subplots(figsize=(12, 9))
sns.barplot(data=ticket_count_df, x='Ticket', y='Survived')
plt.xticks(rotation=90)
- Tickets 113760 and 2666 show survival rates of essentially 100%, and 347077, 1601, 17421, and PC17757 also show high survival rates.
# Flag passengers holding any of the high-survival ticket numbers.
high_survival_tickets = ['113760', '2666', '347077', '1601', '17421', 'PC17757']
train_df['Ticket_rate'] = train_df['Ticket'].isin(high_survival_tickets).astype(int)
test_df['Ticket_rate'] = test_df['Ticket'].isin(high_survival_tickets).astype(int)
train_df.drop('Ticket', axis=1, inplace=True)
test_df.drop('Ticket', axis=1, inplace=True)
- Embarked
train_df['Embarked'].value_counts()
sns.barplot(data=train_df, x='Embarked', y='Survived')
- Passengers who embarked at C have the highest survival rate; S and Q are about the same (exact numbers below).
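The bar plot can be backed with exact numbers; a quick sketch:
# Mean survival rate per embarkation port, highest first.
train_df.groupby('Embarked')['Survived'].mean().sort_values(ascending=False)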
- Sex
train_df['Sex'].value_counts()
sns.barplot(data=train_df, x='Sex', y='Survived')
- Women's survival rate is far higher than men's.
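The same comparison as a normalized crosstab, showing the survived/died split within each sex (a small sketch):
# Each row sums to 1: the fraction of that sex who died (0) or survived (1).
pd.crosstab(train_df['Sex'], train_df['Survived'], normalize='index')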
- Cabin
train_df['Cabin'].value_counts()
Cabin_count = train_df['Cabin'].value_counts()
Cabin_count = Cabin_count[Cabin_count >= 3]
Cabin_df = train_df[train_df['Cabin'].isin(Cabin_count.index)]
Cabin_df['Cabin'].value_counts()
fig, ax = plt.subplots(figsize=(12, 9))
sns.barplot(data=Cabin_df, x='Cabin', y='Survived')
- The plot shows that cabins F33, E101, and B96B98 have 100% survival, followed by F2 and D.
# Flag passengers in any of the high-survival cabins.
high_survival_cabins = ['F33', 'E101', 'B96B98', 'F2', 'D']
train_df['Cabin_rate'] = train_df['Cabin'].isin(high_survival_cabins).astype(int)
test_df['Cabin_rate'] = test_df['Cabin'].isin(high_survival_cabins).astype(int)
train_df.drop('Cabin', axis=1, inplace=True)
test_df.drop('Cabin', axis=1, inplace=True)
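A quick sanity check that the two engineered flags really separate survival rates (a sketch using the columns created above):
# Mean survival rate with the flag set vs. not set.
for flag in ['Ticket_rate', 'Cabin_rate']:
    print(train_df.groupby(flag)['Survived'].mean(), '\n')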
Numeric features
value_cols = train_df.describe().columns
train_df[value_cols].nunique().describe()
train_df[value_cols].nunique().plot(kind='bar', logy=True)
print("全部为unique的特征", train_df[value_cols].columns[train_df[value_cols].nunique() == len(train_df)])
- PassengerId is just a row index and says nothing about survival, so we drop it (keeping the test IDs for the submission file).
train_df.drop('PassengerId', axis=1, inplace=True)
test_ID = test_df['PassengerId']
test_df.drop('PassengerId', axis=1, inplace=True)
4. Data type conversion
train_df.describe(include=['object'])
sex_map = {'male':0, 'female':1}
train_df['Sex'] = train_df['Sex'].map(sex_map)
test_df['Sex'] = test_df['Sex'].map(sex_map)
train_df['Embarked'].value_counts()
embarked_map = {'S':0, 'C':1, 'Q':2}
train_df['Embarked'] = train_df['Embarked'].map(embarked_map)
test_df['Embarked'] = test_df['Embarked'].map(embarked_map)
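Mapping the ports to 0/1/2 imposes an artificial ordering. Tree models do not mind, but for SVC and KNN a one-hot encoding would be the safer choice; a sketch of that alternative (it would replace the map above, applied to the raw string column):
# One-hot encoding: one 0/1 indicator column per port, no implied order.
pd.get_dummies(train_df, columns=['Embarked'], prefix='Embarked').head()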
5. Handling missing values
def missing_values_table(df):
    # Count and percentage of missing values per column, non-missing columns dropped.
    missing_values = df.isnull().sum()
    missing_values_percent = 100 * df.isnull().sum() / len(df)
    table = pd.concat([missing_values, missing_values_percent], axis=1,
                      keys=['Missing Values', '% of Total Values'])
    table = table[table.iloc[:, 1] != 0].sort_values(
        "% of Total Values", ascending=False)
    return table
missing_values = missing_values_table(train_df)
missing_values.head()
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].median())
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])
test_df['Embarked'] = test_df['Embarked'].fillna(test_df['Embarked'].mode()[0])
- Here I fill Age with the median and Embarked with the mode: the median is robust to the skewed age distribution, and for a categorical column like Embarked the most frequent value is the natural default.
missing_values_table(train_df).head()
missing_values_table(test_df).head()
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mode()[0])
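For reference, a slightly more informed imputation would use the median Age within each (Sex, Pclass) group, since age distributions differ noticeably across class and sex. A sketch of that alternative (not what is used above):
# Group-wise median imputation instead of the global median.
for df in (train_df, test_df):
    df['Age'] = df['Age'].fillna(
        df.groupby(['Sex', 'Pclass'])['Age'].transform('median'))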
6. Model fitting
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
label = train_df['Survived']
train_df.drop('Survived', axis=1, inplace=True)
pipe_svc = make_pipeline(MinMaxScaler(), SVC())
pipe_svc.fit(train_df, label)
y_svc = pipe_svc.predict(test_df)
pipe_svc.score(train_df, label)
pipe_rfc = make_pipeline(MinMaxScaler(), RandomForestClassifier(n_estimators=300))
pipe_rfc.fit(train_df, label)
y_rfc = pipe_rfc.predict(test_df)
print(pipe_rfc)
pipe_rfc.score(train_df, label)
pipe_knn = make_pipeline(MinMaxScaler(), KNeighborsClassifier())
pipe_knn.fit(train_df, label)
y_knn = pipe_knn.predict(test_df)
pipe_knn.score(train_df, label)
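The three scores above are training accuracy, which is optimistic. A quick cross-validation gives a fairer comparison between the pipelines (a sketch):
from sklearn.model_selection import cross_val_score
# 5-fold CV accuracy for each pipeline, mean +/- standard deviation.
for name, pipe in [('SVC', pipe_svc), ('RFC', pipe_rfc), ('KNN', pipe_knn)]:
    scores = cross_val_score(pipe, train_df, label, cv=5)
    print(f"{name}: {scores.mean():.3f} +/- {scores.std():.3f}")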
7. Hyperparameter tuning
- The random forest performed best above, so here I give grid search a quick try on it.
from sklearn.model_selection import GridSearchCV
n_fold = 5
param = {'n_estimators': [20, 50, 100, 200]}
rfc = RandomForestClassifier()
scaler = MinMaxScaler().fit(train_df)
train_df_std = scaler.transform(train_df)
test_df_std = scaler.transform(test_df)
grid = GridSearchCV(rfc, param, cv=n_fold)
grid.fit(train_df_std, label)
print(grid.best_score_)   # best cross-validated score
print(grid.best_params_)  # best parameters
display(pd.DataFrame(grid.cv_results_).T)
- Surprisingly, the score actually dropped... This makes sense, though: the scores above were training accuracy, while grid search reports cross-validated accuracy, which is a more honest estimate of generalization.
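A broader grid over more than just n_estimators might help; a sketch (the parameter values are illustrative guesses, not tuned results):
# Also search tree depth and leaf size, which control overfitting.
param = {
    'n_estimators': [100, 300],
    'max_depth': [None, 5, 10],
    'min_samples_leaf': [1, 3, 5],
}
grid = GridSearchCV(RandomForestClassifier(random_state=0), param, cv=n_fold)
grid.fit(train_df_std, label)
print(grid.best_score_, grid.best_params_)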
rfc = RandomForestClassifier()
rfc.fit(train_df_std, label)
test_label = rfc.predict(test_df_std)
submission = pd.DataFrame({
    "PassengerId": test_ID,
    "Survived": test_label,
})
submission['Survived'].value_counts()
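Finally the submission has to be written to disk for upload; the filename here is an assumption:
# Two-column CSV in the format Kaggle expects.
submission.to_csv("submission.csv", index=False)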
- Things to dig into further:
- outlier analysis
- ensembling multiple models
- ...