再刷泰坦尼克号
数据探索和可视化:形成假设的第一步
导入需要的包
#忽略警告提示
import warnings
warnings.filterwarnings('ignore')
#数据处理
import pandas as pd
import numpy as np
import random
#可视化
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
数据读入
# Directory that holds the Titanic CSV files.
path='C:/Users/Titanic/'
# Read each CSV through a context manager so the file handle is closed as
# soon as pandas has finished parsing it (previously both handles leaked).
with open(path+'train.csv') as p1:
    train=pd.read_csv(p1)
with open(path+'test.csv') as p2:
    test=pd.read_csv(p2)
数据概览
# Preview the first three rows of the training set.
train.head(3)
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th… | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
# Print the column overview of the training set.
train.info()
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
train.describe()
| PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare |
---|---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
# Pairwise correlation between the numeric features (PassengerId is only a row id).
# select_dtypes keeps the numeric columns explicitly: pandas >= 2.0 no longer
# drops string columns such as Name/Sex/Ticket silently inside corr() and
# raises instead.
train.drop('PassengerId',axis=1).select_dtypes(include='number').corr()
| Survived | Pclass | Age | SibSp | Parch | Fare |
---|---|---|---|---|---|---|
Survived | 1.000000 | -0.338481 | -0.077221 | -0.035322 | 0.081629 | 0.257307 |
Pclass | -0.338481 | 1.000000 | -0.369226 | 0.083081 | 0.018443 | -0.549500 |
Age | -0.077221 | -0.369226 | 1.000000 | -0.308247 | -0.189119 | 0.096067 |
SibSp | -0.035322 | 0.083081 | -0.308247 | 1.000000 | 0.414838 | 0.159651 |
Parch | 0.081629 | 0.018443 | -0.189119 | 0.414838 | 1.000000 | 0.216225 |
Fare | 0.257307 | -0.549500 | 0.096067 | 0.159651 | 0.216225 | 1.000000 |
# A heatmap makes the correlation matrix easier to read than the raw table.
# Configure seaborn in ONE call: a second sns.set(style='white') would reset
# context and font back to their defaults and undo the first call.
sns.set(context='paper',style='white',font='monospace')
f,ax=plt.subplots(figsize=(10,6))
# Restrict to numeric columns explicitly; pandas >= 2.0 raises when corr()
# meets string columns.
train_corr=train.drop('PassengerId',axis=1).select_dtypes(include='number').corr()
# Pass the colormap by name; plt.cm.get_cmap was removed in matplotlib 3.9.
sns.heatmap(train_corr,ax=ax,vmax=.9,square=True,cmap='RdYlBu')
ax.set_xticklabels(train_corr.index,size=15)
ax.set_yticklabels(train_corr.columns,size=15)
ax.set_title('train feature corr',fontsize=20)
- Pclass与获救情况负相关
- Fare和获救情况正相关
- Pclass和Fare负相关
Age
# Take a closer look at Age. Missing ages are filled with -20 so that they
# show up as a separate bump to the left of the real ages.
fig,axes=plt.subplots(nrows=2,ncols=1,figsize=(8,6))
sns.set_style('white')
ax0,ax1=axes
age_filled=train.Age.fillna(-20)

# Top panel: overall age distribution with a rug plot.
sns.distplot(age_filled,rug=True,color='b',ax=ax0)
ax0.set_title('Age distribution')
ax0.set_xlabel('')

# Bottom panel: age density split by survival outcome.
ax1.set_title('Age survived distribution')
k1=sns.distplot(age_filled[train.Survived==0],hist=False,color='r',ax=ax1,label='dead')
k2=sns.distplot(age_filled[train.Survived==1],hist=False,color='g',ax=ax1,label='alive')
ax1.set_xlabel('')
ax1.legend(fontsize=16)
上图暂时将年龄的缺省值用-20进行了填充,再对年龄的分布和获救情况进行了展示,可以观察到
* 年龄分布范围很宽,以及小孩和中等偏大的人获救概率更高
* age和survived之间并不是简单的线性关系
* 年龄缺省的部分获救的人更少
# Age density per sex.
fig,ax=plt.subplots(figsize=(8,3))
ax.set_title('Sex Age dist',size=20)
# Drop only the missing ages. Calling dropna() on the whole frame also threw
# away every passenger with a missing Cabin (most of the data set), so the
# curves were drawn from a small, biased sample.
female_age=train[train.Sex=='female'].Age.dropna()
male_age=train[train.Sex=='male'].Age.dropna()
sns.distplot(female_age,hist=False,color='pink',label='female',ax=ax)
sns.distplot(male_age,hist=False,color='b',label='male',ax=ax)
ax.legend(fontsize=15)
女性更年轻,男性中老年居多,小孩则是男孩更多一点
# Age density per passenger class.
fig,ax=plt.subplots(figsize=(8,3))
ax.set_title('Pclass Age dist',size=20)
# Drop only the missing ages. Calling dropna() on the whole frame also threw
# away every passenger with a missing Cabin (most of the data set), so the
# curves were drawn from a small, biased sample.
for pclass,color in zip([1,2,3],['pink','b','g']):
    class_age=train[train.Pclass==pclass].Age.dropna()
    sns.distplot(class_age,hist=False,color=color,label='P%d'%pclass,ax=ax)
ax.legend(fontsize=15)
舱位等级越高,年龄越偏大
Pclass
# Stacked bar chart: deaths (red) with survivors (green) on top, per class.
survived_col=train.Survived
y_dead=train[survived_col==0].groupby('Pclass')['Survived'].count()
y_alive=train[survived_col==1].groupby('Pclass')['Survived'].count()
pos=[1,2,3]
ax=plt.figure(figsize=(8,4)).add_subplot(111)
ax.bar(pos,y_dead,label='dead',color='r',alpha=.6)
# The survivors are stacked on top of the dead counts.
ax.bar(pos,y_alive,label='alive',color='g',alpha=.6,bottom=y_dead)
ax.legend(loc='best',fontsize=16)
ax.set_xticks(pos)
class_names=[f'Pclass{i}' for i in range(1,4)]
ax.set_xticklabels(class_names,size=15)
ax.set_title('Pclass Survived count',size=20)
三个舱人数对比:
* 三等舱总体人数遥遥领先
* 从获救比例来看,头等舱存活率最高,三等舱死亡率最高
pos=range(0,6)
# Age arrays for every (Pclass, Survived) pair, ordered
# (1,0), (1,1), (2,0), (2,1), (3,0), (3,1).
age_list=[
    train[(train.Pclass==pclass)&(train.Survived==outcome)].Age.values
    for pclass in range(1,4)
    for outcome in range(0,2)
]
fig,axes=plt.subplots(3,1,figsize=(10,6))
# One panel per class: even slots hold the dead, odd slots the survivors.
for i_Pclass,(ax,dead_ages,alive_ages) in enumerate(zip(axes,age_list[0::2],age_list[1::2]),start=1):
    sns.distplot(dead_ages,hist=False,ax=ax,label=f'Pclass:{i_Pclass} ,survived:0',color='r')
    sns.distplot(alive_ages,hist=False,ax=ax,label=f'Pclass:{i_Pclass} ,survived:1',color='g')
    ax.set_xlabel('age',size=15)
    ax.legend(fontsize=15)
观察可知,总体来讲三个舱都是小孩获救多,其中一等二等舱小孩保护的很好,三等舱小孩也存在一定死亡率
Sex
# Number of passengers per sex.
print(train.Sex.value_counts())
# Mean of Survived per sex, i.e. the survival rate of each sex.
print(train.groupby('Sex')['Survived'].mean())
male 577
female 314
Name: Sex, dtype: int64
Sex
female 0.742038
male 0.188908
Name: Survived, dtype: float64
男女数量比例577:314
女性生存率约74%,男性仅约19%
# Split violin plot of Age per sex, one half per survival outcome.
ax=plt.figure(figsize=(10,4)).add_subplot(111)
# Only Age must be present for this plot. A bare dropna() also removed every
# passenger with a missing Cabin, leaving a small, biased sample.
age_known=train.dropna(subset=['Age'])
# Fix the category order explicitly so that the hand-written tick labels
# below are guaranteed to match the violins, whatever the row order is.
sns.violinplot(x='Sex',y='Age',hue='Survived',data=age_known,split=True,
               order=['female','male'],ax=ax)
ax.set_xlabel('Sex',size=20)
ax.set_xticklabels(['Female','male'],size=18)
ax.set_ylabel('Age',size=20)
ax.legend(fontsize=25,loc='best')
女性获救的年龄集中在中段年龄,男性小孩和年轻人容易获救
# Tick labels in the same order as the sorted (Sex, Pclass) groupby keys.
label=['sex:%s,Pclass:%d'%(sex_i,pclass_i)
       for sex_i in ['female','male']
       for pclass_i in range(1,4)]
pos=range(6)
fig=plt.figure(figsize=(16,4))
ax=fig.add_subplot(111)
# Head counts per (Sex, Pclass) group for each outcome.
dead_counts=train[train['Survived']==0].groupby(['Sex','Pclass'])['Survived'].count().values
alive_counts=train[train['Survived']==1].groupby(['Sex','Pclass'])['Survived'].count().values
ax.bar(pos,dead_counts,
       color='r',alpha=.5,align='center',tick_label=label,label='dead')
# Survivors are stacked on top of the dead counts.
ax.bar(pos,alive_counts,bottom=dead_counts,
       color='g',alpha=.5,align='center',tick_label=label,label='alive')
ax.tick_params(labelsize=15)
ax.set_title('sex_pclass_survived',size=30)
ax.legend(fontsize=15,loc='best')
可见女性的绿色显著的多,更容易获救,在相同性别下,舱等级越高,获救率越高
Fare
fig=plt.figure(figsize=(8,6))
# subplot2grid places several subplots on a compact grid.
ax=plt.subplot2grid((2,2),(0,0),colspan=2)
ax.tick_params(labelsize=15)
ax.set_title('Fare dist',size=20)
# Top panel: overall fare distribution.
fare=train.Fare
sns.kdeplot(fare,ax=ax)
sns.distplot(fare,ax=ax)
ax.legend(fontsize=15)
pos=range(0,400,50)
ax.set_xticks(pos)
ax.set_xlim([0,200])
ax.set_xlabel('')
# Bottom panel: one fare density curve per passenger class.
ax1=plt.subplot2grid((2,2),(1,0),colspan=2)
for i in (1,2,3):
    class_fare=train[train.Pclass==i].Fare
    sns.kdeplot(class_fare,ax=ax1,label='Pclass %d'%(i))
ax1.set_xlim([0,200])
ax1.legend(fontsize=15)
# Fare density for the dead (red) versus the survivors (green).
fig=plt.figure(figsize=(8,3))
ax1=fig.add_subplot(111)
for outcome,name,colour in [(0,'dead','r'),(1,'alive','g')]:
    outcome_fare=train[train.Survived==outcome].Fare
    sns.kdeplot(outcome_fare,ax=ax1,label=name,color=colour)
ax1.set_xlim([0,300])
ax1.legend(fontsize=15)
ax1.set_title('Fare survived',size=20)
ax1.set_xlabel('Fare',size=15)
钱出的多的人,更容易获救
sibsp&parch
# Head counts per SibSp value (top) and per Parch value (bottom).
fig=plt.figure(figsize=(8,4))
ax1=fig.add_subplot(211)
# Pass the column as x= and the target axes explicitly: recent seaborn
# versions treat the first positional argument as `data`, not `x`.
sns.countplot(x=train.SibSp,ax=ax1)
ax1.set_title('SibSp',size=20)
# The x axis is deliberately NOT shared: both axes are categorical with
# different category sets (SibSp contains 8, Parch does not), and a shared
# axis would make both panels display the same tick labels.
ax2=fig.add_subplot(212)
sns.countplot(x=train.Parch,ax=ax2)
ax2.set_title('Parch',size=20)
大多数人没有直系亲属同行;在有亲属同行的乘客中,同代直系亲属(SibSp)以1个居多,不同代直系亲属(Parch)以1、2个居多
# Survival rate (mean of Survived) by SibSp, by Parch, and by their sum.
fig=plt.figure(figsize=(10,6))
survived=train['Survived']

ax1=fig.add_subplot(3,1,1)
rate_by_sibsp=survived.groupby(train.SibSp).mean()
rate_by_sibsp.plot.bar(ax=ax1)
ax1.set_title('Sibsp Survived Rate',size=16)
ax1.set_xlabel('')

ax2=fig.add_subplot(3,1,2)
rate_by_parch=survived.groupby(train.Parch).mean()
rate_by_parch.plot.bar(ax=ax2)
ax2.set_title('Parch Survived Rate',size=16)
ax2.set_xlabel('')

ax3=fig.add_subplot(3,1,3)
relatives=train.SibSp+train.Parch
rate_by_relatives=survived.groupby(relatives).mean()
rate_by_relatives.plot.bar(ax=ax3)
ax3.set_title('Parch+Sibsp Survived Rate',size=16)
似乎有亲属的人的获救率会比独自一人获救率高一些,但也并不是简单的线性关系
Embarked
# Number of passengers with Survived == 0, per port of embarkation.
y1=train[train.Survived==0].groupby('Embarked')['Survived'].count()
y1
Embarked
C 75
Q 47
S 427
Name: Survived, dtype: int64
# Use one of matplotlib's built-in styles for nicer looks.
plt.style.use('ggplot')
ax=plt.figure(figsize=(8,3)).add_subplot(111)
pos=[1,2,3]
# Head counts per port (sorted C, Q, S) for the dead (y1) and survivors (y2).
y1,y2=[
    train[train.Survived==outcome].groupby('Embarked')['Survived'].count().sort_index().values
    for outcome in (0,1)
]
ax.bar(pos,y1,label='dead',color='r',alpha=.4,align='center')
# Survivors are stacked on top of the dead counts.
ax.bar(pos,y2,label='alive',color='g',alpha=.4,align='center',bottom=y1)
ax.set_xticks(pos)
ax.set_xticklabels(list('CQS'))
ax.legend(loc='best',fontsize=15)
ax.set_title('Embarked survived count',size=18)
似乎从C登船的人有很高的获救率
# Age density per port of embarkation; missing ages are filled with -10 so
# that they appear as a separate bump left of the real ages.
ax=plt.figure(figsize=(8,3)).add_subplot(111)
ax.set_xlim([-20,80])
for port in ('C','Q','S'):
    port_age=train[train.Embarked==port].Age.fillna(-10)
    sns.kdeplot(port_age,ax=ax,label=port)
ax.legend(fontsize=18)
ax.set_title('Embarked Age Dist',size=18)
年龄缺省值用-10填充了,可以发现在Q登船的人很多没有年龄
# Head counts per (Embarked, Pclass) group; reset_index() turns the group
# keys back into ordinary columns before the counts are extracted.
dead_by_group=train[train.Survived==0].groupby(['Embarked','Pclass'])['Survived'].count()
alive_by_group=train[train.Survived==1].groupby(['Embarked','Pclass'])['Survived'].count()
y1=dead_by_group.reset_index()['Survived'].values
y2=alive_by_group.reset_index()['Survived'].values
ax=plt.figure(figsize=(8,3)).add_subplot(111)
pos=range(9)
ax.bar(pos,y1,label='dead',color='r',alpha=.5,align='center')
# Survivors are stacked on top of the dead counts.
ax.bar(pos,y2,label='alive',color='g',alpha=.5,align='center',bottom=y1)
ax.set_xticks(pos)
# Labels follow the sorted group order: C/1, C/2, C/3, Q/1, ...
xticklabels=['%s/%d'%(embarked_val,pclass_val)
             for embarked_val in ['C','Q','S']
             for pclass_val in range(1,4)]
ax.set_xticklabels(xticklabels,size=15)
ax.legend(fontsize=15,loc='best')
对比不同港口以及舱位来看,C容易获救是因为在头等舱的多,Q上船的人大多在三等舱,C上船的大多是一等舱和三等舱,S地上船的人最多,各个舱位都有,按等级分布的数量也符合常理
Cabin
# How many Cabin values are missing (True) versus present (False).
train.Cabin.isnull().value_counts()
True 687
False 204
Name: Cabin, dtype: int64
# Survival rate grouped by whether the Cabin value is missing.
train.groupby(by=train.Cabin.isnull())['Survived'].mean()
Cabin
False 0.666667
True 0.299854
Name: Survived, dtype: float64
客舱编号大部分为空,而且为空的获救率较低,不为空的获救率高
# Cabin zone = first character of the first cabin listed; missing cabins are
# filled with '0' and therefore end up in zone '0'.
cabin_filled=train.Cabin.fillna('0')
train['Cabin_Zone']=[cabin.split(' ')[0][0] for cabin in cabin_filled]
# Survival rate and head count per zone.
train.groupby('Cabin_Zone').Survived.agg(['mean','count'])
mean | count | |
---|---|---|
Cabin_Zone | ||
0 | 0.299854 | 687 |
A | 0.466667 | 15 |
B | 0.744681 | 47 |
C | 0.593220 | 59 |
D | 0.757576 | 33 |
E | 0.750000 | 32 |
F | 0.615385 | 13 |
G | 0.500000 | 4 |
T | 0.000000 | 1 |
不同客舱编号获救差别比较大
Ticket
# Peek at the first few ticket values.
train.Ticket.head()
0 A/5 21171
1 PC 17599
2 STON/O2. 3101282
3 113803
4 373450
Name: Ticket, dtype: object
# Number of distinct ticket values.
train.Ticket.nunique()
681
船票号没有什么明显的规律,但存在重复(891条记录中只有681个不同取值);另外,有的船票号包含英文字母,下面看一下含字母与不含字母的船票在获救率上有没有区别
def find_e_word(x):
    """Return 1 if *x* contains at least one ASCII letter, otherwise 0.

    Args:
        x: The value to inspect, normally a ticket string. Anything that is
            not a ``str`` (for example a NaN from a missing cell) is treated
            as containing no letter.

    Returns:
        int: 1 when an ASCII letter (a-z or A-Z) occurs in *x*, else 0.
    """
    import re  # regular-expression module; imported locally as before

    # Explicit type check instead of a bare ``except:`` around re.search,
    # which would also have hidden unrelated errors.
    if not isinstance(x, str):
        return 0
    return 1 if re.search('[a-zA-Z]', x) else 0
# Flag tickets that contain a letter, then compare survival rates of the two groups.
train['Ticket_e']=train.Ticket.map(find_e_word)
train.groupby('Ticket_e').Survived.mean()
Ticket_e
0 0.384266
1 0.382609
Name: Survived, dtype: float64
没有什么区别
Name
# Peek at the first few passenger names.
train.Name.head()
0 Braund, Mr. Owen Harris
1 Cumings, Mrs. John Bradley (Florence Briggs Th...
2 Heikkinen, Miss. Laina
3 Futrelle, Mrs. Jacques Heath (Lily May Peel)
4 Allen, Mr. William Henry
Name: Name, dtype: object
# Title = the text between ', ' and the following '.', e.g. 'Mr' in
# 'Braund, Mr. Owen Harris'. Plot the survival rate per title.
titles=train['Name'].map(lambda full_name: full_name.split(', ')[1].split('.')[0])
train['Survived'].groupby(titles).mean().plot()
按照上次所讲的名字头衔可以提取作为特征,接下来我们看看名字长度
# Survival rate as a function of the name length in characters.
name_length=train.Name.map(len)
train['Survived'].groupby(name_length).mean().plot()
名字越长,获救的可能性越大
练习一下可视化
seaborn参考文档