Titanic 学习心得（代码向）

最新推荐文章于 2022-08-29 15:03:49 发布

原创 · 最新推荐文章于 2022-08-29 15:03:49 发布 · 346 阅读

2 ·

CC 4.0 BY-SA版权

文章标签：

#Titanic 学习心得

python pandas 专栏收录该内容

1 篇文章

订阅专栏

本文介绍了一种使用支持向量机（SVM）进行Titanic生存预测的方法，通过详细的特征工程，包括处理缺失值、类别特征编码等，提升了模型的预测准确性。

Titanic 学习心得

# -*- coding: utf-8 -*-
"""
Created on Sun Dec 16 17:58:13 2018

@author: 懒狼狼7号
"""

# -*- coding: utf-8 -*-
"""
Created on Wed Dec 12 08:47:07 2018

@author: 懒狼狼7号
"""

import numpy as np
from sklearn import svm
from sklearn import preprocessing
import pandas as pd
#import numpy as np
from sklearn.model_selection import GridSearchCV

## Load the Titanic train/test CSV files and merge their feature columns
# The paths use forward slashes: they resolve on Windows as well as on POSIX
# systems, whereas './all\train.csv' only names a file inside ./all on Windows.
df_train=pd.read_csv('./all/train.csv')
label=df_train.pop('Survived')          # target column, removed from the feature frame
label=np.array(label)
Id_train=df_train.pop('PassengerId')    # popped so it is not part of the feature frame

df_test=pd.read_csv('./all/test.csv')
Id_test=df_test.pop('PassengerId')      # popped so it is not part of the feature frame
df_test.loc[pd.isnull(df_test.Fare),'Fare']=10   # replace missing test fares with the constant 10

# Stack the train rows on top of the test rows so that both receive the same
# preprocessing; each frame keeps its own row labels (no ignore_index).
df_combine=pd.concat([df_train,df_test],sort=False)

# Turn Cabin into a numeric flag. Passengers with a recorded cabin get 1;
# passengers without one are assigned 0 or 1 depending on the fare they paid.
cabin_missing=df_combine.Cabin.isnull()
cabin_known=df_combine.Cabin.notnull()
fares_without_cabin=df_combine[cabin_missing].Fare
fares_with_cabin=df_combine[cabin_known].Fare
fare_0=fares_without_cabin.sum()/fares_without_cabin.count()    # mean fare of passengers with no cabin recorded
fare_1=fares_with_cabin.sum()/fares_with_cabin.count()          # mean fare of passengers with a cabin recorded

df_combine.loc[cabin_known,'Cabin']=1
df_combine.loc[cabin_missing,'Cabin']=-1    # temporary marker, resolved just below

# No cabin recorded and fare at or below fare_0 -> 0
unresolved=df_combine.Cabin==-1
df_combine.loc[unresolved&(df_combine.Fare<=fare_0),'Cabin']=0
# Still unresolved and fare at or above fare_0 -> 1
# NOTE(review): the original comment referred to fare_1 here, but fare_0 is the
# threshold actually applied; fare_1 is not used anywhere in the visible code.
unresolved=df_combine.Cabin==-1
df_combine.loc[unresolved&(df_combine.Fare>=fare_0),'Cabin']=1

# Fill missing ages with the mean known age of passengers carrying the same
# title in their name.
# Titles are matched as literal substrings (regex=False). Interpreted as a
# regular expression the "." matches any character, so 'Mr.' also matched
# 'Mrs.' and 'Dr.' matched any name containing "Dr" plus one more character;
# that mixed the groups and left the later 'Mrs.' fill with nothing to do.
is_mr=df_combine.Name.str.contains('Mr.',regex=False)
df2=df_combine[is_mr&df_combine.Age.notnull()]    # "Mr." passengers with a known age
age_mr=df2.Age.sum()/df2.Age.count()
df_combine.loc[is_mr&df_combine.Age.isnull(),'Age']=age_mr

is_mrs=df_combine.Name.str.contains('Mrs.',regex=False)
df3=df_combine[is_mrs&df_combine.Age.notnull()]    # "Mrs." passengers with a known age
age_mrs=df3.Age.sum()/df3.Age.count()
df_combine.loc[is_mrs&df_combine.Age.isnull(),'Age']=age_mrs

is_miss=df_combine.Name.str.contains('Miss.',regex=False)
df4=df_combine[is_miss&df_combine.Age.notnull()]    # "Miss." passengers with a known age
age_miss=df4.Age.sum()/df4.Age.count()
df_combine.loc[is_miss&df_combine.Age.isnull(),'Age']=age_miss

is_master=df_combine.Name.str.contains('Master.',regex=False)
df5=df_combine[is_master&df_combine.Age.notnull()]    # "Master." passengers with a known age
age_ma=df5.Age.sum()/df5.Age.count()
df_combine.loc[is_master&df_combine.Age.isnull(),'Age']=age_ma

is_dr=df_combine.Name.str.contains('Dr.',regex=False)
df6=df_combine[is_dr&df_combine.Age.notnull()]    # "Dr." passengers with a known age
age_dr=df6.Age.sum()/df6.Age.count()
df_combine.loc[is_dr&df_combine.Age.isnull(),'Age']=age_dr

# Any passenger whose age is still missing (none of the titles above matched)
# falls back to the "Mr." mean.
df_combine.loc[df_combine.Age.isnull(),'Age']=age_mr




# Name and Ticket are not used as features from here on. `axis` is passed by
# keyword because pandas 2.0 no longer accepts it positionally in
# DataFrame.drop.
df_combine.drop(['Name','Ticket'],axis=1,inplace=True)

# Encode Sex: male -> 0, female -> 1
df_combine.loc[df_combine.Sex=='male','Sex']=0
df_combine.loc[df_combine.Sex=='female','Sex']=1

# Encode Embarked: S -> 0, C -> 1, Q -> 2
df_combine.loc[df_combine.Embarked=='S','Embarked']=0
df_combine.loc[df_combine.Embarked=='C','Embarked']=1
df_combine.loc[df_combine.Embarked=='Q','Embarked']=2
# All rows with a missing Embarked receive the same single code drawn at random
# from {0, 1, 2}; the draw is unseeded, so it can differ between runs.
df_combine.loc[pd.isnull(df_combine.Embarked),'Embarked']=np.random.randint(0,3)

label=np.array(label)   # re-wraps the target as an ndarray (kept from the original script)

## Split the processed frame back into its train and test parts
# df_combine was built by stacking df_train on top of df_test, so a positional
# split at the number of training rows recovers both parts. Deriving the
# boundary from df_train replaces the hard-coded 891/1309 row counts, which
# were only valid for one specific pair of input files.
n_train=len(df_train)
df_train_1=df_combine.iloc[:n_train]
df_test_1=df_combine.iloc[n_train:]


# =============================================================================
#   Train with SVM
# =============================================================================
x=np.array(df_train_1)
x[:,2]=x[:,2]*1.1   # weight column 2 by 1.1 before normalisation
# (A MinMaxScaler variant was tried here and left disabled in the original.)

x=preprocessing.normalize(x,norm='l2')   # normalise x with the L2 norm

# Grid-search C for a linear kernel over 5, 5.2, ... (step 0.2, below 100),
# scored by accuracy with 3-fold cross-validation.
# NOTE(review): the estimator searched here has no class_weight, while the
# final classifier below uses class_weight='balanced' - confirm this is intended.
param_grid_linear={'kernel':['linear'],'C':np.mgrid[5:100:0.2]}
tuned_parameters=[param_grid_linear]
gsearch1=GridSearchCV(svm.SVC(),tuned_parameters,scoring='accuracy',cv=3)
gsearch1.fit(x,label.ravel())
print(gsearch1.best_params_)

# Fit the classifier on the first 600 rows only; the remaining rows are left
# out of this fit.
best_c=gsearch1.best_params_['C']
x_train,y_train=x[:600],label[:600]
clf=svm.SVC(kernel='linear',C=best_c,class_weight='balanced')
clf.fit(x_train,y_train.ravel())

#score=clf.score(x_train,y_train)



# =============================================================================
def show_accuracy(y_predict,y_true):
    """Return the fraction of positions where the two arrays agree.

    A position counts as a match when ``y_predict - y_true`` equals 0 there.
    Both arguments must support element-wise subtraction (e.g. NumPy arrays
    of the same shape).
    """
    is_match=(y_predict-y_true)==0
    n_match=is_match.sum()
    n_miss=(~is_match).sum()
    return n_match/(n_match+n_miss)

# Accuracy on rows 600 up to 891, which were not part of the 600-row fit,
# and on the first 600 rows themselves.
holdout_rows=slice(600,891)
fitted_rows=slice(0,600)

y_hat_1=clf.predict(x[holdout_rows])
acc_test=show_accuracy(y_hat_1,label[holdout_rows])

y_hat_2=clf.predict(x[fitted_rows])
acc=show_accuracy(y_hat_2,label[fitted_rows])

# =============================================================================
# Predict on the test set
# =============================================================================

x_test=np.array(df_test_1[:])
# Apply the same 1.1 weighting to column 2 that the training matrix x received
# before normalisation; without it the model is fitted on and queried with
# differently scaled features.
x_test[:,2]=x_test[:,2]*1.1

x_test=preprocessing.normalize(x_test, norm='l2')   # same L2 normalisation as for x

clf.fit(x,label.ravel())    # refit on every training row before predicting
y_test=clf.predict(x_test)

y_test=pd.DataFrame(y_test)
Id_test=pd.DataFrame(Id_test)

# Put the passenger ids next to the predictions and name the prediction column.
df=pd.concat([Id_test,y_test],axis=1)
df.rename(columns={0:'Survived'},inplace=True)
# Forward slash so the file lands inside ./all on every platform; index=False
# is the documented way to omit the row index.
df.to_csv('./all/result.csv',index=False)