# Titanic 学习心得 (Titanic study notes)
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 16 17:58:13 2018
@author: 懒狼狼7号
"""
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 12 08:47:07 2018
@author: 懒狼狼7号
"""
import numpy as np
from sklearn import svm
from sklearn import preprocessing
import pandas as pd
#import numpy as np
from sklearn.model_selection import GridSearchCV
# Load the Titanic train/test CSVs and stack their feature columns so that the
# preprocessing steps that follow are applied to both sets identically.
# The paths use forward slashes, which resolve on Windows and POSIX alike; the
# previous './all\train.csv' form contained a literal backslash and only
# worked on Windows.
df_train = pd.read_csv('./all/train.csv')
label = np.array(df_train.pop('Survived'))    # target vector, removed from the features
Id_train = df_train.pop('PassengerId')        # identifier, removed from the features

df_test = pd.read_csv('./all/test.csv')
Id_test = df_test.pop('PassengerId')          # identifier, removed from the features
# Missing fares in the test set are replaced by the fixed value 10.
df_test.loc[df_test.Fare.isnull(), 'Fare'] = 10

# Train rows come first, test rows second; each frame keeps its own index labels.
df_combine = pd.concat([df_train, df_test], sort=False)
# Replace Cabin by a binary integer indicator:
#   0 -> no cabin recorded and the fare is at most fare_0
#   1 -> every other passenger (cabin recorded, or no cabin but a higher fare)
# The column is written in one step as integers, so no intermediate -1
# placeholder is left behind and the column no longer has object dtype.
no_cabin = df_combine.Cabin.isnull()
fare_0 = df_combine.loc[no_cabin, 'Fare'].mean()     # mean fare of passengers without a cabin
fare_1 = df_combine.loc[~no_cabin, 'Fare'].mean()    # mean fare of passengers with a cabin
# NOTE(review): fare_1 is computed but not used. The original comment said the
# upper group was meant to start at fare_1, while the code compared against
# fare_0 for both groups; the code's behaviour is what is kept here - confirm intent.
cheap_without_cabin = no_cabin & (df_combine.Fare <= fare_0)
df_combine['Cabin'] = np.where(cheap_without_cabin, 0, 1)
# Fill missing ages with the mean age of the passengers that share the same
# title in their name.
# Titles are matched literally (regex=False). The earlier patterns were
# regular expressions whose trailing '.' matched any character, so 'Mr.' also
# matched 'Mrs.': married women were filled with the mixed Mr/Mrs mean and the
# dedicated 'Mrs.' step never had anything left to fill.
title_mean_age = {}
for title in ('Mr.', 'Mrs.', 'Miss.', 'Master.', 'Dr.'):
    has_title = df_combine.Name.str.contains(title, regex=False)
    # Series.mean() skips missing values, so only known ages contribute.
    title_mean_age[title] = df_combine.loc[has_title, 'Age'].mean()
    df_combine.loc[has_title & df_combine.Age.isnull(), 'Age'] = title_mean_age[title]

# Keep the individual means available under their established names.
age_mr = title_mean_age['Mr.']
age_mrs = title_mean_age['Mrs.']
age_miss = title_mean_age['Miss.']
age_ma = title_mean_age['Master.']
age_dr = title_mean_age['Dr.']

# Anyone still without an age (none of the titles above matched) gets the 'Mr.' mean.
df_combine.loc[df_combine.Age.isnull(), 'Age'] = age_mr
# Name and Ticket are not used as features. The axis is given by keyword
# (columns=...) because pandas 2.0 no longer accepts it positionally.
df_combine.drop(columns=['Name', 'Ticket'], inplace=True)

# Encode the categorical columns as numbers: male -> 0, female -> 1 and
# S -> 0, C -> 1, Q -> 2.
df_combine['Sex'] = df_combine['Sex'].map({'male': 0, 'female': 1})
embarked_code = df_combine['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
# Missing ports: a single random code from {0, 1, 2} is drawn once and shared
# by every missing row, so the result varies between runs unless NumPy's
# global random state is seeded beforehand.
df_combine['Embarked'] = embarked_code.fillna(np.random.randint(0, 3))

# Split the processed rows back into the train part (first) and the test part
# (rest). The boundary is taken from the training frame instead of a
# hard-coded row count.
n_train = len(df_train)
df_train_1 = df_combine.iloc[:n_train]
df_test_1 = df_combine.iloc[n_train:]
# =============================================================================
# train with SVM
# =============================================================================
# Build an explicit float matrix so the arithmetic and the normaliser below do
# not depend on implicit coercion of object-typed columns.
x = np.array(df_train_1, dtype=float)
# Up-weight feature column 2 by 10% before the row normalisation.
# NOTE(review): column 2 is presumably Age given the usual Titanic column
# order - confirm against the CSV header.
x[:, 2] = x[:, 2] * 1.1
x = preprocessing.normalize(x, norm='l2')    # rescale every sample (row) to unit L2 norm

# Search C for a linear-kernel SVM using 3-fold cross-validated accuracy.
# The searched estimator uses the same class_weight as the model fitted below;
# previously C was tuned on an unweighted SVC and then reused on a balanced one.
tuned_parameters = [{'kernel': ['linear'],
                     'C': np.arange(5, 100, 0.2)}]
gsearch1 = GridSearchCV(estimator=svm.SVC(class_weight='balanced'),
                        param_grid=tuned_parameters, scoring='accuracy', cv=3)
gsearch1.fit(x, label.ravel())
print(gsearch1.best_params_)

# Fit the final configuration on the first 600 rows only; the rows from 600 on
# are not used for this fit.
x_train = x[0:600]
y_train = label[0:600]
clf = svm.SVC(C=gsearch1.best_params_['C'], kernel='linear', class_weight='balanced')
clf.fit(x_train, y_train.ravel())
# =============================================================================
def show_accuracy(y_predict, y_true):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are flattened to one dimension before they are compared, so a
    column vector of shape (n, 1) and a flat vector of shape (n,) are compared
    element by element instead of being broadcast against each other into an
    n x n matrix (which silently produced a wrong rate).

    Args:
        y_predict: Array-like of predicted labels (NumPy array, list, ...).
        y_true: Array-like of reference labels with the same number of items.

    Returns:
        The share of matching positions as a float between 0 and 1, or NaN
        when both inputs are empty.

    Raises:
        ValueError: If the two inputs do not hold the same number of items.
    """
    predicted = np.asarray(y_predict).ravel()
    actual = np.asarray(y_true).ravel()
    if predicted.size != actual.size:
        raise ValueError(
            'y_predict and y_true must have the same number of elements '
            '(got %d and %d)' % (predicted.size, actual.size)
        )
    if predicted.size == 0:
        # 0 / 0 in the hand-rolled ratio; return NaN explicitly, without a warning.
        return float('nan')
    return float(np.mean(predicted == actual))
# Hold-out check. The classifier was fitted on rows [0, 600), so the rows from
# 600 onwards were not seen during that fit. The slices are open-ended instead
# of stopping at a hard-coded row count.
y_hat_1 = clf.predict(x[600:])
acc_test = show_accuracy(y_hat_1, label[600:])    # accuracy on the rows not used for fitting
y_hat_2 = clf.predict(x[:600])
acc = show_accuracy(y_hat_2, label[:600])         # accuracy on the rows used for fitting
# =============================================================================
# Predict on the test set
# =============================================================================
x_test = np.array(df_test_1, dtype=float)
# Apply the same 1.1 scaling of column 2 that the training matrix x receives
# before normalisation; without it the model scores features that are scaled
# differently from the ones it was fitted on.
x_test[:, 2] = x_test[:, 2] * 1.1
x_test = preprocessing.normalize(x_test, norm='l2')    # unit L2 norm per sample (row)

clf.fit(x, label.ravel())    # refit on all labelled rows before predicting
y_test = clf.predict(x_test)

# One row per test passenger. The frame is built from plain arrays so the
# pairing of id and prediction is positional and does not depend on index labels.
df = pd.DataFrame({'PassengerId': np.asarray(Id_test).ravel(),
                   'Survived': y_test},
                  columns=['PassengerId', 'Survived'])
# Forward slash so the path also resolves outside Windows; no index column is written.
df.to_csv('./all/result.csv', index=False)