Logistic Regression on Kaggle (Titanic)
Source: 寒小阳's blog, http://blog.youkuaiyun.com/han_xiaoyang/article/details/49797143
Personal notes for revision.
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # a font that can render CJK glyphs in figures
Load the dataset. Work on copies so the original DataFrames stay untouched.
data_train1=pd.read_csv("E:/Anaconda/envs/torch160/Python_Test/Data/train.csv")
data_test1=pd.read_csv("E:/Anaconda/envs/torch160/Python_Test/Data/test.csv")
data_train=data_train1.copy()
data_test=data_test1.copy()
Basic ways to inspect the data:
data_train.info()
data_train.describe()
data_train.Survived.value_counts()  # distribution of Survived (0 = died, 1 = survived)
I. Exploratory analysis: inspect each feature and how it relates to survival
Plot the features to get a first impression.
fig = plt.figure()  # the canvas
fig.set(alpha=0.2)
# Lay several subplots out on one canvas: (2,3) splits it into 2 rows x 3 columns, (0,0) is this plot's slot
plt.subplot2grid((2,3),(0,0))
# Bar chart: the categories 0/1 become x-axis ticks, counts go on the y axis
data_train.Survived.value_counts().plot(kind='bar')
plt.title('Survived (1 = survived)')
plt.ylabel('Number of passengers')
plt.subplot2grid((2,3),(0,1))
data_train.Pclass.value_counts().plot(kind='bar')
plt.ylabel(u"人数")
plt.title(u"乘客等级分布")
plt.subplot2grid((2,3),(0,2))
plt.scatter(data_train.Survived,data_train.Age)
plt.ylabel(u"年龄")
plt.grid(b=True,which='major',axis='y')
plt.title(u"按年龄看获救分布(1为获救)")
# Density plot at grid position (1,0); colspan=2 makes it span two columns of the grid
plt.subplot2grid((2,3),(1,0),colspan=2)
data_train.Age[data_train.Pclass==1].plot(kind='kde')
data_train.Age[data_train.Pclass==2].plot(kind='kde')
data_train.Age[data_train.Pclass==3].plot(kind='kde')
plt.xlabel('Age')
plt.ylabel('Density')
plt.title('Age distribution by passenger class')
plt.legend(('1st class','2nd class','3rd class'),loc='best')
plt.subplot2grid((2,3),(1,2))
data_train.Embarked.value_counts().plot(kind='bar')
plt.title(u"各登船口岸上船人数")
plt.ylabel(u"人数")
plt.show()
Survival broken down by passenger class (x axis: class).
fig=plt.figure(2)
fig.set(alpha=0.2)
Survived_0 = data_train.Pclass[data_train.Survived == 0].value_counts()  # class counts among those who died
Survived_1 = data_train.Pclass[data_train.Survived == 1].value_counts()  # class counts among survivors
df = pd.DataFrame({'Survived':Survived_1,'Not survived':Survived_0})
df.plot(kind='bar',stacked=True)  # stacked=True stacks survived/not-survived of each class into one bar
plt.title('Survival by passenger class')
plt.xlabel('Passenger class')
plt.ylabel('Number of passengers')
plt.show()
Survival by sex.
fig=plt.figure(3)
fig.set(alpha=0.2)
Survived_m = data_train.Survived[data_train.Sex == 'male'].value_counts()
Survived_f = data_train.Survived[data_train.Sex == 'female'].value_counts()
df = pd.DataFrame({'male':Survived_m,'female':Survived_f})  # the columns become the legend entries
df.plot(kind='bar',stacked=True)
plt.title('Survival by sex')
plt.xlabel('Survived (0/1)')
plt.ylabel('Number of passengers')
plt.show()
Survival by class and sex together, a joint view of two features; a crosstab cross-check follows the plots.
fig = plt.figure()
fig.set(alpha=0.95)
plt.title("根据舱等级和性别的获救情况")
ax1 = fig.add_subplot(141)
#筛选三个条件,但显示是Survived
data_train.Survived[data_train.Sex == 'female'][data_train.Pclass!=3].value_counts().plot(kind='bar',label='female highclass',color='#FA2479')
ax1.set_xticklabels(["获救","未获救"],rotation=0)
ax1.legend(["女性/高级舱"],loc='best')
ax2 = fig.add_subplot(1,4,2,sharey=ax1)
data_train.Survived[data_train.Sex=='female'][data_train.Pclass == 3].value_counts().plot(kind='bar',label='female lowclass',color='pink')
ax2.set_xticklabels(["未获救","获救"],rotation=0)
ax2.legend(["女性/低级舱"],loc='best')
ax3 = fig.add_subplot(1,4,3,sharey=ax1)
data_train.Survived[data_train.Sex=='male'][data_train.Pclass !=3].value_counts().plot(kind='bar',label='male,high,class',color='lightblue')
ax3.set_xticklabels(["未获救","获救"],rotation=0)
ax3.legend(["男性/高级舱"],loc='best')
ax4 = fig.add_subplot(1,4,4,sharey=ax1)
data_train.Survived[data_train.Sex=='male'][data_train.Pclass ==3].value_counts().plot(kind='bar',label='male,low,class',color='lightblue')
ax4.set_xticklabels(["未获救","获救"],rotation=0)
ax4.legend(["男性/低级舱"],loc='best')
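The same joint breakdown can be read off a table; a one-line cross-check with pandas' crosstab (my addition, not in the original post):
print(pd.crosstab([data_train.Sex, data_train.Pclass], data_train.Survived, margins=True))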
Survival by port of embarkation.
# survival counts per embarkation port
fig=plt.figure()
fig.set(alpha=0.2)
Survived_0 = data_train.Embarked[data_train.Survived == 0].value_counts()
Survived_1 = data_train.Embarked[data_train.Survived != 0].value_counts()
df = pd.DataFrame({'Survived':Survived_1,'Not survived':Survived_0})
df.plot(kind='bar',stacked=True)
plt.title('Survival by port of embarkation')
plt.xlabel('Port of embarkation')
plt.ylabel('Number of passengers')
plt.show()
Group on two columns (i.e., for each value of one feature, look at the distribution of the other) as a first pass at feature screening.
g = data_train.groupby(['SibSp','Survived'])  # group by the two columns
df = pd.DataFrame(g.count()['PassengerId'])  # keep only the PassengerId count per group
print(df)
g = data_train.groupby(['Parch','Survived'])
df = pd.DataFrame(g.count()['PassengerId'])
print(df)
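The MultiIndexed count tables above are easier to compare once Survived is pivoted into columns; a small reshaping sketch (my addition):
print(data_train.groupby(['SibSp','Survived']).size().unstack(fill_value=0))
print(data_train.groupby(['Parch','Survived']).size().unstack(fill_value=0))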
Survival by whether a Cabin value is recorded.
fig = plt.figure()
fig.set(alpha=0.2)
Survived_cabin = data_train.Survived[pd.notnull(data_train.Cabin)].value_counts()
Survived_nocabin = data_train.Survived[pd.isnull(data_train.Cabin)].value_counts()
df = pd.DataFrame({'No cabin':Survived_nocabin,'Has cabin':Survived_cabin}).transpose()  # transpose so cabin presence becomes the x axis
df.plot(kind='bar',stacked=True)
plt.title('Survival by presence of Cabin')
plt.xlabel('Cabin recorded')
plt.ylabel('Number of passengers')
plt.show()
II. Feature preprocessing: fill in missing values and transform features.
Use a RandomForestRegressor to fill in the missing Age values, and collapse Cabin to 'Yes'/'No' by presence.
# Use scikit-learn's RandomForest to fit the missing Age values
from sklearn.ensemble import RandomForestRegressor

## Fill in missing Age values with a RandomForestRegressor
def set_missing_ages(df):
    # Take the numeric features and feed them to the regressor
    age_df = df[['Age','Fare','Parch','SibSp','Pclass']]
    # Split passengers into known-age and unknown-age groups
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values
    # y is the target: Age
    y = known_age[:,0]
    # x holds the remaining feature values
    x = known_age[:,1:]
    # Fit the RandomForestRegressor
    rfr = RandomForestRegressor(random_state=0,n_estimators=2000,n_jobs=-1)
    rfr = rfr.fit(x,y)
    # Predict the unknown ages with the fitted model
    predictedAges = rfr.predict(unknown_age[:,1:])
    # Write the predictions back into the missing slots
    df.loc[(df.Age.isnull()),'Age'] = predictedAges
    return df, rfr

def set_Cabin_type(df):
    df.loc[(df.Cabin.notnull()),'Cabin'] = 'Yes'
    df.loc[(df.Cabin.isnull()),'Cabin'] = 'No'
    return df
data_train,rfr = set_missing_ages(data_train)
data_train = set_Cabin_type(data_train)
One-hot encode the categorical features 'Cabin', 'Embarked', 'Sex', and 'Pclass'.
dummies_Cabin = pd.get_dummies(data_train['Cabin'],prefix='Cabin')
dummies_Embarked = pd.get_dummies(data_train['Embarked'],prefix='Embarked')
dummies_Sex = pd.get_dummies(data_train['Sex'], prefix= 'Sex')
dummies_Pclass = pd.get_dummies(data_train['Pclass'], prefix= 'Pclass')
df = pd.concat([data_train,dummies_Cabin,dummies_Embarked,dummies_Sex,dummies_Pclass],axis=1)  # concatenate the dummy columns; the original Cabin etc. are still present
df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'],axis=1,inplace=True)  # now drop the original columns
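For intuition, this is what get_dummies produces on a toy Series (illustration only, not part of the pipeline):
print(pd.get_dummies(pd.Series(['S','C','Q','S']), prefix='Embarked'))
# -> columns Embarked_C, Embarked_Q, Embarked_S, with exactly one 1 per row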
Age and Fare vary over much wider numeric ranges than the dummy features; unscaled, they would dominate the model, so standardize them to a common scale.
# Use scikit-learn's preprocessing module to scale Age and Fare
from sklearn.preprocessing import StandardScaler
# Note: df['Age'] is 1-D with shape (891,), while df[['Age']] is 2-D with shape (891, 1); StandardScaler expects 2-D input
# Fit one scaler per column on the training data only, so the identical transform can be reused on the test set
age_scale_param = StandardScaler().fit(df[['Age']])
df['Age_scaled'] = age_scale_param.transform(df[['Age']])
fare_scale_param = StandardScaler().fit(df[['Fare']])
df['Fare_scaled'] = fare_scale_param.transform(df[['Fare']])
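As a quick check of what StandardScaler actually computes (my addition; note it uses the population standard deviation, ddof=0):
assert np.allclose(df['Age_scaled'], (df['Age'] - df['Age'].mean()) / df['Age'].std(ddof=0))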
III. Modeling
Fit a logistic regression model on the training set.
from sklearn import linear_model
# Pull out the feature columns we need, convert to a numpy array, and fit scikit-learn's LogisticRegression
# Select the columns with a regex
train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
train_np = train_df.values
# y is the Survived target
y = train_np[:, 0]
# X holds the feature values
X = train_np[:, 1:]
# Fit the LogisticRegression model
clf = linear_model.LogisticRegression(C=1.0, penalty='l2', tol=1e-6)
clf = clf.fit(X, y)  # the model is now trained
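Under the hood, logistic regression models P(y=1|x) = 1 / (1 + e^(-(w·x + b))); a short sketch verifying this against scikit-learn's own probabilities (my addition):
proba_manual = 1.0 / (1.0 + np.exp(-clf.decision_function(X)))  # sigmoid of the linear score w·x + b
print(np.allclose(proba_manual, clf.predict_proba(X)[:, 1]))    # True: identical probabilities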
Now use the model on the test set; but first the test set must get the same preprocessing as the training set.
# "test_data" has to go through exactly the same preprocessing as "train_data"!
data_test.loc[ (data_test.Fare.isnull()), 'Fare' ] = 0  # one test row is missing Fare; fill it (simply with 0 here)
# Apply the same feature transforms to test_data as to train_data
# First, fill the missing ages with the RandomForestRegressor fitted on the training set
tmp_df = data_test[['Age','Fare', 'Parch', 'SibSp', 'Pclass']]
null_age = tmp_df[data_test.Age.isnull()].values
# Predict the missing ages from the features X and fill them in
X = null_age[:, 1:]
predictedAges = rfr.predict(X)
data_test.loc[ (data_test.Age.isnull()), 'Age' ] = predictedAges
data_test = set_Cabin_type(data_test)
# Same one-hot encoding as on the training set
dummies_Cabin = pd.get_dummies(data_test['Cabin'],prefix = 'Cabin')
dummies_Embarked = pd.get_dummies(data_test['Embarked'],prefix = 'Embarked')
dummies_Sex = pd.get_dummies(data_test['Sex'], prefix= 'Sex')
dummies_Pclass = pd.get_dummies(data_test['Pclass'], prefix= 'Pclass')
df_test = pd.concat([data_test,dummies_Cabin,dummies_Embarked, dummies_Sex, dummies_Pclass],axis=1)
df_test.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'],axis=1,inplace = True)
df_test['Age_scaled'] = age_scale_param.transform(df_test[['Age']])    # reuse the scalers fitted on the training set
df_test['Fare_scaled'] = fare_scale_param.transform(df_test[['Fare']]) # (do not refit on test data)
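A common pitfall (not triggered by this dataset, but worth guarding against): if the test set lacks some category level, get_dummies yields fewer columns than in training. A defensive sketch (my addition):
expected = train_df.columns.drop('Survived')        # feature columns seen during training
for col in expected.difference(df_test.columns):    # dummy columns absent from the test set
    df_test[col] = 0                                # add them as all-zero columns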
Use the trained model to predict Survived on the test set.
test=df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
predictions = clf.predict(test)
result=pd.DataFrame({'PassengerId':data_test['PassengerId'].values,'Survived':predictions.astype(np.int32)})
# save the result as a submission file
result.to_csv("E:/Anaconda/envs/torch160/Python_Test/Data/logistic_regression_predictions.csv", index=False)
IV. The model above is only a baseline; now optimize it.
As a first step, score the existing baseline with cross-validation so there is a reference point for later comparisons. Here 5-fold cross-validation splits the training set into 5 parts: each part serves once as the validation fold while the other 4 train the model, giving 5 scores. The baseline above skipped this check.
########################################
# The code above is the baseline model; optimization starts here
from sklearn.model_selection import cross_val_score as cvs
# Quick look at the scores via cross-validation
clf = linear_model.LogisticRegression(C=1.0,penalty='l2',tol=1e-6)
# Select the relevant training columns (a regex filter, as before)
all_data = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
x=all_data.values[:,1:]
y = all_data.values[:,0]
print(cvs(clf, x, y, cv=5))
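The five fold scores are easier to compare as one summary number (my addition):
scores = cvs(clf, x, y, cv=5)
print('CV accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))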
from sklearn.model_selection import train_test_split
### train_test_split on a single DataFrame returns two pieces; here the training set is split into a training part (70%) and a validation part (30%)
split_train,split_cv = train_test_split(df,test_size=0.3,random_state=0)
train_df = split_train.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # regex column filter again
# Build the logistic regression model
clf = linear_model.LogisticRegression(C=1.0,penalty='l2',tol=1e-6)
clf.fit(train_df.values[:,1:],train_df.values[:,0])
# Apply the same column filter to the validation split, leaving the split itself untouched
cv_df = split_cv.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
# Predict on the validation split with the trained model
predictions = clf.predict(cv_df.values[:,1:])
# Copy the original, unprocessed training data
origin_data_train = data_train1.copy()
# Find the validation rows the model got wrong, then look them up in the original (unprocessed) training data by PassengerId
bad_cases = origin_data_train.loc[origin_data_train['PassengerId'].isin(split_cv[predictions != cv_df.values[:,0]]['PassengerId'].values)]
bad_cases  # the misclassified cases
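A single accuracy number on the held-out split (my addition) complements the case-by-case inspection:
from sklearn.metrics import accuracy_score
print(accuracy_score(cv_df.values[:,0], predictions))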
Plot learning curves to compare the scores on the training set and the cross-validation folds (i.e., how well the model fits vs. generalizes).
# Use sklearn's learning_curve to get training_score and cv_score, then draw the learning curve with matplotlib
from sklearn.model_selection import learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):
    """
    Plot the learning curve of a model on the given data.
    Parameters
    ----------
    estimator : the classifier to evaluate
    title : title of the plot
    X : input features as a numpy array (e.g. x = all_data.values[:,1:] above)
    y : target vector (e.g. y = all_data.values[:,0] above)
    ylim : tuple (ymin, ymax) fixing the y-axis range of the plot
    cv : number of cross-validation folds; each fold serves once as the validation
         set while the rest train the model (None uses the library default)
    verbose : verbosity level; higher means more messages
    n_jobs : number of parallel jobs (default 1)
    """
    # learning_curve does the work: train_sizes sets the training-set sizes to evaluate
    # (i.e. the number of points on the curve); it returns train and validation scores per size
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel('Number of training samples')
        plt.ylabel('Score')
        # gca = Get Current Axes; invert_yaxis() flips the y axis so the curve reads top-down
        plt.gca().invert_yaxis()
        plt.grid()
        # fill_between shades the band between two curves: the first argument gives the x
        # coordinates, the next two the lower and upper y curves (mean +/- one std)
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="b")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="r")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label='training score')
        plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label='cross-validation score')
        plt.legend(loc="best")
        # draw() re-renders the current figure; rarely needed outside interactive mode
        plt.draw()
        plt.show()
    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff
plot_learning_curve(clf, 'Learning curve', x, y)
Produce the final result with bagging (bagging is one form of ensemble learning, worth further study).
When combining its base learners, bagging can use simple majority voting: predict the class that most base classifiers choose, breaking ties at random.
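A minimal numpy sketch of majority voting (my illustration, not part of the pipeline; note a hard 0.5 threshold resolves ties toward 0 rather than at random):
votes = np.array([[1, 0, 1],   # rows: samples; columns: predictions of 3 base classifiers
                  [0, 0, 1]])
majority = (votes.mean(axis=1) > 0.5).astype(int)  # average the 0/1 votes, then threshold
print(majority)  # [1 0]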
# Use scikit-learn's Bagging to implement the idea above
from sklearn.ensemble import BaggingRegressor
# Note: Mother|Child|Family|Title refer to engineered features from later steps of the original
# blog post; they do not exist here, so the regex filter simply never matches them
train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
train_np = train_df.values
# y is the Survived target
y = train_np[:,0]
# x holds the feature values
x = train_np[:,1:]
# clf is the logistic regression base estimator fed into BaggingRegressor
clf = linear_model.LogisticRegression(C=1.0,penalty='l2',tol=1e-6)
# bagging_clf is the bagging model: 20 base estimators (n_estimators=20), each trained on a
# random draw of 80% of the samples (max_samples=0.8) using all features (max_features=1.0);
# bootstrap=True draws samples with replacement, bootstrap_features=False leaves features undrawn
bagging_clf = BaggingRegressor(clf,n_estimators=20,max_samples=0.8,max_features=1.0,bootstrap=True,bootstrap_features=False,n_jobs=-1)
# Train the bagging model
bagging_clf.fit(x,y)
test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
# BaggingRegressor averages the 0/1 predictions of its base estimators, so the output is a
# fraction; threshold at 0.5 rather than truncating with astype, which would floor e.g. 0.9 to 0
predictions = bagging_clf.predict(test)
result = pd.DataFrame({'PassengerId':data_test['PassengerId'].values,'Survived':(predictions > 0.5).astype(np.int32)})
result.to_csv("E:/Anaconda/envs/torch160/Python_Test/Data/logistic_regression_bagging_predictions.csv", index=False)