阿里云 快来一起挖掘幸福感!项目实战
项目地址
1,数据准备
问卷调查数据主要包含的个人信息有职业,婚姻状况,收入,学历等40个features,label就是幸福感。
首先对数据进行预处理
(一) 第二列(happiness)是标签,先将其独立抽取出来。
(二) 第七列是时间,本身对结果影响不大,又由于是字符串,暂时删除不用。
(三) 数据本身包含20197个缺失值,需要对缺失值进行填补。
2,训练和预测
给定的数据包含train和test两部分,先将train划分为两部分,对建立的模型进行训练和评分,最后对test进行预测。
3,代码实战
3.1,进行数据预处理
import pandas as pd
import numpy as np
# Read the training and test survey tables (both files are GB2312-encoded).
datatrain = pd.read_csv('happiness_train_complete.csv', encoding="gb2312")
datatest = pd.read_csv('happiness_test_complete.csv', encoding="gb2312")

# Drop training rows whose happiness answer is the -8 code.  `dataplot` is an
# independent copy of the filtered rows that keeps the id and happiness
# columns, which are removed from `datatrain` just below.
datatrain = datatrain[datatrain["happiness"] != -8].reset_index(drop=True)
dataplot = datatrain.copy()

target_col = "happiness"
target = datatrain[target_col]

# The id column is not used as a feature in either table.
datatrain.drop(columns=['id'], inplace=True)
datatest.drop(columns=['id'], inplace=True)

# Separate the label from the training features.
label = datatrain.pop('happiness')

# Stack train and test so every feature transformation is applied to both.
dataproc = pd.concat([datatrain, datatest], ignore_index=True)

# Shift survey_type down by one (original note: "becomes 0-1").
dataproc['survey_type'] = dataproc['survey_type'] - 1
# Mean happiness for each province code 1..31, computed on the filtered
# training rows held in `dataplot`.
count = [
    dataplot.loc[dataplot['province'] == code, 'happiness'].mean()
    for code in range(1, 32)
]
# A province code with no rows yields NaN; use 3 for those.
count = [3 if pd.isnull(avg) else avg for avg in count]
#plt.scatter(range(1,32),count)

# Split the province codes into three bands by mean happiness.  The middle
# band is closed on the left (3.2 <= mean) so that a mean of exactly 3.2 is
# assigned to a band; with a strict "3.2 <" it belonged to none of the three.
reg1 = [code for code, avg in enumerate(count, start=1) if avg < 3.2]
reg2 = [code for code, avg in enumerate(count, start=1) if 3.2 <= avg < 3.9]
reg3 = [code for code, avg in enumerate(count, start=1) if avg >= 3.9]
def spl(x):
    """Return 0 when province code *x* is in the fixed list below, else 1.

    Used to derive the binary ``province_1`` feature.
    """
    return 0 if x in (2, 3, 8, 13, 14, 20, 23, 25, 26, 30) else 1
def spl1(x):
    """Return the index (0, 1 or 2) of the region list that contains *x*.

    The module-level lists ``reg1``, ``reg2`` and ``reg3`` are checked in
    that order.  Returns None when *x* is in none of them.
    """
    for band, members in enumerate((reg1, reg2, reg3)):
        if x in members:
            return band
    return None
# Two new province-derived columns: the binary split from spl() and the
# three-way band from spl1().
dataproc['province_1'] = dataproc['province'].map(spl)
dataproc['province_2'] = dataproc['province'].map(spl1)

# Shift gender down by one (original note: "becomes 0-1").
dataproc['gender'] = dataproc['gender'] - 1

# Age = survey year (first four characters of survey_time) minus birth year.
dataproc['age'] = (
    dataproc['survey_time'].map(lambda stamp: int(stamp[:4]))
    - dataproc['birth']
)

# Negative nationality codes are replaced by 1 before one-hot encoding.
dataproc.loc[dataproc['nationality'] < 0, 'nationality'] = 1
dataproc = dataproc.join(
    pd.get_dummies(dataproc["nationality"], prefix="nationality")
)
def nation(x):
    """Return 1 when nationality code *x* equals 1, otherwise 0.

    Used for the ``nationality1`` indicator (original note: "is Han or not").
    """
    return int(x == 1)
# New feature (original note: "is Han or not"); the raw nationality column is
# removed once the indicator has been derived from it.
dataproc['nationality1'] = dataproc.pop('nationality').map(nation)
def relfreq(x):
    """Bucket a ``religion_freq`` code into three levels.

    Returns 0 for x < 2, 1 for 2 <= x < 5, and 2 for everything else
    (including NaN, for which both comparisons are false).
    """
    if x < 2:
        return 0
    if x < 5:
        return 1
    return 2
dataproc['religion_freq'] = dataproc['religion_freq'].map(relfreq)

# Kept in place: other statements in this file reference `stats`.
from scipy import stats

# Replace negative edu codes with the column's most frequent value.
# Series.mode() returns the modes in ascending order and ignores NaN, so
# .iloc[0] is the smallest most-frequent value.  The previous
# stats.mode(...)[0][0] raises IndexError on SciPy >= 1.11, where the mode of
# a 1-D input is returned as a scalar.
dataproc.loc[dataproc['edu'] < 0, 'edu'] = dataproc['edu'].mode().iloc[0]

del dataproc['edu_other']

# One-hot encode edu_status, then drop the raw column.
dataproc = dataproc.join(
    pd.get_dummies(dataproc["edu_status"], prefix="edu_status")
)
del dataproc["edu_status"]
def eduyr(x):
    """Return *x* when it is a positive, non-null value; otherwise 0.

    The null check runs first so that None / pd.NA map to 0 instead of
    raising on the ``>`` comparison.
    """
    if pd.isnull(x) or x <= 0:
        return 0
    return x
# Clean edu_yr with eduyr() (non-positive or missing -> 0), then express it
# relative to the birth year.
dataproc['edu_yr'] = dataproc['edu_yr'].map(eduyr) - dataproc['birth']
def eduyr1(x):
    """Return *x* when it is greater than 0, otherwise 0 (NaN also gives 0)."""
    return x if x > 0 else 0
# Non-positive differences (edu_yr minus birth) are clamped to 0.
dataproc['edu_yr'] = dataproc['edu_yr'].map(eduyr1)

# Replace negative income codes with the column's most frequent value.
# Series.mode().iloc[0] is the smallest most-frequent value (NaN ignored);
# stats.mode(...)[0][0] raises IndexError on SciPy >= 1.11.
# NOTE(review): the mode is taken over the whole column, negative codes
# included, as before.
dataproc.loc[dataproc['income'] < 0, 'income'] = dataproc['income'].mode().iloc[0]

# log(1 + income)
dataproc['income'] = np.log1p(dataproc['income'])

# Negative political codes are replaced by 1, then the column is one-hot
# encoded and the raw column dropped.
dataproc.loc[dataproc['political'] < 0, 'political'] = 1
dataproc = dataproc.join(
    pd.get_dummies(dataproc["political"], prefix="political")
)
del dataproc['political']
def joinparty(x):
    """Return *x* unless it is null or negative, in which case return 0."""
    if pd.isnull(x) or x < 0:
        return 0
    return x
# join_party relative to the birth year; null or negative results become 0.
dataproc['join_party'] = dataproc['join_party'].sub(dataproc['birth']).map(joinparty)

dataproc.drop(columns=['property_other'], inplace=True)

# Weight correction (original note): double weight_jin where it is <= 80 and
# height_cm >= 160, then double any value that is still <= 60.
# NOTE(review): presumably these rows were entered in kg rather than jin - confirm.
dataproc.loc[
    (dataproc['weight_jin'] <= 80) & (dataproc['height_cm'] >= 160),
    'weight_jin',
] = dataproc['weight_jin'] * 2  # weight correction
dataproc.loc[dataproc['weight_jin'] <= 60, 'weight_jin'] = dataproc['weight_jin'] * 2

# BMI = (weight_jin / 2) / (height_cm / 100) ** 2
dataproc['bmi'] = (
    dataproc['weight_jin'].map(lambda jin: jin / 2)
    / dataproc['height_cm'].map(lambda cm: (cm / 100) ** 2)
)
# Replace negative answer codes with each column's most frequent value.
# Series.mode() returns the modes in ascending order and ignores NaN, so
# .iloc[0] is the smallest most-frequent value.  The previous
# stats.mode(...)[0][0] raises IndexError on SciPy >= 1.11, where the mode of
# a 1-D input is returned as a scalar.
for col in ['health', 'health_problem', 'depression',
            'media_1', 'media_2', 'media_3',
            'media_4', 'media_5', 'media_6']:
    dataproc.loc[dataproc[col] < 0, col] = dataproc[col].mode().iloc[0]

# Average of the six media_* answers (a NaN in any of them propagates).
dataproc['media'] = (dataproc['media_1'] + dataproc['media_2'] + dataproc['media_3']
                     + dataproc['media_4'] + dataproc['media_5'] + dataproc['media_6']) / 6

# Same negative-code replacement for leisure_1 .. leisure_12.
for i in range(1, 13):
    col = 'leisure_' + str(i)
    dataproc.loc[dataproc[col] < 0, col] = dataproc[col].mode().iloc[0]
dataproc['leisure'] = (dataproc['leisure_1']+dataproc['leisure_2']+dataproc[&#