# 本文数据来源为天池实验室“挖掘幸福感”项目。本文对数据集进行探索性分析后,进行了一系列数据预处理工作,并针对预处理后的数据进行建模调参,最终得到优化后模型的拟合结果。本文的目的有二:一是通过对该数据的处理和建模,探寻影响幸福感的因素;二是通过该过程建立一个较为通用的分析建模模板,以便快速运用到其他项目当中。
# -*- coding: utf-8 -*-
"""
探寻幸福感
Created on 2019-5-21 21:38:05
Updated on 2019-12-7 16:08:17
@author: lzy
"""
# Template settings
# Training set name: happiness_train
# Test set name:     happiness_test
# Show and set the working directory.
import os

# Directory the CSV files are read from; hard-coded by the original author.
WORK_DIR = 'E:\\python'

# A bare os.getcwd() displays nothing when run as a script, so print it.
print(os.getcwd())
# Only switch directories when the target exists; otherwise os.chdir() would
# raise and abort the script, and the CSV files are instead looked up in the
# current directory.
if os.path.isdir(WORK_DIR):
    os.chdir(WORK_DIR)
# 载入常用包
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import norm
from sklearn.decomposition import pca
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler # 标准化
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
# Lift the column cap up front so describe() output is shown for every column.
pd.set_option('display.max_columns', None)
# Read the training and test sets (in that order) from the working directory.
happiness_train, happiness_test = (
    pd.read_csv(csv_name)
    for csv_name in ('happiness_train_abbr.csv', 'happiness_test_abbr.csv')
)
# Summary statistics of the training-set variables.
train_describe = happiness_train.describe()
print(train_describe)
# Plot the target against a normal distribution (intended to be used once the
# target's missing values have been handled).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm
# the installed version still provides it before relying on this plot.
sns.distplot(happiness_train['happiness'], fit=norm)  # histogram plus a fitted curve and a normal curve
mu, sigma = norm.fit(happiness_train['happiness'])  # fitted mean / std shown in the legend
# Raw string: '\m' and '\s' are invalid escape sequences in a normal literal
# and trigger a warning on current Python; the runtime text is unchanged.
plt.legend(
    [r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
    loc='best',
)
plt.ylabel('Frequency')  # y-axis label
# Probability plot of the target, drawn on a separate figure.
fig = plt.figure()  # default figure width and height
res = stats.probplot(happiness_train['happiness'], plot=plt)
plt.show()
# Check for missing values.
# Read the training file a second time with '-8' registered as an NA marker;
# this frame is kept apart from happiness_train and used for missing-value statistics.
happiness_train_na = pd.read_csv(
    'happiness_train_abbr.csv',
    na_values='-8',
)
# Per-column share of missing entries: NA count divided by the row count.
train_missing = happiness_train_na.isnull().sum().div(len(happiness_train_na))
print(train_missing)
# Heat map of the pairwise correlations between the variables and the target.
# Restrict to numeric/boolean columns before calling corr(): since pandas 2.0
# DataFrame.corr() raises on non-numeric columns instead of silently dropping
# them (the raw frame presumably still holds text columns such as
# survey_time — verify against the CSV). Older pandas versions dropped such
# columns automatically, so the resulting matrix is the same there.
corrmat = happiness_train.select_dtypes(include=['number', 'bool']).corr()
fig = plt.figure(figsize=(20, 9))
# Alternative with a similar result: f, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corrmat, vmax=0.8, annot=True)
plt.show()
# 训练集预处理
'''
id 删除 yes
happiness剔除-8 yes
county剔除 yes
survey_time剔除 yes
birth换算为age yes
nationality将-8都替换为1 yes
religion填充众数1 yes
religion_freq填充众数1 yes
edu 根据年龄插补(相关0.42) yes
income<0 填充中位数(去掉负数后的均值) yes
political填充众数1 yes
height_cm,weight_jin暂不处理(后期如需进一步研究再处理)
health -8替换为和health_problem一致(相关性0.54)
health_problem -8替换为和health一致 (最后再将两列中-8全替换为3,防止有两列都缺失的) yes
depression -8替换为health(相关性0.42) yes
socialize -8替换为3 yes
relax 替换为socialize yes
learn和edu(相关0.52) 计算edu对应learn值 四舍五入替换 yes
equity -8替换为3 yes
family_status -8替换为3 yes
class 替换为family_status*2-1 yes
work_status 删除 yes
work_yr 删除 yes
work_type 删除 yes
work_manage 删除 yes
family_m > 10 替换为10 <1 替换为2 yes
family_income = (income+mean(income))/2*family_m yes
house <0 替换为1 yes
car -8替换为2 yes
status_peer -8替换为2 yes
status_3_before -8替换为2 yes
view -8替换为3 yes
inc_ability -8替换为2 yes
特征处理(高级):(未进行)
城市和收入结合
个人收入和家庭收入结合
'''
'''https://www.cnblogs.com/gczr/p/6761613.html
https://blog.csdn.net/qq_32618817/article/details/80587228 # Python中的groupby分组
接下来的工作:
删除缺失目标值的行
删除不需要的列
新增列,删除列
定义替换函数,逐个替换
对特殊处理的单独再处理
对处理后的数据描述统计/探索,合理后划分训练样本和测试样本
'''
# Row/column counts of the data set before preprocessing. The original read
# train_processed.shape here, before train_processed was assigned, which
# raises NameError; the source frame is inspected instead.
print(happiness_train_na.shape)
# Data preprocessing.
# Start from the frame that was read with '-8' treated as NA. drop() returns a
# new DataFrame, so happiness_train_na itself is left untouched.
train_processed = happiness_train_na.drop(
    ['id', 'survey_time', 'county', 'work_status', 'work_yr', 'work_type', 'work_manage'],
    axis=1,
)  # remove the columns that are not used
# Keep only the rows whose target value is present; .copy() makes the result an
# independent frame so later column assignments do not act on a slice.
train_processed = train_processed[~train_processed['happiness'].isnull()].copy()
# Check the result of the preprocessing so far.
print(train_processed.describe())
print(train_processed.shape)