Kaggle competition notes, based on House Prices: Advanced Regression Techniques (a regression problem)

This post shares my experience from the Tianchi Precision Medicine Competition, focusing on feature-engineering methods such as data preprocessing, missing-value handling, and outlier analysis, and it provides sample code for stacking and ensembling.

    The day before yesterday I signed up for the Tianchi Precision Medicine Competition (AI-assisted prediction of genetic risk for diabetes). Today's submission put me around 180th, but my overall rank is 300+, and I have not done any feature-engineering work yet, so I went to Kaggle and found this similar problem. Below is the feature-engineering code I have been studying over the past two days; data preprocessing can follow these steps. The workflow is adapted from a public Kaggle kernel.

#!/usr/bin/python
#coding:utf-8
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
import xgboost as xgb

warnings.filterwarnings('ignore')

df_train = pd.read_csv('data/train.csv')

'''
============================================================
0. Overview of the data
============================================================
'''
# Show all column names of the data
# print(df_train.columns)
'''
# Overview of the target variable
print(df_train['SalePrice'].describe())
sns.distplot(df_train['SalePrice'])
plt.show()
'''

'''
#skewness and kurtosis of the target
print("Skewness: %f" % df_train['SalePrice'].skew())
print("Kurtosis: %f" % df_train['SalePrice'].kurt())
'''

'''
============================================================
1. Single-feature analysis
============================================================
'''

'''
#scatter plot grlivarea/saleprice: inspect the relationship between GrLivArea and the target
var = 'GrLivArea'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000))
plt.show()
'''

'''
#scatter plot totalbsmtsf/saleprice: inspect the relationship between TotalBsmtSF and the target
var = 'TotalBsmtSF'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000))
plt.show()
'''

'''
#box plot overallqual/saleprice: inspect the relationship between OverallQual and the target
var = 'OverallQual'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000)
plt.show()
'''

'''
var = 'YearBuilt'  # relationship between YearBuilt and the target
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
f, ax = plt.subplots(figsize=(18, 8))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000)
plt.xticks(rotation=90)
plt.show()
# The same relationship as a scatter plot:
# plt.scatter(df_train['YearBuilt'], df_train['SalePrice'])
# plt.show()
'''

'''
============================================================
2. Correlation analysis
============================================================
'''

'''
#correlation matrix across all features
corrmat = df_train.corr()
f, ax = plt.subplots(figsize=(18, 9))
sns.heatmap(corrmat, vmax=.8, square=True)
plt.show()
'''

'''
#saleprice correlation matrix: the k features most correlated with the target, plus their pairwise correlations
corrmat = df_train.corr()
k = 11 #number of variables for heatmap
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(df_train[cols].values.T)
sns.set(font_scale=0.75)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()
'''

'''
#scatterplot matrix of the correlated features, convenient for inspecting pairwise relationships
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
sns.pairplot(df_train[cols], height=2.5)  # this parameter was named 'size' in older seaborn
plt.show()
'''

'''
============================================================
3. Handling missing values
============================================================
'''

'''
#missing data: query and display the missing values
# When a feature is more than 15% missing, drop the feature entirely
total = df_train.isnull().sum().sort_values(ascending=False)
percent = (df_train.isnull().sum()/df_train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
print(missing_data.head(20))

#dealing with missing data
df_train = df_train.drop((missing_data[missing_data['Total'] > 1]).index, 1)  # drop the 18 features found above
df_train = df_train.drop(df_train.loc[df_train['Electrical'].isnull()].index)  # drop the samples where Electrical is missing
print("isnull:", df_train.isnull().sum().max())  # just checking that there's no missing data missing...
'''

'''
============================================================
4. Outlier analysis
============================================================
'''

'''
#standardizing data: scale the target to mean 0, std 1 and inspect the extreme values (univariate outliers)
saleprice_scaled = StandardScaler().fit_transform(df_train['SalePrice'].values[:, np.newaxis])
low_range = saleprice_scaled[saleprice_scaled[:, 0].argsort()][:10]
high_range = saleprice_scaled[saleprice_scaled[:, 0].argsort()][-10:]
print('outer range (low) of the distribution:')
print(low_range)
print('\nouter range (high) of the distribution:')
print(high_range)
'''

'''
#bivariate analysis saleprice/grlivarea: outlier detection over two variables
var = 'GrLivArea'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000))
plt.show()

# Identify and delete the two outlier points visible in the plot
print(df_train.sort_values(by='GrLivArea', ascending=False)[:2])
df_train = df_train.drop(df_train[df_train['Id'] == 1299].index)
df_train = df_train.drop(df_train[df_train['Id'] == 524].index)
'''

'''
#histogram and normal probability plot: check whether the target is normally
# distributed; an approximately normal target works well for these models
sns.distplot(df_train['SalePrice'], fit=norm);
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)
plt.show()
'''

'''
#applying log transformation: bring the target closer to a normal distribution
df_train['SalePrice'] = np.log(df_train['SalePrice'])
#transformed histogram and normal probability plot
sns.distplot(df_train['SalePrice'], fit=norm);
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)
plt.show()
'''

'''
#histogram and normal probability plot: check whether GrLivArea is normally distributed
sns.distplot(df_train['GrLivArea'], fit=norm)
fig = plt.figure()
res = stats.probplot(df_train['GrLivArea'], plot=plt)
plt.show()
'''

'''
#data transformation: log-transform GrLivArea toward a normal distribution
df_train['GrLivArea'] = np.log(df_train['GrLivArea'])
sns.distplot(df_train['GrLivArea'], fit=norm)
fig = plt.figure()
res = stats.probplot(df_train['GrLivArea'], plot=plt)
plt.show()
'''



A few days into the competition, here is a stacking example. I have done little feature engineering or parameter tuning, but it is a good stacking template with built-in cross-validation; you only need to copy your own feature engineering into the corresponding place in the program.
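The core of the stacking below is out-of-fold (OOF) prediction: every training row receives a level-1 prediction from a fold model that never saw that row, and the OOF columns from all base models become the inputs of a level-2 model. A minimal sketch of the idea on toy data (the shapes and the Ridge base model are illustrative, not part of the script below):

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

rng = np.random.RandomState(0)
X, y = rng.randn(100, 5), rng.randn(100)

oof = np.zeros(100)  # each row is predicted by a fold model that never saw it
for tr_idx, te_idx in KFold(n_splits=5).split(X):
    model = Ridge().fit(X[tr_idx], y[tr_idx])
    oof[te_idx] = model.predict(X[te_idx])
# 'oof' then becomes one feature column for the second-level (meta) model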

#coding: UTF-8
import numpy as np
import pandas as pd
from dateutil.parser import parse
from sklearn.model_selection import KFold  # sklearn.cross_validation in very old versions
from scipy.stats import norm, skew
from scipy.special import boxcox1p

data_path = 'data/'

train = pd.read_csv(data_path+'d_train_20180102.csv')
test = pd.read_csv(data_path+'d_test_A_20180102.csv')
train['血糖'] = np.log(train['血糖'])

y_train = train['血糖'].values

ntrain = train.shape[0]
ntest = test.shape[0]

def make_feat(train, test):
    train_id = train.id.values.copy()
    test_id = test.id.values.copy()
    data = pd.concat([train, test])

    data = data.drop(['血糖'], axis=1)

    data['性别'] = data['性别'].map({'男': 1, '女': 0, None: -1})
    data['体检日期'] = (pd.to_datetime(data['体检日期']) - parse('2016-10-09')).dt.days

    # Fill missing values in each lab-test feature with its column median.
    # (A log1p transform of each column was also tried; it is left disabled.)
    median_cols = [
        '*r-谷氨酰基转换酶', '*丙氨酸氨基转换酶', '*天门冬氨酸氨基转换酶', '*总蛋白',
        '*球蛋白', '*碱性磷酸酶', '中性粒细胞%', '乙肝e抗体', '乙肝e抗原',
        '乙肝核心抗体', '乙肝表面抗体', '乙肝表面抗原', '单核细胞%', '嗜碱细胞%',
        '嗜酸细胞%', '尿素', '尿酸', '总胆固醇', '淋巴细胞%', '甘油三酯',
        '白球比例', '白细胞计数', '白蛋白', '红细胞体积分布宽度', '红细胞压积',
        '红细胞平均体积', '红细胞平均血红蛋白浓度', '红细胞平均血红蛋白量',
        '红细胞计数', '肌酐', '血小板体积分布宽度', '血小板平均体积', '血小板比积',
        '血小板计数', '血红蛋白', '高密度脂蛋白胆固醇', '低密度脂蛋白胆固醇',
    ]
    for col in median_cols:
        data[col] = data[col].fillna(data[col].median())
        # data[col] = np.log1p(data[col])
    data['性别'] = data['性别'].fillna(1)

    data = data.fillna(data.median(axis=0))  # fillna returns a copy; assign it back

    '''
    numeric_features = data.dtypes[data.dtypes != "object"].index

    skewed = data[numeric_features].apply(lambda x: skew(x.dropna().astype(float)))
    skewed = skewed[abs(skewed) > 0.75]
    skewed = skewed.index

    data[skewed] = np.log1p(data[skewed])
    '''




    train_feat = np.array(data[:train.shape[0]])
    test_feat = np.array(data[train.shape[0]:])

    print "train_feat======================", train_feat.shape
    '''
    total = data.isnull().sum().sort_values(ascending=False)
    percent = (data.isnull().sum()/data.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    print(missing_data.head(20))
    '''
    return train_feat, test_feat

x_train, x_test = make_feat(train, test)



########################################################################
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, Lasso
from math import sqrt

NFOLDS = 10
SEED = 5
SUBMISSION_FILE = 'sample_submission.csv'

kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)


class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)


class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))


def get_oof(clf):
    # Out-of-fold predictions: each training row is predicted by the fold
    # model that did not see it; test predictions are averaged over folds.
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)


et_params = {
    'n_jobs': 16,
    'n_estimators': 100,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

rf_params = {
    'n_jobs': 16,
    'n_estimators': 100,
    'max_features': 0.2,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'reg:linear',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'rmse',
    'nrounds': 500
}



rd_params={
    'alpha': 10
}


ls_params={
    'alpha': 0.005
}


xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SklearnWrapper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestRegressor, seed=SEED, params=rf_params)
rd = SklearnWrapper(clf=Ridge, seed=SEED, params=rd_params)
ls = SklearnWrapper(clf=Lasso, seed=SEED, params=ls_params)

xg_oof_train, xg_oof_test = get_oof(xg)
et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
rd_oof_train, rd_oof_test = get_oof(rd)
ls_oof_train, ls_oof_test = get_oof(ls)

print("XG-CV: {}".format(0.5*(mean_squared_error(np.exp(y_train), np.exp(xg_oof_train)))))
print("ET-CV: {}".format(0.5*(mean_squared_error(np.exp(y_train), np.exp(et_oof_train)))))
print("RF-CV: {}".format(0.5*(mean_squared_error(np.exp(y_train), np.exp(rf_oof_train)))))
print("RD-CV: {}".format(0.5*(mean_squared_error(np.exp(y_train), np.exp(ls_oof_train)))))


x_train = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, rd_oof_train, ls_oof_train), axis=1)
x_test = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test, rd_oof_test, ls_oof_test), axis=1)

print("{},{}".format(x_train.shape, x_test.shape))

dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.8,
    'silent': 1,
    'subsample': 0.6,
    'learning_rate': 0.01,
    'objective': 'reg:linear',
    'max_depth': 1,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'rmse',
}

res = xgb.cv(xgb_params, dtrain, num_boost_round=1000, nfold=4, seed=SEED, stratified=False,
             early_stopping_rounds=25, verbose_eval=10, show_stdv=True)

best_nrounds = res.shape[0] - 1
cv_mean = res.iloc[-1, 0]
cv_std = res.iloc[-1, 1]

print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))

gbdt = xgb.train(xgb_params, dtrain, best_nrounds)

'''
submission = pd.read_csv(SUBMISSION_FILE)
submission.iloc[:, 1] = np.exp(gbdt.predict(dtest))
submission.to_csv('xgstacker_starter.sub.csv', index=None)
'''
y_pred = np.exp(gbdt.predict(dtest))
xgb_pred_df = pd.DataFrame(y_pred)
xgb_pred_df = xgb_pred_df.round(3)
xgb_pred_df.to_csv('xgstacker.csv', header=None, index=False, float_format="%.4f")

Here I am also posting an ensemble example. When using it you must adjust its rmse function so that cross-validation uses the competition's actual scoring formula and gives a meaningful local score. Note that LB and CV scores can disagree, which is related to a shift in the distribution of x between train and test.
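For context, the two competitions score differently, which is why the rmse function below must be adjusted when porting a House Prices kernel. A sketch of the two formulas, assuming log-transformed targets (the function names are illustrative):

import numpy as np
from sklearn.metrics import mean_squared_error

def tianchi_score(y_true_log, y_pred_log):
    # Tianchi metric: mean squared error divided by two, computed on the
    # original blood-glucose scale (the script log-transforms the target).
    return 0.5 * mean_squared_error(np.exp(y_true_log), np.exp(y_pred_log))

def house_prices_rmse(y_true_log, y_pred_log):
    # Kaggle House Prices metric: RMSE directly on the log-scale target.
    return np.sqrt(mean_squared_error(y_true_log, y_pred_log))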

#coding: UTF-8
import numpy as np
import pandas as pd
from dateutil.parser import parse
from sklearn.model_selection import KFold
from scipy.stats import norm, skew
from scipy.special import boxcox1p
from sklearn.preprocessing import StandardScaler, RobustScaler

data_path = 'data/'

train = pd.read_csv(data_path+'d_train_20180102.csv')
test = pd.read_csv(data_path+'d_test_A_20180102.csv')


def make_feat(train, test):
    train_id = train.id.values.copy()
    test_id = test.id.values.copy()
    data = pd.concat([train, test])

    data = data.drop(['血糖'], axis=1)

    data['性别'] = data['性别'].map({'男': 1, '女': 0, None: -1})
    data['体检日期'] = (pd.to_datetime(data['体检日期']) - parse('2016-10-09')).dt.days

    # Fill missing values in each lab-test feature with its column median.
    # (A log1p transform of each column was also tried; it is left disabled.)
    median_cols = [
        '*r-谷氨酰基转换酶', '*丙氨酸氨基转换酶', '*天门冬氨酸氨基转换酶', '*总蛋白',
        '*球蛋白', '*碱性磷酸酶', '中性粒细胞%', '乙肝e抗体', '乙肝e抗原',
        '乙肝核心抗体', '乙肝表面抗体', '乙肝表面抗原', '单核细胞%', '嗜碱细胞%',
        '嗜酸细胞%', '尿素', '尿酸', '总胆固醇', '淋巴细胞%', '甘油三酯',
        '白球比例', '白细胞计数', '白蛋白', '红细胞体积分布宽度', '红细胞压积',
        '红细胞平均体积', '红细胞平均血红蛋白浓度', '红细胞平均血红蛋白量',
        '红细胞计数', '肌酐', '血小板体积分布宽度', '血小板平均体积', '血小板比积',
        '血小板计数', '血红蛋白', '高密度脂蛋白胆固醇', '低密度脂蛋白胆固醇',
    ]
    for col in median_cols:
        data[col] = data[col].fillna(data[col].median())
        # data[col] = np.log1p(data[col])
    data['性别'] = data['性别'].fillna(1)

    data = data.fillna(data.median(axis=0))  # fillna returns a copy; assign it back

    '''
    numeric_features = data.dtypes[data.dtypes != "object"].index

    skewed = data[numeric_features].apply(lambda x: skew(x.dropna().astype(float)))
    skewed = skewed[abs(skewed) > 0.8]
    skewed = skewed.index

    data[skewed] = np.log1p(data[skewed])


    scaler = StandardScaler()
    scaler.fit(data[numeric_features])

    scaled = scaler.transform(data[numeric_features])
    for i, col in enumerate(numeric_features):
        data[col] = scaled[:, i]

    '''

    train_feat = data[data.id.isin(train_id)]
    test_feat = data[data.id.isin(test_id)]

    print "train_feat======================", train_feat.shape
    '''
    total = data.isnull().sum().sort_values(ascending=False)
    percent = (data.isnull().sum()/data.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    print(missing_data.head(20))
    '''
    return train_feat, test_feat

train_feat, test_feat = make_feat(train, test)

predictors = [f for f in test_feat.columns if f not in ['血糖']]  # note: defined but not used below


###################################################################
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.kernel_ridge import KernelRidge
from sklearn.base import BaseEstimator, RegressorMixin

from sklearn.preprocessing import LabelEncoder
from scipy.stats import skew

class CustomEnsembleRegressor(BaseEstimator, RegressorMixin):
    # Averages sub-model predictions in the original (un-logged) scale,
    # then returns to log space to match the log-transformed target.
    def __init__(self, regressors=None):
        self.regressors = regressors

    def fit(self, X, y):
        for regressor in self.regressors:
            regressor.fit(X, y)
        return self

    def predict(self, X):
        self.predictions_ = list()
        for regressor in self.regressors:
            self.predictions_.append(np.exp(regressor.predict(X).ravel()))

        # The target was transformed with np.log, so invert with np.log
        # (the original used np.log1p here, which does not match np.log).
        return np.log(np.mean(self.predictions_, axis=0))

################################################################################
# Competition metric: despite the name, this is MSE/2 on the original scale
# (the Tianchi scoring formula), evaluated from log-space predictions.
def rmse(y_true, y_pred):
    return 0.5*(mean_squared_error(np.exp(y_true), np.exp(y_pred)))


# Cross-validation of the full ensemble
def evaluate_model(model, x, y):
    n_splits_val = 3
    kf = KFold(n_splits=n_splits_val, shuffle=False)
    idx = 0
    rmse_buf = np.empty(n_splits_val)
    for train, test in kf.split(x):
        model.fit(x.iloc[train], y.iloc[train])
        y_cv = model.predict(x.iloc[test])
        rmse_buf[idx] = rmse(y.iloc[test], y_cv)
        print('Iteration #' + str(idx) + ': score = %.5f' % rmse_buf[idx])
        idx += 1

    mean_rmse = np.mean(rmse_buf)
    print('   Mean score = %.5f' % mean_rmse + ' +/- %.5f' % np.std(rmse_buf))

    return mean_rmse

# Cross-validation of each sub-model separately
def evaluate_submodels(models, x, y):
    n_splits_val = 10
    kf = KFold(n_splits=n_splits_val, shuffle=False)
    for m_i, model in enumerate(models.regressors):
        rmse_buf = np.empty(n_splits_val)
        idx = 0
        for train, test in kf.split(x):
            model.fit(x.iloc[train], y.iloc[train])
            y_cv = model.predict(x.iloc[test])
            rmse_buf[idx] = rmse(y.iloc[test], y_cv)
            idx += 1

        mean_rmse = np.mean(rmse_buf)
        print('Model #' + str(m_i) + ': mean score = %.5f' % mean_rmse + \
              ' +/- %.5f' % np.std(rmse_buf))

train_df_munged, test_df_munged = train_feat, test_feat

label_df = pd.DataFrame(index=train_df_munged.index, columns=['血糖'])
label_df['血糖'] = np.log(train['血糖'])

################################################################################
regr1 = xgb.XGBRegressor(
    colsample_bytree=0.2,
    gamma=0.0,
    learning_rate=0.01,
    max_depth=4,
    min_child_weight=1.5,
    n_estimators=30000,
    reg_alpha=0.9,
    reg_lambda=0.6,
    subsample=0.2,
    seed=42,
    silent=1)

best_alpha = 0.00098
regr2 = Lasso(alpha=best_alpha, max_iter=50000)

regr3 = ElasticNet(alpha=0.001)

regr4 = KernelRidge(alpha=0.3, kernel='polynomial', degree=2, coef0=1.85)

# Note: regr4 (KernelRidge) is defined above but not included in the ensemble.
regr = CustomEnsembleRegressor([regr1, regr2, regr3])

print('Evaluating each model separately..')
evaluate_submodels(regr, train_df_munged, label_df)

print('Evaluating ensemble..')
evaluate_model(regr, train_df_munged, label_df)

print('Fitting ensemble and predicting..')
# Fit the ensemble
regr.fit(train_df_munged, label_df)

# Run prediction on the Kaggle test set.
y_pred = regr.predict(test_df_munged)


################################################################################
print('Saving results..')
# Convert the ensemble's log-space predictions back to the original scale and save to CSV.
y_pred = np.exp(y_pred)
sub = pd.DataFrame()
sub['血糖'] = y_pred
sub.to_csv('ensemble_output.csv', header=False, index=False)

All of the above is code I pulled together myself from Kaggle; I still think the best approach is to go read the public kernels on Kaggle directly.
