task4 建模调参

最新推荐文章于 2022-10-04 16:08:23 发布

原创最新推荐文章于 2022-10-04 16:08:23 发布 · 533 阅读

CC 4.0 BY-SA版权

1 读取数据

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

note: 将整型变量的类型尽量压缩，逐步判断并转化为int8,int16,int32,int64

def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory footprint.

    Signed integer columns are narrowed to the smallest of
    int8/int16/int32/int64 that holds their value range, float columns to the
    smallest of float16/float32/float64, and object/string columns are
    converted to ``category``. Columns of any other dtype (bool, unsigned
    integers, datetimes, existing categoricals, ...) are left untouched.

    Args:
        df: DataFrame to shrink. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    bytes_per_mb = 1024 ** 2
    # memory_usage() reports bytes; convert so the "MB" label is truthful.
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if pd.api.types.is_object_dtype(col_type) or pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype('category')
        elif isinstance(col_type, np.dtype) and col_type.kind == 'i':
            c_min = df[col].min()
            c_max = df[col].max()
            # Bounds are inclusive: a column spanning exactly -128..127 fits int8.
            for target in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        elif pd.api.types.is_float_dtype(col_type):
            c_min = df[col].min()
            c_max = df[col].max()
            # NOTE: float16 keeps only about 3 significant decimal digits, so
            # this trades precision for memory.
            for target in (np.float16, np.float32):
                limits = np.finfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-sized frame so the report never divides by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))
    return df

sample_feature = reduce_mem_usage(pd.read_csv('data_for_tree.csv'))

Memory usage of dataframe is 62099672.00 MB
Memory usage after optimization is: 16520303.00 MB
Decreased by 73.4%

sample_feature.head()

	SaleID	name	model	brand	bodyType	gearbox	power	kilometer	notRepairedDamage	...	used_time	city	brand_amount	brand_price_max	brand_price_median	brand_price_min	brand_price_sum	brand_price_std	brand_price_average	power_bin
0	0	736	30.0	6	1.0	0.0	60	12.5	0.0	...	4384.0	1.0	10192.0	35990.0	1800.0	13.0	36457520.0	4564.0	3576.0	5.0
1	1	2262	40.0	1	2.0	0.0	0	15.0	-	...	4756.0	4.0	13656.0	84000.0	6400.0	15.0	124044600.0	8992.0	9080.0	NaN
2	2	14874	115.0	15	1.0	0.0	163	12.5	0.0	...	4384.0	2.0	1458.0	45000.0	8496.0	100.0	14373814.0	5424.0	9848.0	16.0
3	3	71865	109.0	10	0.0	1.0	193	15.0	0.0	...	7124.0	NaN	13992.0	92900.0	5200.0	15.0	113034208.0	8248.0	8076.0	19.0
4	4	111080	110.0	5	1.0	0.0	68	5.0	0.0	...	1531.0	6.0	4664.0	31500.0	2300.0	20.0	15414322.0	3344.0	3306.0	6.0

5 rows × 39 columns

continuous_feature_names = [x for x in sample_feature.columns if x not in ['price','brand','model']]

continuous_feature_names

['SaleID',
 'name',
 'bodyType',
 'fuelType',
 'gearbox',
 'power',
 'kilometer',
 'notRepairedDamage',
 'seller',
 'offerType',
 'v_0',
 'v_1',
 'v_2',
 'v_3',
 'v_4',
 'v_5',
 'v_6',
 'v_7',
 'v_8',
 'v_9',
 'v_10',
 'v_11',
 'v_12',
 'v_13',
 'v_14',
 'train',
 'used_time',
 'city',
 'brand_amount',
 'brand_price_max',
 'brand_price_median',
 'brand_price_min',
 'brand_price_sum',
 'brand_price_std',
 'brand_price_average',
 'power_bin']

2 线性回归 & 五折交叉验证 & 模拟真实业务情况

下面一段代码是对 notRepairedDamage（汽车有尚未修复的损坏）这一列进行转化，这一列原来是什么情况？来看一下

sample_feature.notRepairedDamage.value_counts()

0.0    147809
-       32251
1.0     18977
Name: notRepairedDamage, dtype: int64

notRepairedDamage原来是category类型，将列转换成float32类型。首先用0代替-，应该是0代表没有损坏，而1是有损坏

sample_feature = sample_feature.dropna().replace('-', 0).reset_index(drop=True)
sample_feature['notRepairedDamage'] = sample_feature['notRepairedDamage'].astype(np.float32)
train = sample_feature[continuous_feature_names + ['price']]

train_X = train[continuous_feature_names]
train_y = train['price']

然后将特征和标签提取出来了

简单建模

所谓简单就是线性回归了

from sklearn.linear_model import LinearRegression

虽然简单，但是这里输入参数normalize=True啥意思呢？就是归一化的意思，每个值先减平均数，然后再除以二范数（注意：该参数在 scikit-learn 1.0 中被弃用、1.2 中被移除，新版本需要改用 StandardScaler 配合 Pipeline 来做预处理）

model = LinearRegression(normalize=True)

model = model.fit(train_X, train_y)

查看训练的线性回归模型的截距（intercept）与权重(coef)

下面这段代码就是求出模型的截距和系数，有一点需要自己学习吧，就是他这个函数用法，我自己是想破头也写不出来的。其实他做的就是让系数从大到小排序，肯定越靠前说明重要程度越大啦，但负值越大应该也影响越大

print('intercept:'+ str(model.intercept_))
sorted(dict(zip(continuous_feature_names, model.coef_)).items(), key=lambda x:x[1], reverse=True)

intercept:-110670.68277237505





[('v_6', 3367064.341641913),
 ('v_8', 700675.5609398851),
 ('v_9', 170630.27723221114),
 ('v_7', 32322.66193201868),
 ('v_12', 20473.670796932514),
 ('v_3', 17868.079541480864),
 ('v_11', 11474.938996683431),
 ('v_13', 11261.76456000961),
 ('v_10', 2683.9200905975536),
 ('gearbox', 881.8225039248213),
 ('fuelType', 363.90425072160974),
 ('bodyType', 189.60271012071914),
 ('city', 44.94975120522923),
 ('power', 28.55390161674886),
 ('brand_price_median', 0.5103728134079288),
 ('brand_price_std', 0.45036347092635),
 ('brand_amount', 0.14881120395065447),
 ('brand_price_max', 0.0031910186703120124),
 ('SaleID', 5.355989919860818e-05),
 ('train', 9.12696123123169e-08),
 ('seller', -1.2324308045208454e-06),
 ('offerType', -1.2362143024802208e-06),
 ('brand_price_sum', -2.1750068681875495e-05),
 ('name', -0.0002980012713068734),
 ('used_time', -0.0025158943328551053),
 ('brand_price_average', -0.40490484510119196),
 ('brand_price_min', -2.2467753486892215),
 ('power_bin', -34.42064411723048),
 ('v_14', -274.7841180773099),
 ('kilometer', -372.89752666072053),
 ('notRepairedDamage', -495.1903844629022),
 ('v_0', -2045.05495735435),
 ('v_5', -11022.98624056226),
 ('v_4', -15121.731109853255),
 ('v_2', -26098.299920495414),
 ('v_1', -45556.189297267025)]

from matplotlib import pyplot as plt

取随机样本画图

subsample_index = np.random.randint(low=0, high=len(train_y), size=50)

绘制特征v_9的值与标签的散点图，图片发现模型的预测结果（蓝色点）与真实标签（黑色点）的分布差异较大，且部分预测值出现了小于0的情况，说明我们的模型存在一些问题

一堆点，这能看出啥呀，哎，搞不懂，也可能是点取的太多了吧。但一个明显的错误是能看出来的，就是竟然能预测出小于0的值，这肯定不好吧

plt.scatter(train_X['v_9'][subsample_index], train_y[subsample_index], color='black')
plt.scatter(train_X['v_9'][subsample_index], model.predict(train_X.loc[subsample_index]), color='blue')
plt.xlabel('v_9')
plt.ylabel('price')
plt.legend(['True Price','Predicted Price'],loc='upper right')
print('The predicted price is obvious different from true price')
plt.show()

The predicted price is obvious different from true price

在这里插入图片描述

通过作图我们发现数据的标签（price）呈现长尾分布，不利于我们的建模预测。原因是很多模型都假设数据误差项符合正态分布，而长尾分布的数据违背了这一假设。

这个做了分布的图，说是长尾分布不是正态分布，所以会出现预测较大的偏差

import seaborn as sns
print('It is clear to see the price shows a typical exponential distribution')
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
train_y.hist()
plt.subplot(1,2,2)
train_y[train_y < np.quantile(train_y, 0.9)].hist()

It is clear to see the price shows a typical exponential distribution

在这里插入图片描述

在这里我们对标签进行了 $\log(x + 1)$ 变换，使标签贴近于正态分布

note:用np.log进行对数运算，这一点好神奇，长尾分布取个log就接近正态分布了！不过自己的理解就是那些比较格路（特别离群）的幺蛾子值取个log就向均值靠拢了，所以更接近于正态分布了吧


train_y_ln = np.log(train_y + 1)

import seaborn as sns
print('The transformed price seems like normal distribution')
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
train_y_ln.hist()
plt.subplot(1,2,2)
train_y_ln[train_y_ln < np.quantile(train_y_ln, 0.9)].hist()

The transformed price seems like normal distribution

在这里插入图片描述

dict(zip(continuous_feature_names, model.coef_)).items()可以将键值对都取出来，然后组成一个tuple数组

model = model.fit(train_X, train_y_ln)

print('intercept:'+ str(model.intercept_))
sorted(dict(zip(continuous_feature_names, model.coef_)).items(), key=lambda x:x[1], reverse=True)

intercept:18.750749465562816





[('v_9', 8.052409900567445),
 ('v_5', 5.764236596653074),
 ('v_12', 1.6182081236792127),
 ('v_1', 1.4798310582986653),
 ('v_11', 1.1669016563599888),
 ('v_13', 0.9404711296034274),
 ('v_7', 0.7137273083566703),
 ('v_3', 0.6837875771084441),
 ('v_0', 0.008500518010074017),
 ('power_bin', 0.00849796930289183),
 ('gearbox', 0.00792237727832901),
 ('fuelType', 0.006684769706822828),
 ('bodyType', 0.004523520092702889),
 ('power', 0.0007161894205358969),
 ('brand_price_min', 3.334351114748527e-05),
 ('brand_amount', 2.8978797042779114e-06),
 ('brand_price_median', 1.2571172873010354e-06),
 ('brand_price_std', 6.659176363425468e-07),
 ('brand_price_max', 6.194956307517108e-07),
 ('brand_price_average', 5.999345965068619e-07),
 ('SaleID', 2.11941700396494e-08),
 ('seller', 4.986766555248323e-11),
 ('train', 1.0800249583553523e-11),
 ('offerType', -3.7552183584921295e-11),
 ('brand_price_sum', -1.5126504215930698e-10),
 ('name', -7.015512588892066e-08),
 ('used_time', -4.122479372352577e-06),
 ('city', -0.0022187824810425832),
 ('v_14', -0.004234223418120774),
 ('kilometer', -0.013835866226882912),
 ('notRepairedDamage', -0.27027942349846146),
 ('v_4', -0.8315701200994835),
 ('v_2', -0.9470842241619211),
 ('v_10', -1.6261466689762891),
 ('v_8', -40.34300748761719),
 ('v_6', -238.7903638550714)]

再次进行可视化，发现预测结果与真实值较为接近，且未出现异常状况

# Compare true prices with the predictions of the model fitted on log(price + 1).
plt.figure(figsize=(30,10))
plt.scatter(train_X['v_9'][subsample_index], train_y[subsample_index], color='black')
# The target was transformed with log(price + 1), so the inverse is
# exp(pred) - 1 (np.expm1), not a bare np.exp, which is off by one.
plt.scatter(train_X['v_9'][subsample_index], np.expm1(model.predict(train_X.loc[subsample_index])), color='blue')
plt.xlabel('v_9')
plt.ylabel('price')
plt.legend(['True Price','Predicted Price'],loc='upper right')
print('The predicted price seems normal after np.log transforming')
plt.show()

The predicted price seems normal after np.log transforming

在这里插入图片描述

五折交叉验证

这里面引用了很多函数需要注意

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error,  make_scorer

note: np.nan_to_num()将nan变成0，将inf变成有限（finite）的大数

def log_transfer(func):
    """Wrap a metric so that it is evaluated on log-transformed values.

    The returned callable takes ``(y, yhat)``, applies ``np.log`` to both,
    replaces NaN/inf in the logged predictions via ``np.nan_to_num`` and then
    forwards the two arrays to ``func``.
    """
    def wrapper(y, yhat):
        log_true = np.log(y)
        # The log of a non-positive prediction is NaN or -inf; nan_to_num
        # maps those to finite numbers so the metric can still be computed.
        log_pred = np.nan_to_num(np.log(yhat))
        return func(log_true, log_pred)
    return wrapper

scores = cross_val_score(model, X=train_X, y=train_y, verbose=1, cv = 5, scoring=make_scorer(log_transfer(mean_absolute_error)))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.7s finished

使用线性回归模型，对未处理标签的特征数据进行五折交叉验证

print('AVG:', np.mean(scores))

AVG: 1.3658023920313753

使用线性回归模型，对处理过标签的特征数据进行五折交叉验证

scores = cross_val_score(model, X=train_X, y=train_y_ln, verbose=1, cv = 5, scoring=make_scorer(mean_absolute_error))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.1s finished

print('AVG:', np.mean(scores))

AVG: 0.1932530183704746

pd.DataFrame(scores.reshape(1,-1))

	0	1	2	3	4
0	1.348304	1.36349	1.380712	1.378401	1.358105

scores = pd.DataFrame(scores.reshape(1,-1))
scores.columns = ['cv' + str(x) for x in range(1, 6)]
scores.index = ['MAE']
scores

	cv1	cv2	cv3	cv4	cv5
MAE	1.348304	1.36349	1.380712	1.378401	1.358105

模拟真实业务情况

但在事实上，由于我们并不具有预知未来的能力，五折交叉验证在某些与时间相关的数据集上反而反映了不真实的情况。通过2018年的二手车价格预测2017年的二手车价格，这显然是不合理的，因此我们还可以采用时间顺序对数据集进行分隔。在本例中，我们选用靠前时间的4/5样本当作训练集，靠后时间的1/5当作验证集，最终结果与五折交叉验证差距不大

import datetime

sample_feature

	SaleID	name	model	brand	bodyType	fuelType	gearbox	power	kilometer	notRepairedDamage	...	used_time	city	brand_amount	brand_price_max	brand_price_median	brand_price_min	brand_price_sum	brand_price_std	brand_price_average	power_bin
0	0	736	30.0	6	1.0	0.0	0.0	60	12.5	0.0	...	4384.0	1.0	10192.0	35990.0	1800.0	13.0	36457520.0	4564.0	3576.0	5.0
1	2	14874	115.0	15	1.0	0.0	0.0	163	12.5	0.0	...	4384.0	2.0	1458.0	45000.0	8496.0	100.0	14373814.0	5424.0	9848.0	16.0
2	4	111080	110.0	5	1.0	0.0	0.0	68	5.0	0.0	...	1531.0	6.0	4664.0	31500.0	2300.0	20.0	15414322.0	3344.0	3306.0	6.0
3	5	137642	24.0	10	0.0	1.0	0.0	109	10.0	0.0	...	2482.0	3.0	13992.0	92900.0	5200.0	15.0	113034208.0	8248.0	8076.0	10.0
4	6	2402	13.0	4	0.0	0.0	1.0	150	15.0	0.0	...	6184.0	3.0	16576.0	99999.0	6000.0	12.0	138279072.0	8088.0	8344.0	14.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
91724	149994	43073	42.0	1	1.0	0.0	0.0	122	3.0	0.0	...	1538.0	5.0	13656.0	84000.0	6400.0	15.0	124044600.0	8992.0	9080.0	12.0
91725	149995	163978	121.0	10	4.0	0.0	1.0	163	15.0	0.0	...	5772.0	4.0	13992.0	92900.0	5200.0	15.0	113034208.0	8248.0	8076.0	16.0
91726	149996	184535	116.0	11	0.0	0.0	0.0	125	10.0	0.0	...	2322.0	2.0	2944.0	34500.0	2900.0	30.0	13398006.0	4724.0	4548.0	12.0
91727	149997	147587	60.0	11	1.0	1.0	0.0	90	6.0	0.0	...	2003.0	3.0	2944.0	34500.0	2900.0	30.0	13398006.0	4724.0	4548.0	8.0
91728	149998	45907	34.0	10	3.0	1.0	0.0	156	15.0	0.0	...	3672.0	1.0	13992.0	92900.0	5200.0	15.0	113034208.0	8248.0	8076.0	15.0

91729 rows × 39 columns

重设索引，并将原索引丢掉

sample_feature = sample_feature.reset_index(drop=True)

# Hold out the last fifth of the rows as a validation set.
split_point = len(sample_feature) // 5 * 4

# .loc slicing is label-based and inclusive at BOTH ends, so
# .loc[:split_point] and .loc[split_point:] would share row `split_point`.
# Positional, half-open iloc slices keep the two sets disjoint.
train = sample_feature.iloc[:split_point].dropna()
val = sample_feature.iloc[split_point:].dropna()

train_X = train[continuous_feature_names]
train_y_ln = np.log(train['price'] + 1)
val_X = val[continuous_feature_names]
val_y_ln = np.log(val['price'] + 1)

model = model.fit(train_X, train_y_ln)

mean_absolute_error(val_y_ln, model.predict(val_X))

0.19577667270300989

绘制学习曲线与验证曲线

from sklearn.model_selection import learning_curve, validation_curve

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,n_jobs=1, train_size=np.linspace(.1, 1.0, 5 )):
    """Plot training and cross-validation MAE against the training-set size.

    Opens a new figure, runs ``learning_curve`` for ``estimator`` on
    ``(X, y)`` scored with mean absolute error, and draws the mean of each
    curve with a band of one standard deviation around it.

    Args:
        estimator: Estimator handed to ``learning_curve``.
        title: Figure title.
        X, y: Features and target handed to ``learning_curve``.
        ylim: Optional ``(low, high)`` limits for the y axis.
        cv: Cross-validation setting forwarded to ``learning_curve``.
        n_jobs: Parallelism setting forwarded to ``learning_curve``.
        train_size: Training-set sizes forwarded as ``train_sizes``.

    Returns:
        The ``plt`` module, so the caller can keep working on the figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('Training example')
    plt.ylabel('score')

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_size,
        scoring=make_scorer(mean_absolute_error),
    )

    # One entry per curve: (mean, std, colour, legend label).
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1), "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1), "g", "Cross-validation score"),
    ]

    plt.grid()  # background grid
    # Shaded +/- one standard deviation bands first ...
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    # ... then the mean curves on top.
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)
    plt.legend(loc="best")
    return plt

plot_learning_curve(LinearRegression(), 'Liner_model', train_X[:1000], train_y_ln[:1000], ylim=(0.0, 0.5), cv=5, n_jobs=1)

在这里插入图片描述

多种模型对比

train = sample_feature[continuous_feature_names + ['price']].dropna()

train_X = train[continuous_feature_names]
train_y = train['price']
train_y_ln = np.log(train_y + 1)

线性模型 & 嵌入式特征选择

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

models = [LinearRegression(),
          Ridge(),
          Lasso()]

result = dict()
for model in models:
    model_name = str(model).split('(')[0]
    scores = cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv = 5, scoring=make_scorer(mean_absolute_error))
    result[model_name] = scores
    print(model_name + ' is finished')

LinearRegression is finished
Ridge is finished
Lasso is finished

result = pd.DataFrame(result)
result.index = ['cv' + str(x) for x in range(1, 6)]
result

	LinearRegression	Ridge	Lasso
cv1	0.190792	0.194832	0.383899
cv2	0.193758	0.197632	0.381893
cv3	0.194132	0.198123	0.384090
cv4	0.191825	0.195670	0.380526
cv5	0.195758	0.199676	0.383611

# Fit plain least squares on the log price and plot |coefficient| per feature.
model = LinearRegression().fit(train_X, train_y_ln)
print('intercept:'+ str(model.intercept_))
# Newer seaborn releases no longer accept x/y positionally; pass them by keyword.
sns.barplot(x=abs(model.coef_), y=continuous_feature_names)

intercept:18.750749465575524

在这里插入图片描述

# Fit ridge (L2) regression on the log price and plot |coefficient| per feature.
model = Ridge().fit(train_X, train_y_ln)
print('intercept:'+ str(model.intercept_))
# Newer seaborn releases no longer accept x/y positionally; pass them by keyword.
sns.barplot(x=abs(model.coef_), y=continuous_feature_names)

intercept:4.671709786976683

在这里插入图片描述

L1正则化有助于生成一个稀疏权值矩阵，进而可以用于特征选择。如下图，我们发现power与used_time特征非常重要。

# Fit lasso (L1) regression on the log price and plot |coefficient| per feature.
model = Lasso().fit(train_X, train_y_ln)
print('intercept:'+ str(model.intercept_))
# Newer seaborn releases no longer accept x/y positionally; pass them by keyword.
sns.barplot(x=abs(model.coef_), y=continuous_feature_names)

intercept:8.6721824626662

在这里插入图片描述

除此之外，决策树通过信息熵或GINI指数选择分裂节点时，优先选择的分裂特征也更加重要，这同样是一种特征选择的方法。XGBoost与LightGBM模型中的feature_importances_（特征重要性）指标正是基于此计算的

非线性模型

除了线性模型以外，还有许多我们常用的非线性模型如下，在此篇幅有限不再一一讲解原理。我们选择了部分常用模型与线性模型进行效果比对。

下面用到的方法有：线性回归、决策树回归、随机森林、梯度提升回归（GBDT）、多层感知器、xgboost、lightgbm（支持向量机虽然在下面被导入了，但导入的SVC是分类器，并没有加入对比的模型列表）

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost.sklearn import XGBRegressor
from lightgbm.sklearn import LGBMRegressor

models = [LinearRegression(),
          DecisionTreeRegressor(),
          RandomForestRegressor(),
          GradientBoostingRegressor(),
          MLPRegressor(solver='lbfgs', max_iter=100), 
          XGBRegressor(n_estimators = 100, objective='reg:squarederror'), 
          LGBMRegressor(n_estimators = 100)]

models[0]

[LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'),
 RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False),
 GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                           init=None, learning_rate=0.1, loss='ls', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0, warm_start=False),
 MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=100,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='lbfgs',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False),
 XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              objective='reg:squarederror', random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, subsample=None,
              tree_method=None, validate_parameters=False, verbosity=None),
 LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)]

result = dict()
for model in models:
    model_name = str(model).split('(')[0]
    scores = cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv = 5, scoring=make_scorer(mean_absolute_error))
    result[model_name] = scores
    print(model_name + ' is finished')

LinearRegression is finished
DecisionTreeRegressor is finished
RandomForestRegressor is finished
GradientBoostingRegressor is finished
MLPRegressor is finished
XGBRegressor is finished
LGBMRegressor is finished

result = pd.DataFrame(result)
result.index = ['cv' + str(x) for x in range(1, 6)]
result

	LinearRegression	DecisionTreeRegressor	RandomForestRegressor	GradientBoostingRegressor	MLPRegressor	XGBRegressor	LGBMRegressor
cv1	0.190792	0.196785	0.132981	0.168903	336.480763	0.142378	0.141544
cv2	0.193758	0.193241	0.134480	0.171857	399.941663	0.140922	0.145501
cv3	0.194132	0.189123	0.133667	0.170915	266.022859	0.139393	0.143887
cv4	0.191825	0.190084	0.132413	0.169083	353.765162	0.137492	0.142497
cv5	0.195758	0.204320	0.137153	0.174078	720.029717	0.143733	0.144852

可以看到随机森林模型在每一个fold中均取得了更好的效果

模型调参

在此我们介绍了三种常用的调参方法如下：

贪心算法 https://www.jianshu.com/p/ab89df9759c8
网格调参 https://blog.csdn.net/weixin_43172660/article/details/83032029
贝叶斯调参 https://blog.csdn.net/linxid/article/details/81189154

## LGB的参数集合：

objective = ['regression', 'regression_l1', 'mape', 'huber', 'fair']

num_leaves = [3,5,10,15,20,40, 55]# 叶子节点个数
max_depth = [3,5,10,15,20,40, 55]# 树最大深度
bagging_fraction = []
feature_fraction = []
drop_rate = []

1 贪心调参

# Greedy search: tune one hyper-parameter at a time, freezing the best value
# found so far (lowest mean 5-fold MAE on the log price) before moving on.

# Step 1: objective function.
best_obj = {}
for obj in objective:
    model = LGBMRegressor(objective=obj)
    fold_mae = cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv = 5, scoring=make_scorer(mean_absolute_error))
    score = np.mean(fold_mae)
    best_obj[obj] = score

# Step 2: num_leaves, with the winning objective fixed.
chosen_objective = min(best_obj.items(), key=lambda x:x[1])[0]
best_leaves = {}
for leaves in num_leaves:
    model = LGBMRegressor(objective=chosen_objective, num_leaves=leaves)
    fold_mae = cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv = 5, scoring=make_scorer(mean_absolute_error))
    score = np.mean(fold_mae)
    best_leaves[leaves] = score

# Step 3: max_depth, with the winning objective and num_leaves fixed.
chosen_leaves = min(best_leaves.items(), key=lambda x:x[1])[0]
best_depth = {}
for depth in max_depth:
    model = LGBMRegressor(objective=chosen_objective,
                          num_leaves=chosen_leaves,
                          max_depth=depth)
    fold_mae = cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv = 5, scoring=make_scorer(mean_absolute_error))
    score = np.mean(fold_mae)
    best_depth[depth] = score

2 Grid Search 调参

from sklearn.model_selection import GridSearchCV

# Exhaustive search over every objective / num_leaves / max_depth combination.
parameters = {'objective': objective , 'num_leaves': num_leaves, 'max_depth': max_depth}
model = LGBMRegressor()
# Search on the same target (log price) and metric (MAE) used by the
# cross_val_score evaluations in this file. Fitting on raw `train_y` with the
# default R^2 score would pick parameters for a different problem than the
# one they are later evaluated on.
clf = GridSearchCV(model, parameters, cv=5, scoring='neg_mean_absolute_error')
clf = clf.fit(train_X, train_y_ln)

clf.best_params_

{'max_depth': 15, 'num_leaves': 55, 'objective': 'regression'}

model = LGBMRegressor(objective='regression',
                          num_leaves=55,
                          max_depth=15)

np.mean(cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv = 5, scoring=make_scorer(mean_absolute_error)))

0.13754833106731224

3 贝叶斯调参

from bayes_opt import BayesianOptimization

首先定义一个目标函数，返回的是 1 减去 k 折交叉验证的平均MAE（贝叶斯优化做的是最大化，所以要把误差转换成越大越好的分数）

def rf_cv(num_leaves, max_depth, subsample, min_child_samples):
    """Objective for Bayesian optimisation of an LGBMRegressor.

    Args:
        num_leaves: Candidate ``num_leaves``; truncated to int.
        max_depth: Candidate ``max_depth``; truncated to int.
        subsample: Candidate row-subsampling fraction.
        min_child_samples: Candidate ``min_child_samples``; truncated to int.

    Returns:
        ``1 - mean 5-fold MAE`` on ``train_X`` / ``train_y_ln``, so that a
        maximiser ends up minimising the error.
    """
    estimator = LGBMRegressor(
        objective='regression_l1',
        # The optimiser proposes floats; these parameters must be integers.
        num_leaves=int(num_leaves),
        max_depth=int(max_depth),
        subsample=subsample,
        # LightGBM only applies `subsample` (bagging_fraction) when the
        # bagging frequency is non-zero; without this the tuned `subsample`
        # value has no effect on the model.
        subsample_freq=1,
        min_child_samples=int(min_child_samples),
    )
    val = cross_val_score(
        estimator,
        X=train_X, y=train_y_ln, verbose=0, cv = 5, scoring=make_scorer(mean_absolute_error)
    ).mean()
    return 1 - val

rf_bo = BayesianOptimization(
    rf_cv,
    {
    'num_leaves': (2, 100),
    'max_depth': (2, 100),
    'subsample': (0.1, 1),
    'min_child_samples' : (2, 100)
    }
)

rf_bo.maximize()

[31mInitialization[0m
[94m----------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   max_depth |   min_child_samples |   num_leaves |   subsample | 
    1 | 00m18s | [35m   0.86150[0m | [32m    41.7364[0m | [32m             9.9106[0m | [32m     43.8343[0m | [32m     0.4737[0m | 
    2 | 00m25s | [35m   0.86523[0m | [32m    20.0225[0m | [32m            11.1218[0m | [32m     62.7188[0m | [32m     0.3173[0m | 
    3 | 00m17s |    0.86143 |     45.6899 |             64.7975 |      43.3648 |      0.8202 | 
    4 | 00m10s |    0.83443 |     35.0734 |             94.4035 |       8.8507 |      0.8622 | 
    5 | 00m18s |    0.86244 |     85.3179 |             23.6400 |      46.9726 |      0.7228 | 
[31mBayesian Optimization[0m
[94m----------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   max_depth |   min_child_samples |   num_leaves |   subsample | 
    6 | 00m36s | [35m   0.86920[0m | [32m    99.4589[0m | [32m            99.1108[0m | [32m     99.7631[0m | [32m     0.8204[0m | 
    7 | 00m24s |    0.80646 |      2.5444 |             96.8373 |      99.3405 |      0.8690 | 
    8 | 00m35s |    0.86895 |     92.8784 |              2.6496 |      99.7885 |      0.2488 | 
    9 | 00m36s |    0.86895 |     42.1611 |              2.5265 |      99.9897 |      0.1828 | 
   10 | 00m37s | [35m   0.86921[0m | [32m    99.1173[0m | [32m            57.5858[0m | [32m     99.6229[0m | [32m     0.1268[0m | 
   11 | 00m31s |    0.85987 |     99.4359 |             99.6975 |      37.0181 |      0.3138 | 
   12 | 00m41s |    0.86780 |     66.8997 |             21.5978 |      81.4402 |      0.9551 | 
   13 | 00m41s |    0.86692 |     99.9751 |             75.6942 |      72.3071 |      0.8105 | 
   14 | 00m29s |    0.82570 |      3.3240 |              2.7883 |      98.3646 |      0.8390 | 
   15 | 00m18s |    0.77190 |     99.7659 |              5.9422 |       2.2912 |      0.4362 | 
   16 | 00m29s |    0.86532 |     57.5752 |             99.7148 |      61.8887 |      0.1556 | 
   17 | 00m20s |    0.82546 |      3.3740 |             99.9999 |      42.6172 |      0.1799 | 
   18 | 00m33s |    0.86700 |     52.7029 |              2.9349 |      73.3034 |      0.1083 | 
   19 | 00m20s |    0.80204 |      3.1007 |              3.6067 |       3.0243 |      0.5249 | 
   20 | 00m39s | [35m   0.86921[0m | [32m    64.1085[0m | [32m            98.4171[0m | [32m     99.6344[0m | [32m     0.1344[0m | 
   21 | 00m41s | [35m   0.86927[0m | [32m    43.9552[0m | [32m            47.2108[0m | [32m     99.2895[0m | [32m     0.1297[0m | 
   22 | 00m38s |    0.86555 |     74.0348 |             52.8610 |      61.7144 |      0.1191 | 
   23 | 00m42s |    0.86703 |     99.5047 |             28.7613 |      74.8431 |      0.1029 | 
   24 | 00m41s |    0.86601 |     36.6047 |             34.0949 |      64.7690 |      0.1102 | 
   25 | 00m55s |    0.86916 |     66.7270 |             65.1533 |      99.8953 |      0.9703 | 
   26 | 00m44s | [35m   0.86940[0m | [32m    72.6885[0m | [32m            25.6077[0m | [32m     99.7277[0m | [32m     0.1478[0m | 
   27 | 00m47s |    0.86909 |     99.9445 |             20.9597 |      98.0581 |      0.9664 | 
   28 | 00m45s |    0.86758 |     83.0911 |             99.8907 |      79.5884 |      0.6953 | 
   29 | 00m48s |    0.86923 |     49.6717 |             17.7407 |      99.3922 |      0.9610 | 
   30 | 00m48s |    0.86895 |     65.9624 |              2.8017 |      99.2133 |      0.6879 |

rf_bo.res['max']

{'max_val': 0.8693980187983674,
 'max_params': {'num_leaves': 99.72769486848688,
  'max_depth': 72.68851665306862,
  'subsample': 0.14775375691262158,
  'min_child_samples': 25.60766609629669}}

1-rf_bo.res['max']['max_val']

0.13060198120163258

总结

在本章中，我们完成了建模与调参的工作，并对我们的模型进行了验证。此外，我们还采用了一些基本方法来提高预测的精度，提升如下图所示。

plt.figure(figsize=(13,5))
sns.lineplot(x=['0_origin','1_log_transfer','2_L1_&_L2','3_change_model','4_parameter_turning'], y=[1.36 ,0.19, 0.19, 0.14, 0.13])