Feature transformations with ensembles of trees
Transform your features into a higher-dimensional, sparse space, then train a linear model on these features.
First fit an ensemble of trees (totally random trees, a random forest, or gradient boosted trees) on the training set. Then each leaf of each tree in the ensemble is assigned a fixed arbitrary feature index in a new feature space. These leaf indices are then encoded in a one-hot fashion.
Each sample goes through the decisions of each tree of the ensemble and ends up in one leaf per tree. The sample is encoded by setting feature values for these leaves to 1 and the other feature values to 0.
The resulting transformer has then learned a supervised, sparse, high-dimensional categorical embedding of the data.
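As a minimal sketch of this leaf-to-one-hot encoding (on hypothetical toy data, not the dataset used below): `apply()` returns the leaf index each sample lands in for every tree, and `OneHotEncoder` turns those indices into the sparse binary embedding.

# Minimal sketch of the leaf one-hot encoding (toy data, illustration only).
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

X_toy, y_toy = make_classification(n_samples=100, random_state=0)
forest = RandomForestClassifier(n_estimators=3, max_depth=2, random_state=0)
forest.fit(X_toy, y_toy)

leaves = forest.apply(X_toy)           # shape (100, 3): one leaf index per tree
enc = OneHotEncoder(categories='auto')
embedding = enc.fit_transform(leaves)  # sparse binary matrix, one column per leaf
print(leaves[0], embedding.shape)      # leaf indices of the first sample, embedding size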
In [5]:
# Author: Tim Head <betatim@gmail.com>
#
# License: BSD 3 clause

import numpy as np
np.random.seed(10)

import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.pipeline import make_pipeline
In [9]:
n_estimator = 10
X, y = make_classification(n_samples=80000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

# It is important to train the ensemble of trees on a different subset
# of the training data than the linear regression model to avoid
# overfitting, in particular if the total number of leaves is
# similar to the number of training samples
X_train, X_train_lr, y_train, y_train_lr = train_test_split(
    X_train, y_train, test_size=0.5)

# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,
                          random_state=0)
rt
Out[9]:
RandomTreesEmbedding(max_depth=3, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None, random_state=0, sparse_output=True, verbose=0, warm_start=False)
In [10]:
rt_lm = LogisticRegression(solver='lbfgs', max_iter=1000)
rt_lm
Out[10]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=1000, multi_class='warn', n_jobs=None, penalty='l2', random_state=None, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)
In [11]:
pipeline = make_pipeline(rt, rt_lm)
pipeline
Out[11]:
Pipeline(memory=None, steps=[('randomtreesembedding', RandomTreesEmbedding(max_depth=3, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None, random_state=0, ...enalty='l2', random_state=None, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False))])
In [12]:
# Analysis of sklearn's roc_curve() function:
# https://blog.youkuaiyun.com/Titan0427/article/details/79356290
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder(categories='auto')
rf_lm = LogisticRegression(solver='lbfgs', max_iter=1000)
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

# Supervised transformation based on gradient boosted trees
grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd_enc = OneHotEncoder(categories='auto')
grd_lm = LogisticRegression(solver='lbfgs', max_iter=1000)
grd.fit(X_train, y_train)
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

y_pred_grd_lm = grd_lm.predict_proba(
    grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)

# The gradient boosted model by itself
y_pred_grd = grd.predict_proba(X_test)[:, 1]
fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)

# The random forest model by itself
y_pred_rf = rf.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, thresholds_skl = roc_curve(y_test, y_pred_rf)

print(fpr_rf, '\n', tpr_rf, '\n', thresholds_skl)
[0. 0. 0. ... 0.99894916 0.99989992 1. ] [0.00000000e+00 4.99600320e-05 9.99200639e-05 ... 1.00000000e+00 1.00000000e+00 1.00000000e+00] [1.86755011 0.86755011 0.86707087 ... 0.09745395 0.09734498 0.09422677]
In [4]:
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')
plt.plot(fpr_rf, tpr_rf, label='RF')
plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
plt.plot(fpr_grd, tpr_grd, label='GBT')
plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()
In [13]:
plt.figure(2)
plt.xlim(0, 0.2)
plt.ylim(0.8, 1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')
plt.plot(fpr_rf, tpr_rf, label='RF')
plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
plt.plot(fpr_grd, tpr_grd, label='GBT')
plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve (zoomed in at top left)')
plt.legend(loc='best')
plt.show()
https://community.alteryx.com/t5/Data-Science-Blog/ROC-Curves-in-Python-and-R/ba-p/138430
In [14]:
import numpy as np
np.random.seed(10)

import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
from scipy.sparse import hstack

X, y = make_classification(n_samples=80000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

# It is important to train the ensemble of trees on a different subset
# of the training data than the linear regression model to avoid
# overfitting, in particular if the total number of leaves is
# similar to the number of training samples
X_train, X_train_lr, y_train, y_train_lr = train_test_split(
    X_train, y_train, test_size=0.5)

n_estimator = 10

# X_train is 20000 x 20, X_train_lr is 20000 x 20
# y_train is 20000 x 1, y_train_lr is 20000 x 1, y_test is 40000 x 1
X_train.shape
Out[14]:
(20000, 20)
RF+LR
We start with the random forest. Let's walk through the code.
In [15]:
# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression(solver='lbfgs', max_iter=1000)

# Train the random forest
rf.fit(X_train, y_train)

# rf.apply(X_train) has shape 20000 x 10, one column per tree.
# Each entry is the index of the leaf that the sample falls into in that tree;
# e.g. 32 means the sample ends up in leaf 32 of that tree.
# rf_enc.fit(rf.apply(X_train)) one-hot encodes these leaf indices.
rf_enc.fit(rf.apply(X_train))

# rf_enc.transform(rf.apply(X_train_lr)) one-hot encodes rf.apply(X_train_lr)
# using the encoding learned on rf.apply(X_train).
# Note that we do NOT concatenate rf.apply(X_train) and rf.apply(X_train_lr)
# before one-hot encoding, as one usually would: once training is finished the
# forest's tree structure (and hence its set of leaves) is fixed, so the values
# in rf.apply(X_train_lr) necessarily lie in the same range as rf.apply(X_train).
# The LR is then trained on the features constructed by the random forest.
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)
print("AUC of RF + LR:", roc_auc_score(y_test, y_pred_rf_lm))
C:\Users\ljt\Anaconda3\lib\site-packages\sklearn\preprocessing\_encoders.py:371: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values. If you want the future behaviour and silence this warning, you can specify "categories='auto'". In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly. warnings.warn(msg, FutureWarning)
AUC of RF + LR: 0.9725327559933575
GBDT+LR
Next we experiment with gradient boosted trees (GBDT). The code is almost identical to the random forest case, so we will not walk through it again.
In [18]:
# Supervised transformation based on gradient boosted trees
grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd_enc = OneHotEncoder()
grd_lm = LogisticRegression(solver='lbfgs', max_iter=1000)
grd
Out[18]:
GradientBoostingClassifier(criterion='friedman_mse', init=None, learning_rate=0.1, loss='deviance', max_depth=3, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_iter_no_change=None, presort='auto', random_state=None, subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False)
In [19]:
grd_lm
Out[19]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=1000, multi_class='warn', n_jobs=None, penalty='l2', random_state=None, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)
In [21]:
grd.fit(X_train, y_train)
# grd.apply(X) has shape (n_samples, n_estimators, n_classes); for binary
# classification the last axis has size 1, hence the [:, :, 0] indexing.
grd.apply(X_train)[:, :, 0]
Out[21]:
array([[ 6., 6., 6., ..., 10., 10., 10.], [10., 10., 10., ..., 3., 3., 3.], [ 6., 6., 6., ..., 11., 10., 10.], ..., [ 6., 6., 6., ..., 10., 10., 10.], [ 6., 6., 6., ..., 11., 10., 10.], [ 6., 6., 6., ..., 11., 10., 10.]])
In [16]:
grd.fit(X_train, y_train)
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

y_pred_grd_lm = grd_lm.predict_proba(
    grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)
print("AUC of GBT + LR:", roc_auc_score(y_test, y_pred_grd_lm))
C:\Users\ljt\Anaconda3\lib\site-packages\sklearn\preprocessing\_encoders.py:371: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values. If you want the future behaviour and silence this warning, you can specify "categories='auto'". In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly. warnings.warn(msg, FutureWarning)
AUC of GBT + LR: 0.9847114426749124
XGBoost+LR
Finally, we experiment with XGBoost.
In [22]:
# Supervised transformation based on xgboost
# (bind the classifier to xgb_clf so it does not shadow the xgboost module imported as xgb)
xgb_clf = xgb.XGBClassifier(
    nthread=4,             # nthread=-1 uses all CPU cores in parallel (default); nthread=1 uses a single core.
    learning_rate=0.08,    # learning rate: step size of each weight update, default 0.3.
                           # Smaller values train more slowly; typical values are 0.01-0.2.
    n_estimators=50,       # total number of boosting iterations, i.e. the number of trees.
    max_depth=5,           # tree depth, default 6, typical values 3-10. Larger values overfit
                           # more easily; smaller values underfit more easily.
    gamma=0,               # regularization term: minimum loss reduction required to split a node.
    subsample=0.9,         # fraction of the training data used to grow each tree.
                           # Default 1, typical values 0.5-1; helps prevent overfitting.
    colsample_bytree=0.5)  # fraction of the features used to grow each tree.
                           # Default 1, typical values 0.5-1; helps prevent overfitting.

xgb_enc = OneHotEncoder()
xgb_lm = LogisticRegression(solver='lbfgs', max_iter=1000)

xgb_clf.fit(X_train, y_train)
xgb_enc.fit(xgb_clf.apply(X_train))
xgb_lm.fit(xgb_enc.transform(xgb_clf.apply(X_train_lr)), y_train_lr)

y_pred_xgb_lm = xgb_lm.predict_proba(
    xgb_enc.transform(xgb_clf.apply(X_test)))[:, 1]
fpr_xgb_lm, tpr_xgb_lm, _ = roc_curve(y_test, y_pred_xgb_lm)
print("AUC of XGBoost + LR:", roc_auc_score(y_test, y_pred_xgb_lm))
C:\Users\ljt\Anaconda3\lib\site-packages\sklearn\preprocessing\_encoders.py:371: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values. If you want the future behaviour and silence this warning, you can specify "categories='auto'". In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly. warnings.warn(msg, FutureWarning)
AUC of XGBoost + LR: 0.9925876883813635
In the code above, we trained the LR only on the new features constructed by the tree model.
Next, we go a step further: we concatenate the new features with the original 20 features to form a new dataset and train the LR on it.
In [23]:
X_train_ext = hstack([xgb_enc.transform(xgb_clf.apply(X_train_lr)), X_train_lr])
X_test_ext = hstack([xgb_enc.transform(xgb_clf.apply(X_test)), X_test])
xgb_lm.fit(X_train_ext, y_train_lr)

y_pred_xgb_originalfeature_lm = xgb_lm.predict_proba(X_test_ext)[:, 1]
fpr_xgb_originalfeature_lm, tpr_xgb_originalfeature_lm, _ = roc_curve(
    y_test, y_pred_xgb_originalfeature_lm)
print("AUC of XGBoost new features + original features + LR:",
      roc_auc_score(y_test, y_pred_xgb_originalfeature_lm))
AUC of XGBoost new features + original features + LR: 0.9926323122840888
In [24]:
# The gradient boosted model by itself
y_pred_grd = grd.predict_proba(X_test)[:, 1]
fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)
print("AUC of GBT:", roc_auc_score(y_test, y_pred_grd))

# The random forest model by itself
y_pred_rf = rf.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)
print("AUC of RF:", roc_auc_score(y_test, y_pred_rf))

# The xgboost model by itself
xgb_clf.fit(X_train, y_train)
y_pred_xgb = xgb_clf.predict_proba(X_test)[:, 1]
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_pred_xgb)
print('AUC of XGBoost:', roc_auc_score(y_test, y_pred_xgb))
AUC of GBT: 0.9822001315799502
AUC of RF: 0.9657628078232098
AUC of XGBoost: 0.9928442730095244
AUC of RF + LR: 0.972532755993
AUC of GBT + LR: 0.984711442675
AUC of XGBoost + LR: 0.992587688381
AUC of XGBoost new features + original features + LR: 0.992632312284
AUC of GBT: 0.98220013158
AUC of RF: 0.965762807823
AUC of XGBoost: 0.99284427301
We can see that for RF and GBT, fusing with LR performs better than using RF or GBT alone. For XGBoost, however, XGBoost alone performs best, followed by "XGBoost new features + original features + LR", with "XGBoost + LR" last. This does not match our expectation, and it is worth discussing why it happens.
Let's plot the ROC curves for a closer look:
In [26]:
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_rf, tpr_rf, label='RF')
plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
plt.plot(fpr_grd, tpr_grd, label='GBT')
plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')
plt.plot(fpr_xgb, tpr_xgb, label='XGB')
plt.plot(fpr_xgb_lm, tpr_xgb_lm, label='XGB + LR')
plt.plot(fpr_xgb_originalfeature_lm, tpr_xgb_originalfeature_lm,
         label='XGB + ori_fea + LR')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

plt.figure(2)
plt.xlim(0, 0.2)
plt.ylim(0.8, 1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_rf, tpr_rf, label='RF')
plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
plt.plot(fpr_grd, tpr_grd, label='GBT')
plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')
plt.plot(fpr_xgb, tpr_xgb, label='XGB')
plt.plot(fpr_xgb_lm, tpr_xgb_lm, label='XGB + LR')
plt.plot(fpr_xgb_originalfeature_lm, tpr_xgb_originalfeature_lm,
         label='XGB + ori_fea + LR')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve (zoomed in at top left)')
plt.legend(loc='best')
plt.show()
The first plot shows the full ROC curves; the second zooms in on their top-left corner.
3. Why the fusion of XGBoost + LR is not as good as expected
We mentioned above that using XGBoost alone actually gives the best result. Why is that? Because XGBoost + LR is just one feature engineering technique, not a method that can automatically replace feature engineering.
With the help of reference [2], let us check whether XGBoost + LR is an attempt to automatically replace feature engineering, or merely one feature engineering technique.
We ran some experiments on our own business data. The figure below shows the results, where: "xgboost+lr1" feeds the XGBoost leaf-node features, the original attribute features and the second-order cross features together to the LR for training; "xgboost+lr2" feeds only the leaf-node features to the LR; "lr1" uses the original attribute features and the second-order cross features; "lr2" uses only the original attribute features.
1) "xgboost+lr2" is clearly weaker than "lr1", which shows that an XGBoost + LR using only leaf-node features is weaker than an LR with proper feature engineering. In other words, XGBoost leaf nodes cannot replace feature engineering, and XGBoost + LR cannot replace traditional feature engineering.
2) "xgboost+lr1" achieves the best result of all the methods, which shows that XGBoost + LR works quite well when the original feature engineering is kept. In other words, XGBoost leaf-node features are an effective kind of feature, and XGBoost + LR is an effective feature engineering technique.
We can therefore draw the following conclusion:
Although XGBoost + LR has achieved good results both in industry and in competitions, the XGBoost leaf nodes cannot fully replace hand-crafted features, and XGBoost + LR does not offer the automatic-feature-engineering story that deep learning does. In the end, XGBoost + LR does not go beyond feature engineering.
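To make the "xgboost+lr1" setup above concrete, here is a rough, hypothetical sketch on the toy dataset from this notebook (not the business data or the exact cross features used in reference [2]); `PolynomialFeatures` with `interaction_only=True` merely stands in for whatever second-order cross features were actually used there.

# Hypothetical sketch of an "xgboost+lr1"-style setup: leaf one-hot features
# + original features + second-order cross features, all fed to an LR.
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import PolynomialFeatures

# Stand-in for the second-order cross features of reference [2].
cross = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
cross.fit(X_train_lr)

X_train_lr1 = hstack([
    xgb_enc.transform(xgb_clf.apply(X_train_lr)),  # leaf-node one-hot features
    X_train_lr,                                    # original attribute features
    cross.transform(X_train_lr),                   # second-order cross features
])
X_test_lr1 = hstack([
    xgb_enc.transform(xgb_clf.apply(X_test)),
    X_test,
    cross.transform(X_test),
])

lr1 = LogisticRegression(solver='lbfgs', max_iter=1000)
lr1.fit(X_train_lr1, y_train_lr)
print("AUC of leaf + original + cross features + LR:",
      roc_auc_score(y_test, lr1.predict_proba(X_test_lr1)[:, 1]))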
LightGBM-GBDT-LR
A simple Python implementation of GBDT + LR for CTR prediction.
The GBDT part is handled by LightGBM, which was recently released by Microsoft; please install it first:
https://github.com/Microsoft/LightGBM
The logistic regression part is handled by scikit-learn.
The main idea comes from the work published by Facebook in 2014 on combining GBDT and LR for CTR prediction:
http://quinonero.net/Publications/predicting-clicks-facebook.pdf
GBDT is used for feature transformation, while the LR uses the transformed data for prediction.
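For reference, the evaluation metric proposed in that Facebook paper is Normalized Entropy (NE): the average log loss per impression divided by the entropy of the background CTR. With labels y_i in {-1, +1}, predicted probabilities p_i, and average empirical CTR p, the paper defines it as

NE = \frac{-\frac{1}{N}\sum_{i=1}^{N}\left(\frac{1+y_i}{2}\log p_i + \frac{1-y_i}{2}\log(1-p_i)\right)}{-\left(p\log p + (1-p)\log(1-p)\right)}

The code below computes only the numerator (and with 0/1 labels rather than ±1), so the printed "Normalized Cross Entropy" values should be read with that in mind.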
In [29]:
from __future__ import division
import json
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression

# load or create your dataset
print('Load data...')
# df_train = pd.read_csv('../train.txt', header=None, sep=' ')
# df_test = pd.read_csv('../test.txt', header=None, sep=' ')
# y_train = df_train[0]                 # training label
# y_test = df_test[0]                   # testing label
# X_train = df_train.drop(0, axis=1)    # training dataset
# X_test = df_test.drop(0, axis=1)      # testing dataset

# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 63,
    'num_trees': 100,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# number of leaves, will be used in feature transformation
num_leaf = 63

print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=lgb_train)

print('Save model...')
# save model to file
gbm.save_model('model.txt')

print('Start predicting...')
# predict and get data on leaves, training data
y_pred = gbm.predict(X_train, pred_leaf=True)

# feature transformation and write result
print('Writing transformed training data')
transformed_training_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf],
                                       dtype=np.int64)
for i in range(0, len(y_pred)):
    temp = np.arange(len(y_pred[0])) * num_leaf - 1 + np.array(y_pred[i])
    transformed_training_matrix[i][temp] += 1
# for i in range(0, len(y_pred)):
#     for j in range(0, len(y_pred[i])):
#         transformed_training_matrix[i][j * num_leaf + y_pred[i][j] - 1] = 1

# predict and get data on leaves, testing data
y_pred = gbm.predict(X_test, pred_leaf=True)

# feature transformation and write result
print('Writing transformed testing data')
transformed_testing_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf],
                                      dtype=np.int64)
for i in range(0, len(y_pred)):
    temp = np.arange(len(y_pred[0])) * num_leaf - 1 + np.array(y_pred[i])
    transformed_testing_matrix[i][temp] += 1
# for i in range(0, len(y_pred)):
#     for j in range(0, len(y_pred[i])):
#         transformed_testing_matrix[i][j * num_leaf + y_pred[i][j] - 1] = 1

print('Calculate feature importances...')
# feature importances
print('Feature importances:', list(gbm.feature_importance()))
print('Feature importances:', list(gbm.feature_importance("gain")))

# Logistic Regression Start
print("Logestic Regression Start")

# load or create your dataset
print('Load data...')

c = np.array([1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001])
for t in range(0, len(c)):
    lm = LogisticRegression(penalty='l2', C=c[t])  # logistic model construction
    lm.fit(transformed_training_matrix, y_train)   # fitting the data
    # y_pred_label = lm.predict(transformed_training_matrix)      # For training data
    # y_pred_label = lm.predict(transformed_testing_matrix)       # For testing data
    # y_pred_est = lm.predict_proba(transformed_training_matrix)  # Give the probability on each label
    y_pred_est = lm.predict_proba(transformed_testing_matrix)     # Give the probability on each label

    # print('number of testing data is ' + str(len(y_pred_label)))
    # print(y_pred_est)

    # calculate predict accuracy
    # num = 0
    # for i in range(0, len(y_pred_label)):
    #     if y_test[i] == y_pred_label[i]:
    #     # if y_train[i] == y_pred_label[i]:
    #         num += 1
    # print('penalty parameter is ' + str(c[t]))
    # print("prediction accuracy is " + str((num) / len(y_pred_label)))

    # Calculate the Normalized Cross-Entropy
    # NOTE: this formula assumes labels in {-1, +1}; with the 0/1 labels used here
    # the negative-class terms get weights 0.5/0.5 instead of 0/1.
    # for testing data
    NE = (-1) / len(y_pred_est) * sum(((1 + y_test) / 2 * np.log(y_pred_est[:, 1]) +
                                       (1 - y_test) / 2 * np.log(1 - y_pred_est[:, 1])))
    # for training data
    # NE = (-1) / len(y_pred_est) * sum(((1 + y_train) / 2 * np.log(y_pred_est[:, 1]) +
    #                                    (1 - y_train) / 2 * np.log(1 - y_pred_est[:, 1])))
    print("Normalized Cross Entropy " + str(NE))
Load data... Start training... [1] training's binary_logloss: 0.684161 [2] training's binary_logloss: 0.675372
C:\Users\ljt\Anaconda3\lib\site-packages\lightgbm\engine.py:99: UserWarning: Found `num_trees` in params. Will use it instead of argument warnings.warn("Found `{}` in params. Will use it instead of argument".format(alias))
[3] training's binary_logloss: 0.666747 [4] training's binary_logloss: 0.658286 [5] training's binary_logloss: 0.650014 [6] training's binary_logloss: 0.641927 [7] training's binary_logloss: 0.633995 [8] training's binary_logloss: 0.626174 [9] training's binary_logloss: 0.618487 [10] training's binary_logloss: 0.610944 [11] training's binary_logloss: 0.603565 [12] training's binary_logloss: 0.596295 [13] training's binary_logloss: 0.589161 [14] training's binary_logloss: 0.582138 [15] training's binary_logloss: 0.57526 [16] training's binary_logloss: 0.568494 [17] training's binary_logloss: 0.561834 [18] training's binary_logloss: 0.555298 [19] training's binary_logloss: 0.54887 [20] training's binary_logloss: 0.542555 [21] training's binary_logloss: 0.536331 [22] training's binary_logloss: 0.530213 [23] training's binary_logloss: 0.524208 [24] training's binary_logloss: 0.518295 [25] training's binary_logloss: 0.512474 [26] training's binary_logloss: 0.506776 [27] training's binary_logloss: 0.501141 [28] training's binary_logloss: 0.4956 [29] training's binary_logloss: 0.490147 [30] training's binary_logloss: 0.484782 [31] training's binary_logloss: 0.479499 [32] training's binary_logloss: 0.474309 [33] training's binary_logloss: 0.469204 [34] training's binary_logloss: 0.464185 [35] training's binary_logloss: 0.45926 [36] training's binary_logloss: 0.454369 [37] training's binary_logloss: 0.44956 [38] training's binary_logloss: 0.44482 [39] training's binary_logloss: 0.440136 [40] training's binary_logloss: 0.435545 [41] training's binary_logloss: 0.430995 [42] training's binary_logloss: 0.426515 [43] training's binary_logloss: 0.422128 [44] training's binary_logloss: 0.417767 [45] training's binary_logloss: 0.41347 [46] training's binary_logloss: 0.409241 [47] training's binary_logloss: 0.405085 [48] training's binary_logloss: 0.401005 [49] training's binary_logloss: 0.39698 [50] training's binary_logloss: 0.393007 [51] training's binary_logloss: 0.389062 [52] training's binary_logloss: 0.385175 [53] training's binary_logloss: 0.381346 [54] training's binary_logloss: 0.377562 [55] training's binary_logloss: 0.373835 [56] training's binary_logloss: 0.370174 [57] training's binary_logloss: 0.366563 [58] training's binary_logloss: 0.36301 [59] training's binary_logloss: 0.359519 [60] training's binary_logloss: 0.356066 [61] training's binary_logloss: 0.352628 [62] training's binary_logloss: 0.349233 [63] training's binary_logloss: 0.345888 [64] training's binary_logloss: 0.342596 [65] training's binary_logloss: 0.339343 [66] training's binary_logloss: 0.336132 [67] training's binary_logloss: 0.332979 [68] training's binary_logloss: 0.329853 [69] training's binary_logloss: 0.326786 [70] training's binary_logloss: 0.323745 [71] training's binary_logloss: 0.320751 [72] training's binary_logloss: 0.317811 [73] training's binary_logloss: 0.314895 [74] training's binary_logloss: 0.312021 [75] training's binary_logloss: 0.309185 [76] training's binary_logloss: 0.306388 [77] training's binary_logloss: 0.303622 [78] training's binary_logloss: 0.300893 [79] training's binary_logloss: 0.298203 [80] training's binary_logloss: 0.295548 [81] training's binary_logloss: 0.292909 [82] training's binary_logloss: 0.290331 [83] training's binary_logloss: 0.287775 [84] training's binary_logloss: 0.285227 [85] training's binary_logloss: 0.282733 [86] training's binary_logloss: 0.280265 [87] training's binary_logloss: 0.277813 [88] training's binary_logloss: 0.275398 [89] training's binary_logloss: 0.273011 [90] 
training's binary_logloss: 0.270669 [91] training's binary_logloss: 0.268386 [92] training's binary_logloss: 0.26611 [93] training's binary_logloss: 0.263855 [94] training's binary_logloss: 0.261649 [95] training's binary_logloss: 0.259431 [96] training's binary_logloss: 0.257247 [97] training's binary_logloss: 0.255088 [98] training's binary_logloss: 0.252959 [99] training's binary_logloss: 0.250869 [100] training's binary_logloss: 0.2488 Save model... Start predicting... Writing transformed training data Writing transformed testing data Calculate feature importances... Feature importances: [160, 1009, 142, 161, 206, 148, 158, 1005, 950, 113, 122, 142, 162, 171, 205, 188, 782, 149, 153, 74] Feature importances: [845.4382629788703, 25344.571634112664, 706.4202005525343, 836.0539398244389, 1211.1921805301986, 775.4421975353404, 935.3537068494932, 539445.4156525754, 30555.03219095946, 525.6520926086345, 609.4999471581998, 740.5362660885675, 872.5735067457427, 945.9102555144038, 1211.9642164236336, 961.341319939618, 111091.96299654292, 768.3999561554151, 776.2759209290857, 351.2581720100586] Logestic Regression Start Load data...
C:\Users\ljt\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning. FutureWarning)
Normalized Cross Entropy 1.4205683800573814
C:\Users\ljt\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning. FutureWarning)
Normalized Cross Entropy 1.3673511560136353
C:\Users\ljt\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning. FutureWarning)
Normalized Cross Entropy 1.2726684284475507
C:\Users\ljt\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning. FutureWarning)
Normalized Cross Entropy 1.237958674970395
C:\Users\ljt\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning. FutureWarning)
Normalized Cross Entropy 1.1520037803204248
C:\Users\ljt\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning. FutureWarning)
Normalized Cross Entropy 1.1034728664958695
C:\Users\ljt\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning. FutureWarning)
Normalized Cross Entropy 0.9423684651387876
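As a closing note, the manual leaf-encoding loop in the LightGBM cell can also be written with scikit-learn's OneHotEncoder, mirroring the earlier cells. This is only a sketch, assuming the same `gbm`, `X_train` and `X_test` objects are in scope; `handle_unknown='ignore'` guards against leaves that appear in the test predictions but not in the training predictions.

# Alternative sketch: build the leaf one-hot features from LightGBM's
# pred_leaf output with OneHotEncoder instead of manual index arithmetic.
from sklearn.preprocessing import OneHotEncoder

leaf_train = gbm.predict(X_train, pred_leaf=True)  # (n_samples, num_trees) leaf indices
leaf_test = gbm.predict(X_test, pred_leaf=True)

leaf_enc = OneHotEncoder(categories='auto', handle_unknown='ignore')
transformed_training_matrix = leaf_enc.fit_transform(leaf_train)  # sparse matrix
transformed_testing_matrix = leaf_enc.transform(leaf_test)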