LightGBM 适用于多种任务(回归、二分类、多分类),不同任务需要相应调整参数(主要是 objective 和 metric),下面给出各任务的基本代码。
回归
import sklearn
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,mean_squared_error
import numpy as np
import lightgbm as lgb
# Regression example: train a LightGBM regressor with the native API and
# report the mean squared error on the held-out split.
#
# NOTE(review): datasets.load_boston() was removed in scikit-learn 1.2. On a
# newer scikit-learn, substitute another regression dataset (for example
# datasets.load_diabetes()); the rest of the script is unchanged.
boston_price = datasets.load_boston()
data = boston_price.data
target = boston_price.target

# Split into training and test sets. A fixed random_state keeps the split
# reproducible, matching the classification examples.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=1024
)
print("Train data length:", len(X_train))
print("Test data length:", len(X_test))

# Convert to LightGBM's Dataset format. The validation set references the
# training set so both share the same feature binning.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# Training parameters.
params = {
    'boosting_type': 'gbdt',     # boosting type
    'objective': 'regression',   # objective function
    'metric': {'mse'},           # evaluation metric
    'num_leaves': 31,            # number of leaves per tree
    'learning_rate': 0.05,       # learning rate
    'verbose': 1,                # <0: fatal only, =0: errors (warnings), >0: info
}

# Train the model. Early stopping is configured through a callback: the
# `early_stopping_rounds` keyword of lgb.train() was removed in LightGBM 4.0,
# while the callback form works on both old and new releases.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)

# Predict with the best iteration found by early stopping.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)  # shape [B]
print('mse=%.6f' % (mean_squared_error(y_test, y_pred)))
二分类
import sklearn
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,mean_squared_error
import numpy as np
from matplotlib import pyplot as plt
import lightgbm as lgb
import pickle
# Binary classification example: train a LightGBM classifier on the breast
# cancer dataset, report AUC, plot feature importance and the metric curve,
# then round-trip the model through pickle.
breast_cancer = datasets.load_breast_cancer()
data = breast_cancer.data
target = breast_cancer.target

# Split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=1024
)
print("Train data length:", len(X_train))
print("Test data length:", len(X_test))

# Convert to LightGBM's Dataset format.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# Training parameters.
params = {
    'boosting_type': 'gbdt',   # boosting type
    'objective': 'binary',     # objective function
    'metric': {'auc'},         # evaluation metric
    'num_leaves': 31,          # number of leaves per tree
    'learning_rate': 0.05,     # learning rate
    # NOTE(review): 120 threads is more than most machines have cores; the
    # LightGBM docs recommend the number of physical cores. Adjust as needed.
    'nthread': 120,
    'verbose': 1,              # <0: fatal only, =0: errors (warnings), >0: info
}

# Train the model. Early stopping and metric recording are configured through
# callbacks: the `early_stopping_rounds` and `evals_result` keywords of
# lgb.train() were removed in LightGBM 4.0, while the callbacks work on both
# old and new releases.
evals_result = {}  # filled in by lgb.record_evaluation during training
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_eval],
    callbacks=[
        lgb.early_stopping(stopping_rounds=5),
        lgb.record_evaluation(evals_result),
    ],
)

# Predict positive-class probabilities with the best iteration.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)  # shape [B]
print('auc=%.6f' % (roc_auc_score(y_test, y_pred)))

# Feature importance plot.
lgb.plot_importance(gbm)
plt.show()

# Metric curve recorded during training.
lgb.plot_metric(evals_result)
plt.show()

# Save the model with pickle.
with open('model.pkl', 'wb') as f:
    pickle.dump(gbm, f)

# Load the model with pickle.
# NOTE: only unpickle files you created yourself; pickle.load can execute
# arbitrary code from an untrusted file.
with open('model.pkl', 'rb') as f:
    gbm = pickle.load(f)
多分类
import sklearn
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import numpy as np
import lightgbm as lgb
import pickle
import random
import matplotlib.pyplot as plt
# Multiclass example: train a 10-class LightGBM classifier on the sklearn
# digits dataset, report one-vs-rest AUC, and plot feature importance and the
# metric curve.
np.random.seed(1024)
random.seed(1024)

digits = datasets.load_digits()
data = digits.data
target = digits.target

# Split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=1024
)
print("Train data length:", len(X_train))
print("Test data length:", len(X_test))

# Convert to LightGBM's Dataset format.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# Training parameters.
params = {
    'boosting_type': 'gbdt',        # boosting type
    'objective': 'multiclass',      # objective function
    'num_class': 10,                # number of classes
    'metric': {'multi_logloss'},    # evaluation metric
    'num_leaves': 31,               # number of leaves per tree
    'learning_rate': 0.05,          # learning rate
    # If feature_fraction < 1.0, LightGBM randomly selects a subset of
    # features on each iteration (alias: colsample_bytree).
    'feature_fraction': 0.8,
    'feature_fraction_seed': 1,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'bagging_seed': 3,
    'nthread': 4,
    'verbose': 1,                   # <0: fatal only, =0: errors (warnings), >0: info
}

# Train the model. Metric recording is configured through a callback: the
# `evals_result` keyword of lgb.train() was removed in LightGBM 4.0, while
# lgb.record_evaluation works on both old and new releases.
evals_result = {}  # filled in by lgb.record_evaluation during training
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=20,
    valid_sets=[lgb_eval],
    callbacks=[lgb.record_evaluation(evals_result)],
)

# Predict class probabilities.
# NOTE(review): no early stopping is configured here, so best_iteration is
# not a tuned value; prediction is expected to use all trained rounds.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)  # shape [B, n_class]
# For hard labels instead of probabilities, take the argmax over the class
# axis: np.argmax(y_pred, axis=-1) gives shape [B].
print('best_iteration={} '.format(gbm.best_iteration))
print('auc=%.6f' % (roc_auc_score(y_test, y_pred, multi_class='ovr')))
print(gbm.best_score)

# Feature importance plot.
lgb.plot_importance(gbm)
plt.show()

# Metric curve recorded during training.
lgb.plot_metric(evals_result)
plt.show()
更详细的教程可以参见 https://github.com/microsoft/LightGBM/tree/master/examples/python-guide
以上是 LightGBM 原生接口的使用形式;兼容 sklearn 接口的使用方法可以参见《LightGBM两种使用方式》一文。