A simple example of using a genetic algorithm to search for optimal XGBoost parameters, trained with a custom loss function (an alpha-balanced focal loss). The flow: load the data, define the focal-loss objective, let sko's GA search a 17-dimensional hyperparameter vector for the best validation AUC, then record the winning configuration.
import pandas as pd
import numpy as np
import xgboost as xgb
from sko.GA import GA
from sklearn.model_selection import train_test_split
from sklearn import metrics
from log_color import log, LogLevel  # project-local colored logger
train_df = pd.read_csv('./train_v2.csv')
test_df = pd.read_csv('./test_v2.csv')
x = train_df.drop(['user_id', 'merchant_id', 'label'], axis=1)
y = train_df['label']
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)
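# Quick look at the class balance; this imbalance is what alpha, scale_pos_weight
# and the focal-loss objective below compensate for.
print(y_train.value_counts(normalize=True))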
# Focal-loss hyperparameters: with gamma = 0 the loss reduces to alpha-weighted
# cross-entropy; alpha is set to the negative-class frequency, which up-weights
# the rarer positive class.
gamma = 0
train_Y = y_train
alpha = (train_Y == 0).sum() / train_Y.size
def logistic_obj(p, dtrain):
    """Custom XGBoost objective: gradient and hessian of the alpha-balanced focal loss
    L = -alpha*y*(1-p)^gamma*log(p) - (1-alpha)*(1-y)*p^gamma*log(1-p),
    differentiated with respect to the raw margin."""
    y = dtrain.get_label()
    p = 1.0 / (1.0 + np.exp(-p))  # margin -> probability
    # u = dL/dp, the derivative of the loss with respect to the probability.
    u = (alpha * gamma * y * (1 - p) ** gamma * np.log(p) / (1 - p)
         - alpha * y * (1 - p) ** gamma / p
         - gamma * p ** gamma * (1 - alpha) * (1 - y) * np.log(1 - p) / p
         + p ** gamma * (1 - alpha) * (1 - y) / (1 - p))
    # du = d(u)/dp, needed for the second derivative.
    du = (-alpha * gamma ** 2 * y * (1 - p) ** gamma * np.log(p) / (1 - p) ** 2
          + alpha * gamma * y * (1 - p) ** gamma * np.log(p) / (1 - p) ** 2
          + 2 * alpha * gamma * y * (1 - p) ** gamma / (p * (1 - p))
          + alpha * y * (1 - p) ** gamma / p ** 2
          - gamma ** 2 * p ** gamma * (1 - alpha) * (1 - y) * np.log(1 - p) / p ** 2
          + 2 * gamma * p ** gamma * (1 - alpha) * (1 - y) / (p * (1 - p))
          + gamma * p ** gamma * (1 - alpha) * (1 - y) * np.log(1 - p) / p ** 2
          + p ** gamma * (1 - alpha) * (1 - y) / (1 - p) ** 2)
    # Chain rule through the sigmoid: dp/dmargin = p*(1-p).
    grad = p * (1 - p) * u
    hess = p * (1 - p) * (p * (1 - p) * du + (1 - 2 * p) * u)
    return grad, hess
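
# Optional sanity check (a minimal sketch): compare the analytic gradient against a
# central finite difference of the focal loss itself,
#   L(x) = -alpha*y*(1-p)^gamma*log(p) - (1-alpha)*(1-y)*p^gamma*log(1-p),  p = sigmoid(x).
# _FakeDMatrix is a hypothetical stand-in so logistic_obj can be exercised on a
# plain array without building a real xgb.DMatrix.
class _FakeDMatrix:
    def __init__(self, label):
        self._label = np.asarray(label, dtype=float)
    def get_label(self):
        return self._label

def _focal_loss(x, y):
    p = 1.0 / (1.0 + np.exp(-x))
    return (-alpha * y * (1 - p) ** gamma * np.log(p)
            - (1 - alpha) * (1 - y) * p ** gamma * np.log(1 - p))

_x = np.array([-2.0, -0.5, 0.3, 1.7])   # arbitrary raw margins
_y = np.array([0.0, 1.0, 1.0, 0.0])     # arbitrary labels
_eps = 1e-6
_num_grad = (_focal_loss(_x + _eps, _y) - _focal_loss(_x - _eps, _y)) / (2 * _eps)
_ana_grad, _ = logistic_obj(_x, _FakeDMatrix(_y))
assert np.allclose(_num_grad, _ana_grad, atol=1e-5), "focal-loss gradient mismatch"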
def XGBoostAUC(p):
    """GA objective: decode the 17-dim vector p into XGBoost parameters,
    train one booster, and return the negated validation AUC (sko's GA minimizes)."""
    etas = [0.0001, 0.001, 0.01, 0.1]
    sampling_methods = ["uniform", "gradient_based"]
    w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15, w16, w17 = p
    # w1 and w6 are not mapped to anything: learning_rate is only an alias of eta,
    # and a second "subsample" key would silently shadow the first, so the learning
    # rate is searched through w8 and subsample through w11. w2 sets the number of
    # boosting rounds (passed to xgb.train below; xgb.train ignores n_estimators).
    params = {
        "max_depth": int(w3)
        , "min_child_weight": w4
        , "gamma": w5
        , "max_delta_step": w10
        , "subsample": w11
        , "sampling_method": sampling_methods[int(w12)]
        , "colsample_bytree": w13
        , "colsample_bylevel": w14
        , "colsample_bynode": w15
        , "lambda": w7
        , "eta": etas[int(w8)]
        , "max_leaves": int(w16)
        , "num_parallel_tree": int(w17)
        , "scale_pos_weight": (train_Y == 0).sum() / (train_Y == 1).sum()
        , "seed": int(w9)            # searching the seed mostly samples run-to-run variance
        , "nthread": 5
        , "verbosity": 1
        , "eval_metric": "auc"
        , "gpu_id": 0                # pre-2.0 GPU syntax; requires a CUDA build of XGBoost
        , "tree_method": "gpu_hist"
    }
    dtrain = xgb.DMatrix(x_train, label=y_train)
    clf = xgb.train(params=params
                    , dtrain=dtrain
                    , num_boost_round=int(w2)
                    , evals=[(dtrain, "train")]
                    , verbose_eval=False
                    , obj=logistic_obj
                    )
    dtest = xgb.DMatrix(x_val, label=y_val)
    # With a custom objective, predict() returns raw margins rather than
    # probabilities; AUC is unaffected because the sigmoid is monotone.
    lr_proba = clf.predict(dtest)
    lr_proba = np.nan_to_num(lr_proba, nan=0)
    fpr, tpr, threshold = metrics.roc_curve(y_val, lr_proba)
    roc_auc = metrics.auc(fpr, tpr)
    # Drop per-evaluation objects so repeated GA calls do not accumulate memory.
    dtrain = clf = dtest = lr_proba = None
    fpr, tpr, threshold = None, None, None
    log(f"AUC for this iteration: [{roc_auc}], parameter vector: [{p}]", LogLevel.PASS)
    return -roc_auc  # the GA minimizes, so return the negated AUC
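
# Optional smoke test (a sketch): evaluate the objective once with a mid-range
# vector before launching the GA, so data or GPU problems surface immediately
# instead of one generation into the search. The values below are arbitrary
# points inside the lb/ub bounds used further down.
smoke_p = [0.3, 10, 6, 1, 0.1, 0.8, 1, 2, 42, 1, 0.8, 0, 0.8, 0.8, 0.8, 8, 1]
print("smoke-test AUC:", -XGBoostAUC(smoke_p))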
# One (lb, ub, precision) triple per dimension w1..w17; precision=1 makes a
# dimension effectively integer-valued.
ga = GA(func=XGBoostAUC
        , n_dim=17
        , size_pop=10
        , max_iter=5
        , prob_mut=0.01
        , lb=[0.1, 5, 1, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 1]
        , ub=[1, 20, 20, 100, 1, 1, 100, 3, 100, 10, 1, 1, 1, 1, 1, 10, 10]
        , precision=[0.1, 1, 1, 0.1, 0.1, 0.1, 0.1, 1, 1, 0.1, 0.1, 1, 0.1, 0.1, 0.1, 1, 1]
        )
best_x, best_y = ga.run()
print('best_x:', best_x, '\n', 'best_y:', best_y)
opt_x_log = pd.DataFrame({
    "best_x": [best_x]
    , "best_y": [best_y]
})
print(f"Optimization result table: {opt_x_log}")
opt_x_log.to_csv("best_x2.csv")
w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15, w16, w17 = best_x
etas = [0.0001, 0.001, 0.01, 0.1]
sampling_methods = ["uniform", "gradient_based"]
# Rebuild the winning parameter set exactly as XGBoostAUC decodes it
# (w1 and w6 are unmapped there as well; w2 is the number of boosting rounds).
params = {
    "max_depth": int(w3)
    , "min_child_weight": w4
    , "gamma": w5
    , "max_delta_step": w10
    , "subsample": w11
    , "sampling_method": sampling_methods[int(w12)]
    , "colsample_bytree": w13
    , "colsample_bylevel": w14
    , "colsample_bynode": w15
    , "lambda": w7
    , "eta": etas[int(w8)]
    , "max_leaves": int(w16)
    , "num_parallel_tree": int(w17)
    , "scale_pos_weight": (train_Y == 0).sum() / (train_Y == 1).sum()
    , "seed": int(w9)
    , "nthread": 5
    , "verbosity": 1
    , "eval_metric": "auc"
    , "gpu_id": 0
    , "tree_method": "gpu_hist"
}
params.update({"num_boost_round": int(w2), "best_auc": -best_y})  # best_y is the minimized -AUC
best_params_table = pd.DataFrame({k:[v] for k,v in params.items()})
best_params_table.to_csv("best_params_table.csv")
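
# Final fit (a sketch, not part of the original flow): retrain on the full training
# set with the winning parameters and score test_v2.csv, which is loaded above but
# otherwise unused. Assumes the test file carries the same feature columns as the
# training file; the output file name 'test_pred.csv' is arbitrary.
final_params = {k: v for k, v in params.items() if k not in ("best_auc", "num_boost_round")}
dtrain_full = xgb.DMatrix(x, label=y)
final_clf = xgb.train(params=final_params
                      , dtrain=dtrain_full
                      , num_boost_round=int(w2)
                      , obj=logistic_obj
                      )
x_test = test_df.drop(['user_id', 'merchant_id', 'label'], axis=1, errors='ignore')
test_margin = final_clf.predict(xgb.DMatrix(x_test))
test_proba = 1.0 / (1.0 + np.exp(-test_margin))  # custom obj => predict() returns raw margins
pd.DataFrame({'prob': test_proba}).to_csv('test_pred.csv', index=False)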