1. model pipeline 拆解
# -*- coding: utf-8 -*-
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn import metrics
# Columns of the input table that are fed to the model as features.
feature_list = ["chat_7d_cnt", "chat_cnt_self_expression"]
# Columns removed from the table right after loading.
drop_cols = ["uid", "random", "2d_retention"]
# Name of the binary target column.
LABEL = "is_later_30d_loss"
# Where the trained booster is saved and later reloaded from.
model_file_path = "model.txt"
# Tab-separated input data file.
data_path = "data.txt"

# XGBoost training parameters (native `xgb.train` API).
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 10,
    'lambda': 10,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'min_child_weight': 2,
    'eta': 0.1,
    'seed': 0,
    'nthread': 8,
    # 'silent' was deprecated and later removed from XGBoost; 'verbosity': 0
    # is the supported way to request silent training.
    'verbosity': 0,
}
if __name__ == "__main__":
    # End-to-end run: split the data, fit and persist the model, evaluate it
    # on the held-out split, then report the per-feature gain.
    X_train, X_test, y_train, y_test = get_dataset(data_path)
    train(X_tr=X_train, y_tr=y_train, X_te=X_test, y_te=y_test)
    eval(X_test=X_test, y_test=y_test)
    print_feature_important()
1.1 获取数据集
def get_dataset(data_path):
    """Load the tab-separated dataset and split it into train/test arrays.

    The columns listed in ``drop_cols`` are removed, missing values are
    replaced with 0 and values are rounded to two decimals. The
    ``feature_list`` columns become the feature matrix and the ``LABEL``
    column becomes the target vector.

    Args:
        data_path: Path to a tab-separated file whose header row names the
            columns.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays, with
        30% of the rows held out for testing (fixed ``random_state=0``).
    """
    data = pd.read_csv(data_path, sep='\t')
    data = data.drop(drop_cols, axis=1)
    data = data.fillna(0)
    # DataFrame.round returns a new frame (it has no in-place mode), so the
    # result must be assigned or the rounding is silently lost.
    data = data.round(decimals=2)
    X = data[feature_list].values
    y = data[LABEL].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)
    return X_train, X_test, y_train, y_test
1.2 模型训练
def train(X_tr, y_tr, X_te, y_te):
    """Train an XGBoost booster and save it to ``model_file_path``.

    Both splits are passed as the evaluation watchlist, so the metric
    configured in ``params`` is reported for each of them during training.

    Args:
        X_tr: Training feature matrix.
        y_tr: Training labels.
        X_te: Held-out feature matrix (used for monitoring only).
        y_te: Held-out labels.
    """
    xg_train = xgb.DMatrix(X_tr, label=y_tr)
    xg_test = xgb.DMatrix(X_te, label=y_te)
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    num_round = 50
    # `evals` is keyword-only in current XGBoost releases; passing it
    # positionally triggers a deprecation warning.
    bst = xgb.train(params, xg_train, num_boost_round=num_round,
                    evals=watchlist)
    bst.save_model(model_file_path)
1.3 模型评估
def eval(X_test, y_test):
    """Evaluate the saved model on a held-out set and print the metrics.

    Loads the booster from ``model_file_path``, scores ``X_test`` and prints
    the classification error, AUC, accuracy, recall, F1, precision and the
    confusion matrix. Scores are turned into class labels with a 0.5
    threshold.

    NOTE: the function name shadows the ``eval`` builtin; it is kept so that
    existing callers continue to work.

    Args:
        X_test: Feature matrix (array-like) or an ``xgb.DMatrix``.
        y_test: True binary labels aligned with ``X_test``.
    """
    xgb_model = xgb.Booster(model_file=model_file_path)
    # Booster.predict only accepts a DMatrix; wrap raw arrays first.
    if not isinstance(X_test, xgb.DMatrix):
        X_test = xgb.DMatrix(X_test)
    pred = xgb_model.predict(X_test)
    y_pred = (pred >= 0.5) * 1
    # Compare thresholded labels: int(score) would truncate every score
    # below 1.0 to 0 and report the positive rate instead of the error rate.
    error_rate = sum(p != t for p, t in zip(y_pred, y_test)) / float(len(y_test))
    print('predicting, classification error=%f' % error_rate)
    print('AUC: %.4f' % metrics.roc_auc_score(y_test, pred))
    print('ACC: %.4f' % metrics.accuracy_score(y_test, y_pred))
    print('Recall: %.4f' % metrics.recall_score(y_test, y_pred))
    print('F1-score: %.4f' % metrics.f1_score(y_test, y_pred))
    print('Precision: %.4f' % metrics.precision_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))
1.4 打印特征重要度
def print_feature_important():
    """Print the gain-based feature importances of the saved model.

    First prints the raw ``get_score`` result sorted by gain, then one line
    per entry of ``feature_list`` with its gain rounded to two decimals,
    sorted from most to least important.
    """
    xgb_model = xgb.Booster(model_file=model_file_path)
    importance = xgb_model.get_score(importance_type='gain')
    sorted_importance = sorted(importance.items(), key=lambda kv: kv[1],
                               reverse=True)
    print('feature importances[gain]:')
    print(sorted_importance)

    # A booster trained on a plain array reports features as 'f0', 'f1', ...
    # in column order, and omits features that were never used in a split.
    # Look each feature up by its positional key (falling back to its real
    # name, then to 0.0) instead of zipping names with the sorted items,
    # which mispaired them and passed a tuple to round().
    feature_importance_pair = []
    for idx, name in enumerate(feature_list):
        gain = importance.get('f%d' % idx, importance.get(name, 0.0))
        feature_importance_pair.append((name, round(gain, 2)))
    feature_importance_pair.sort(key=lambda pair: pair[1], reverse=True)
    for pair in feature_importance_pair:
        print('Feature:\t{}\t{}'.format(*pair))
1.5 模型预测
def predict(X_test):
    """Score ``X_test`` with the model saved at ``model_file_path``.

    Args:
        X_test: Feature matrix (array-like) or an ``xgb.DMatrix``.

    Returns:
        The array returned by ``Booster.predict`` (one score per row; with
        the ``binary:logistic`` objective set in ``params`` these are
        probabilities of the positive class).
    """
    xgb_model = xgb.Booster(model_file=model_file_path)
    # Booster.predict only accepts a DMatrix; wrap raw arrays first.
    if not isinstance(X_test, xgb.DMatrix):
        X_test = xgb.DMatrix(X_test)
    # The predictions were previously computed and then discarded.
    return xgb_model.predict(X_test)
2. 代码汇总
# -*- coding: utf-8 -*-
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn import metrics
# Columns of the input table that are fed to the model as features.
feature_list = ["chat_7d_cnt", "chat_cnt_self_expression"]
# Columns removed from the table right after loading.
drop_cols = ["uid", "random", "2d_retention"]
# Name of the binary target column.
LABEL = "is_later_30d_loss"
# Where the trained booster is saved and later reloaded from.
model_file_path = "model.txt"
# Tab-separated input data file.
data_path = "data.txt"

# XGBoost training parameters (native `xgb.train` API).
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 10,
    'lambda': 10,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'min_child_weight': 2,
    'eta': 0.1,
    'seed': 0,
    'nthread': 8,
    # 'silent' was deprecated and later removed from XGBoost; 'verbosity': 0
    # is the supported way to request silent training.
    'verbosity': 0,
}
def get_dataset(data_path):
    """Load the tab-separated dataset and split it into train/test arrays.

    The columns listed in ``drop_cols`` are removed, missing values are
    replaced with 0 and values are rounded to two decimals. The
    ``feature_list`` columns become the feature matrix and the ``LABEL``
    column becomes the target vector.

    Args:
        data_path: Path to a tab-separated file whose header row names the
            columns.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays, with
        30% of the rows held out for testing (fixed ``random_state=0``).
    """
    data = pd.read_csv(data_path, sep='\t')
    data = data.drop(drop_cols, axis=1)
    data = data.fillna(0)
    # DataFrame.round returns a new frame (it has no in-place mode), so the
    # result must be assigned or the rounding is silently lost.
    data = data.round(decimals=2)
    X = data[feature_list].values
    y = data[LABEL].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)
    return X_train, X_test, y_train, y_test
def train(X_tr, y_tr, X_te, y_te):
    """Train an XGBoost booster and save it to ``model_file_path``.

    Both splits are passed as the evaluation watchlist, so the metric
    configured in ``params`` is reported for each of them during training.

    Args:
        X_tr: Training feature matrix.
        y_tr: Training labels.
        X_te: Held-out feature matrix (used for monitoring only).
        y_te: Held-out labels.
    """
    xg_train = xgb.DMatrix(X_tr, label=y_tr)
    xg_test = xgb.DMatrix(X_te, label=y_te)
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    num_round = 50
    # `evals` is keyword-only in current XGBoost releases; passing it
    # positionally triggers a deprecation warning.
    bst = xgb.train(params, xg_train, num_boost_round=num_round,
                    evals=watchlist)
    bst.save_model(model_file_path)
def eval(X_test, y_test):
    """Evaluate the saved model on a held-out set and print the metrics.

    Loads the booster from ``model_file_path``, scores ``X_test`` and prints
    the classification error, AUC, accuracy, recall, F1, precision and the
    confusion matrix. Scores are turned into class labels with a 0.5
    threshold.

    NOTE: the function name shadows the ``eval`` builtin; it is kept so that
    existing callers continue to work.

    Args:
        X_test: Feature matrix (array-like) or an ``xgb.DMatrix``.
        y_test: True binary labels aligned with ``X_test``.
    """
    xgb_model = xgb.Booster(model_file=model_file_path)
    # Booster.predict only accepts a DMatrix; wrap raw arrays first.
    if not isinstance(X_test, xgb.DMatrix):
        X_test = xgb.DMatrix(X_test)
    pred = xgb_model.predict(X_test)
    y_pred = (pred >= 0.5) * 1
    # Compare thresholded labels: int(score) would truncate every score
    # below 1.0 to 0 and report the positive rate instead of the error rate.
    error_rate = sum(p != t for p, t in zip(y_pred, y_test)) / float(len(y_test))
    print('predicting, classification error=%f' % error_rate)
    print('AUC: %.4f' % metrics.roc_auc_score(y_test, pred))
    print('ACC: %.4f' % metrics.accuracy_score(y_test, y_pred))
    print('Recall: %.4f' % metrics.recall_score(y_test, y_pred))
    print('F1-score: %.4f' % metrics.f1_score(y_test, y_pred))
    print('Precision: %.4f' % metrics.precision_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))
def print_feature_important():
    """Print the gain-based feature importances of the saved model.

    First prints the raw ``get_score`` result sorted by gain, then one line
    per entry of ``feature_list`` with its gain rounded to two decimals,
    sorted from most to least important.
    """
    xgb_model = xgb.Booster(model_file=model_file_path)
    importance = xgb_model.get_score(importance_type='gain')
    sorted_importance = sorted(importance.items(), key=lambda kv: kv[1],
                               reverse=True)
    print('feature importances[gain]:')
    print(sorted_importance)

    # A booster trained on a plain array reports features as 'f0', 'f1', ...
    # in column order, and omits features that were never used in a split.
    # Look each feature up by its positional key (falling back to its real
    # name, then to 0.0) instead of zipping names with the sorted items,
    # which mispaired them and passed a tuple to round().
    feature_importance_pair = []
    for idx, name in enumerate(feature_list):
        gain = importance.get('f%d' % idx, importance.get(name, 0.0))
        feature_importance_pair.append((name, round(gain, 2)))
    feature_importance_pair.sort(key=lambda pair: pair[1], reverse=True)
    for pair in feature_importance_pair:
        print('Feature:\t{}\t{}'.format(*pair))
if __name__ == "__main__":
    # End-to-end run: split the data, fit and persist the model, evaluate it
    # on the held-out split, then report the per-feature gain.
    X_train, X_test, y_train, y_test = get_dataset(data_path)
    train(X_tr=X_train, y_tr=y_train, X_te=X_test, y_te=y_test)
    eval(X_test=X_test, y_test=y_test)
    print_feature_important()