模型训练xgb

MusicDancing

已于 2023-07-31 23:10:00 修改

阅读量1k

点赞数

CC 4.0 BY-SA版权

分类专栏：机器学习工程　文章标签：机器学习

于 2021-08-25 12:41:44 首次发布

本文链接：https://blog.youkuaiyun.com/MusicDancing/article/details/119908815

机器学习工程专栏收录该内容

9 篇文章

订阅专栏

这段代码展示了如何使用XGBoost进行二分类任务的建模。首先，从数据集中提取关键特征并进行预处理，然后通过训练集和测试集划分数据。接着，配置XGBoost参数并训练模型，最后评估模型性能，包括准确率、召回率、F1分数等，并输出特征重要性。整个过程涉及数据读取、模型训练、评估和特征选择。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

1. model pipeline 拆解

# -*- coding: utf-8 -*-
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Columns of the input file that are fed to the model, in training order.
feature_list = ["chat_7d_cnt", "chat_cnt_self_expression"]
# Columns dropped from the loaded frame before features/label are selected.
drop_cols = ["uid", "random", "2d_retention"]
# Name of the binary target column.
LABEL = "is_later_30d_loss"
# Where the trained booster is saved to and reloaded from.
model_file_path = "model.txt"
# Tab-separated input data file.
data_path = "data.txt"

# Booster parameters passed to xgb.train().
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 10,
    'lambda': 10,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'min_child_weight': 2,
    'eta': 0.1,
    'seed': 0,
    'nthread': 8,
    # 'silent' was removed from XGBoost and only produces an "unused
    # parameter" warning; 'verbosity': 0 is the supported way to keep the
    # library quiet.
    'verbosity': 0,
}


if __name__ == "__main__":
    # Full pipeline: split the data, fit and save the model, then report
    # evaluation metrics and feature importances from the saved model.
    dataset = get_dataset(data_path)
    X_train, X_test, y_train, y_test = dataset
    train(X_tr=X_train, y_tr=y_train, X_te=X_test, y_te=y_test)
    eval(X_test=X_test, y_test=y_test)
    print_feature_important()

1.1 获取数据集

def get_dataset(data_path):
    """Load the tab-separated data file and split it into train/test arrays.

    The columns listed in ``drop_cols`` are removed, missing values are
    replaced with 0 and values are rounded to two decimals. Features are
    taken from ``feature_list`` and the target from ``LABEL``.

    Args:
        data_path: Path of the tab-separated input file.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)`` from a 70/30 split
        with ``random_state=0``.

    Raises:
        KeyError: If a column in ``drop_cols``, ``feature_list`` or ``LABEL``
            is missing from the file.
    """
    data = pd.read_csv(data_path, sep='\t')
    data = data.drop(drop_cols, axis=1)
    data = data.fillna(0)
    # DataFrame.round returns a new frame; the result has to be assigned,
    # otherwise the rounding silently never happens.
    data = data.round(decimals=2)
    X = data[feature_list].values
    y = data[LABEL].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)
    return X_train, X_test, y_train, y_test

1.2 模型训练

def train(X_tr, y_tr, X_te, y_te):
    """Train a booster with ``params`` and save it to ``model_file_path``.

    Both splits are passed as evaluation sets, so the configured
    ``eval_metric`` is reported for 'train' and 'test' during training.

    Args:
        X_tr: Training feature matrix.
        y_tr: Training labels.
        X_te: Held-out feature matrix, used only for evaluation logging.
        y_te: Held-out labels.
    """
    xg_train = xgb.DMatrix(X_tr, label=y_tr)
    xg_test = xgb.DMatrix(X_te, label=y_te)
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    num_round = 50
    # `evals` is keyword-only in current XGBoost releases; passing it by
    # name works on old and new versions alike.
    bst = xgb.train(params, xg_train, num_boost_round=num_round, evals=watchlist)
    bst.save_model(model_file_path)

1.3 模型评估

def eval(X_test, y_test):
    """Evaluate the saved model on a held-out set and print the metrics.

    Loads the booster from ``model_file_path``, predicts probabilities for
    ``X_test`` and prints the classification error, AUC, accuracy, recall,
    F1, precision and the confusion matrix at a 0.5 threshold.

    Note: the name shadows the built-in ``eval``; it is kept so existing
    callers continue to work.

    Args:
        X_test: Feature matrix (array-like) or an already built
            ``xgb.DMatrix``.
        y_test: Ground-truth binary labels aligned with ``X_test``.
    """
    xgb_model = xgb.Booster(model_file=model_file_path)
    # Booster.predict only accepts a DMatrix; a raw array raises TypeError.
    dtest = X_test if isinstance(X_test, xgb.DMatrix) else xgb.DMatrix(X_test)
    pred = xgb_model.predict(dtest)

    # Threshold the probabilities once and reuse the labels everywhere.
    # int(probability) would truncate to 0 and count every positive as an
    # error regardless of the prediction.
    y_pred = (pred >= 0.5) * 1

    mistakes = sum(int(p) != t for p, t in zip(y_pred, y_test))
    print ('predicting, classification error=%f'
           % (mistakes / float(len(y_test))))

    print ('AUC: %.4f' % metrics.roc_auc_score(y_test, pred))
    print ('ACC: %.4f' % metrics.accuracy_score(y_test, y_pred))
    print ('Recall: %.4f' % metrics.recall_score(y_test, y_pred))
    print ('F1-score: %.4f' % metrics.f1_score(y_test, y_pred))
    print ('Precesion: %.4f' % metrics.precision_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))

1.4 打印特征重要度

def print_feature_important():
    """Print gain-based feature importances of the saved model.

    Loads the booster from ``model_file_path``, prints the raw
    ``(key, gain)`` pairs sorted by gain, then one line per feature with the
    key translated to its name in ``feature_list``.

    Features that were never used in a split are not returned by
    ``get_score`` and therefore are not printed.
    """
    xgb_model = xgb.Booster(model_file=model_file_path)
    importance = xgb_model.get_score(importance_type='gain')
    sorted_importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)
    print('feature importances[gain]:')
    print(sorted_importance)

    # A model trained on a plain array has no feature names, so XGBoost
    # reports keys 'f0', 'f1', ... by column index. Map them back by key,
    # not by rank: zipping feature_list with the sorted pairs would
    # mislabel features and call round() on a tuple.
    index_to_name = {'f%d' % i: name for i, name in enumerate(feature_list)}
    for key, gain in sorted_importance:
        name = index_to_name.get(key, key)
        print('Feature:\t{}\t{}'.format(name, round(gain, 2)))

1.5 模型预测

def predict(X_test):
    """Predict with the model saved at ``model_file_path``.

    Args:
        X_test: Feature matrix (array-like) or an already built
            ``xgb.DMatrix``.

    Returns:
        The array returned by ``Booster.predict`` for ``X_test``.
    """
    xgb_model = xgb.Booster(model_file=model_file_path)
    # Booster.predict only accepts a DMatrix; a raw array raises TypeError.
    dtest = X_test if isinstance(X_test, xgb.DMatrix) else xgb.DMatrix(X_test)
    pred = xgb_model.predict(dtest)
    # The original computed the predictions but dropped them.
    return pred

2. 代码汇总

# -*- coding: utf-8 -*-
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Columns of the input file that are fed to the model, in training order.
feature_list = ["chat_7d_cnt", "chat_cnt_self_expression"]
# Columns dropped from the loaded frame before features/label are selected.
drop_cols = ["uid", "random", "2d_retention"]
# Name of the binary target column.
LABEL = "is_later_30d_loss"
# Where the trained booster is saved to and reloaded from.
model_file_path = "model.txt"
# Tab-separated input data file.
data_path = "data.txt"

# Booster parameters passed to xgb.train().
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 10,
    'lambda': 10,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'min_child_weight': 2,
    'eta': 0.1,
    'seed': 0,
    'nthread': 8,
    # 'silent' was removed from XGBoost and only produces an "unused
    # parameter" warning; 'verbosity': 0 is the supported way to keep the
    # library quiet.
    'verbosity': 0,
}


def get_dataset(data_path):
    """Load the tab-separated data file and split it into train/test arrays.

    The columns listed in ``drop_cols`` are removed, missing values are
    replaced with 0 and values are rounded to two decimals. Features are
    taken from ``feature_list`` and the target from ``LABEL``.

    Args:
        data_path: Path of the tab-separated input file.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)`` from a 70/30 split
        with ``random_state=0``.

    Raises:
        KeyError: If a column in ``drop_cols``, ``feature_list`` or ``LABEL``
            is missing from the file.
    """
    data = pd.read_csv(data_path, sep='\t')
    data = data.drop(drop_cols, axis=1)
    data = data.fillna(0)
    # DataFrame.round returns a new frame; the result has to be assigned,
    # otherwise the rounding silently never happens.
    data = data.round(decimals=2)
    X = data[feature_list].values
    y = data[LABEL].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)
    return X_train, X_test, y_train, y_test


def train(X_tr, y_tr, X_te, y_te):
    """Train a booster with ``params`` and save it to ``model_file_path``.

    Both splits are passed as evaluation sets, so the configured
    ``eval_metric`` is reported for 'train' and 'test' during training.

    Args:
        X_tr: Training feature matrix.
        y_tr: Training labels.
        X_te: Held-out feature matrix, used only for evaluation logging.
        y_te: Held-out labels.
    """
    xg_train = xgb.DMatrix(X_tr, label=y_tr)
    xg_test = xgb.DMatrix(X_te, label=y_te)
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    num_round = 50
    # `evals` is keyword-only in current XGBoost releases; passing it by
    # name works on old and new versions alike.
    bst = xgb.train(params, xg_train, num_boost_round=num_round, evals=watchlist)
    bst.save_model(model_file_path)


def eval(X_test, y_test):
    """Evaluate the saved model on a held-out set and print the metrics.

    Loads the booster from ``model_file_path``, predicts probabilities for
    ``X_test`` and prints the classification error, AUC, accuracy, recall,
    F1, precision and the confusion matrix at a 0.5 threshold.

    Note: the name shadows the built-in ``eval``; it is kept so existing
    callers continue to work.

    Args:
        X_test: Feature matrix (array-like) or an already built
            ``xgb.DMatrix``.
        y_test: Ground-truth binary labels aligned with ``X_test``.
    """
    xgb_model = xgb.Booster(model_file=model_file_path)
    # Booster.predict only accepts a DMatrix; a raw array raises TypeError.
    dtest = X_test if isinstance(X_test, xgb.DMatrix) else xgb.DMatrix(X_test)
    pred = xgb_model.predict(dtest)

    # Threshold the probabilities once and reuse the labels everywhere.
    # int(probability) would truncate to 0 and count every positive as an
    # error regardless of the prediction.
    y_pred = (pred >= 0.5) * 1

    mistakes = sum(int(p) != t for p, t in zip(y_pred, y_test))
    print ('predicting, classification error=%f'
           % (mistakes / float(len(y_test))))

    print ('AUC: %.4f' % metrics.roc_auc_score(y_test, pred))
    print ('ACC: %.4f' % metrics.accuracy_score(y_test, y_pred))
    print ('Recall: %.4f' % metrics.recall_score(y_test, y_pred))
    print ('F1-score: %.4f' % metrics.f1_score(y_test, y_pred))
    print ('Precesion: %.4f' % metrics.precision_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))


def print_feature_important():
    """Print gain-based feature importances of the saved model.

    Loads the booster from ``model_file_path``, prints the raw
    ``(key, gain)`` pairs sorted by gain, then one line per feature with the
    key translated to its name in ``feature_list``.

    Features that were never used in a split are not returned by
    ``get_score`` and therefore are not printed.
    """
    xgb_model = xgb.Booster(model_file=model_file_path)
    importance = xgb_model.get_score(importance_type='gain')
    sorted_importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)
    print('feature importances[gain]:')
    print(sorted_importance)

    # A model trained on a plain array has no feature names, so XGBoost
    # reports keys 'f0', 'f1', ... by column index. Map them back by key,
    # not by rank: zipping feature_list with the sorted pairs would
    # mislabel features and call round() on a tuple.
    index_to_name = {'f%d' % i: name for i, name in enumerate(feature_list)}
    for key, gain in sorted_importance:
        name = index_to_name.get(key, key)
        print('Feature:\t{}\t{}'.format(name, round(gain, 2)))


if __name__ == "__main__":
    # Full pipeline: split the data, fit and save the model, then report
    # evaluation metrics and feature importances from the saved model.
    dataset = get_dataset(data_path)
    X_train, X_test, y_train, y_test = dataset
    train(X_tr=X_train, y_tr=y_train, X_te=X_test, y_te=y_test)
    eval(X_test=X_test, y_test=y_test)
    print_feature_important()