Model Training with XGBoost

This post walks through building a binary classifier with XGBoost. The pipeline first reads the dataset, extracts the key features, and applies light preprocessing; it then splits the data into training and test sets, configures the XGBoost parameters, and trains the model. Finally, it evaluates the model with metrics such as accuracy, recall, and F1 score, and prints the feature importances. The whole flow covers data loading, model training, evaluation, and feature selection.


1. Model pipeline breakdown

# -*- coding: utf-8 -*-
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn import metrics

feature_list = ["chat_7d_cnt", "chat_cnt_self_expression"]
drop_cols = ["uid", "random", "2d_retention"]
LABEL = "is_later_30d_loss"
model_file_path = "model.txt"
data_path = "data.txt"

params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',  # binary classification, outputs probabilities
    'eval_metric': 'auc',
    'max_depth': 10,                 # maximum depth of each tree
    'lambda': 10,                    # L2 regularization weight
    'subsample': 0.85,               # row sampling ratio per tree
    'colsample_bytree': 0.85,        # column sampling ratio per tree
    'min_child_weight': 2,           # minimum hessian sum required in a child node
    'eta': 0.1,                      # learning rate
    'seed': 0,
    'nthread': 8,
    'verbosity': 0                   # 'silent' was removed in XGBoost 1.0+; use 'verbosity'
}
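
If the churn label is heavily imbalanced, a hedged addition to the params above is `scale_pos_weight`; this sketch assumes `y_train` from the split in `get_dataset` (Section 1.1):

# Sketch: weight positives by the negative/positive count ratio so the
# loss does not effectively ignore a rare positive class.
neg_cnt = int((y_train == 0).sum())
pos_cnt = int((y_train == 1).sum())
params['scale_pos_weight'] = neg_cnt / float(pos_cnt)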


if __name__ == "__main__":
    X_train, X_test, y_train, y_test = get_dataset(data_path)
    train(X_train, y_train, X_test, y_test)
    evaluate(X_test, y_test)
    print_feature_important()

1.1 Load the dataset

def get_dataset(data_path):
    data = pd.read_csv(data_path, sep='\t')
    data.drop(drop_cols, axis=1, inplace=True)  # drop id / bookkeeping columns
    data.fillna(0, inplace=True)                # impute missing values with 0
    data = data.round(decimals=2)               # round() returns a copy, so reassign
    X = data[feature_list].values
    y = data[LABEL].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    return X_train, X_test, y_train, y_test
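
Churn-style labels are often imbalanced, so the plain split above can leave the train and test sets with different positive rates. A minimal sketch using scikit-learn's `stratify` option, with the same `X` and `y` as in `get_dataset`:

# Sketch: a stratified split keeps the label distribution identical
# in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y)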

1.2 Model training

def train(X_tr, y_tr, X_te, y_te):
    xg_train = xgb.DMatrix(X_tr, label=y_tr)
    xg_test = xgb.DMatrix(X_te, label=y_te)
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]  # AUC printed for both each round
    num_round = 50                                        # number of boosting rounds
    bst = xgb.train(params, xg_train, num_round, watchlist)
    bst.save_model(model_file_path)
    # With the sklearn wrapper you would call clf.get_booster().save_model(...) instead.
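
The fixed `num_round = 50` is a guess. A hedged sketch of early stopping, reusing `params` and the DMatrix objects above, lets the test-set AUC pick the round count instead:

# Sketch: train up to 500 rounds, but stop once the test AUC has not
# improved for 20 consecutive rounds.
bst = xgb.train(params, xg_train, num_boost_round=500,
                evals=watchlist, early_stopping_rounds=20)
print('best iteration:', bst.best_iteration)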

1.3 Model evaluation

def evaluate(X_test, y_test):  # named 'evaluate' to avoid shadowing Python's builtin eval
    xgb_model = xgb.Booster(model_file=model_file_path)
    dtest = xgb.DMatrix(X_test)       # Booster.predict requires a DMatrix
    pred = xgb_model.predict(dtest)   # predicted probabilities of the positive class
    # pred = xgb_model.predict(dtest, pred_leaf=True)  # leaf indices instead of probabilities

    y_pred = (pred >= 0.5) * 1        # threshold probabilities at 0.5
    error = sum(int(y_pred[i] != y_test[i]) for i in range(len(y_test))) / float(len(y_test))
    print('predicting, classification error=%f' % error)

    print('AUC: %.4f' % metrics.roc_auc_score(y_test, pred))
    print('ACC: %.4f' % metrics.accuracy_score(y_test, y_pred))
    print('Recall: %.4f' % metrics.recall_score(y_test, y_pred))
    print('F1-score: %.4f' % metrics.f1_score(y_test, y_pred))
    print('Precision: %.4f' % metrics.precision_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))
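
The 0.5 cutoff is arbitrary. A minimal sketch of sweeping the threshold, reusing `pred` and `y_test` from `evaluate`; in practice the threshold should be tuned on a validation split rather than the final test set:

import numpy as np
from sklearn.metrics import precision_recall_curve

# Sketch: pick the probability cutoff that maximizes F1.
precision, recall, thresholds = precision_recall_curve(y_test, pred)
f1 = 2 * precision * recall / (precision + recall + 1e-12)
best = np.argmax(f1[:-1])  # the last precision/recall point has no threshold
print('best threshold: %.3f, F1: %.4f' % (thresholds[best], f1[best]))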

1.4 Print feature importance

def print_feature_important():
    xgb_model = xgb.Booster(model_file=model_file_path)
    importance = xgb_model.get_score(importance_type='gain')
    sorted_importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)
    print('feature importances[gain]:')
    print(sorted_importance)

    # The DMatrix was built from a bare numpy array, so the keys are 'f0', 'f1', ...;
    # map them back to the column names in feature_list by index.
    feature_importance_pair = [(feature_list[int(name[1:])], round(score, 2))
                               for name, score in sorted_importance]
    for pair in feature_importance_pair:
        print('Feature:\t{}\t{}'.format(*pair))
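
A simpler alternative, sketched below, is to attach the real column names when the DMatrix is built, so `get_score` returns them directly (same `feature_list` and split as above):

# Sketch: pass feature_names to the DMatrix so importance keys are
# the real column names instead of 'f0', 'f1', ...
xg_train = xgb.DMatrix(X_train, label=y_train, feature_names=feature_list)
bst = xgb.train(params, xg_train, 50)
print(bst.get_score(importance_type='gain'))  # e.g. {'chat_7d_cnt': ...}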

1.5 Model prediction

def predict(X_test):
    xgb_model = xgb.Booster(model_file=model_file_path)
    dtest = xgb.DMatrix(X_test)      # wrap raw features in a DMatrix
    pred = xgb_model.predict(dtest)  # probabilities of the positive class
    return pred
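
A hedged usage sketch; `new_users.txt` is a hypothetical file with the same tab-separated feature columns as the training data:

# Hypothetical scoring run on fresh data (file name is an assumption).
new_data = pd.read_csv('new_users.txt', sep='\t')
new_data.fillna(0, inplace=True)
scores = predict(new_data[feature_list].values)
flagged = scores >= 0.5  # users predicted to churn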

2. Complete code

# -*- coding: utf-8 -*-
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn import metrics

feature_list = ["chat_7d_cnt", "chat_cnt_self_expression"]
drop_cols = ["uid", "random", "2d_retention"]
LABEL = "is_later_30d_loss"
model_file_path = "model.txt"
data_path = "data.txt"

params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',  # binary classification, outputs probabilities
    'eval_metric': 'auc',
    'max_depth': 10,                 # maximum depth of each tree
    'lambda': 10,                    # L2 regularization weight
    'subsample': 0.85,               # row sampling ratio per tree
    'colsample_bytree': 0.85,        # column sampling ratio per tree
    'min_child_weight': 2,           # minimum hessian sum required in a child node
    'eta': 0.1,                      # learning rate
    'seed': 0,
    'nthread': 8,
    'verbosity': 0                   # 'silent' was removed in XGBoost 1.0+; use 'verbosity'
}


def get_dataset(data_path):
    data = pd.read_csv(data_path, sep='\t')
    data.drop(drop_cols, axis=1, inplace=True)  # drop id / bookkeeping columns
    data.fillna(0, inplace=True)                # impute missing values with 0
    data = data.round(decimals=2)               # round() returns a copy, so reassign
    X = data[feature_list].values
    y = data[LABEL].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    return X_train, X_test, y_train, y_test


def train(X_tr, y_tr, X_te, y_te):
    xg_train = xgb.DMatrix(X_tr, label=y_tr)
    xg_test = xgb.DMatrix(X_te, label=y_te)
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]  # AUC printed for both each round
    num_round = 50                                        # number of boosting rounds
    bst = xgb.train(params, xg_train, num_round, watchlist)
    bst.save_model(model_file_path)
    # With the sklearn wrapper you would call clf.get_booster().save_model(...) instead.


def evaluate(X_test, y_test):  # named 'evaluate' to avoid shadowing Python's builtin eval
    xgb_model = xgb.Booster(model_file=model_file_path)
    dtest = xgb.DMatrix(X_test)       # Booster.predict requires a DMatrix
    pred = xgb_model.predict(dtest)   # predicted probabilities of the positive class

    y_pred = (pred >= 0.5) * 1        # threshold probabilities at 0.5
    error = sum(int(y_pred[i] != y_test[i]) for i in range(len(y_test))) / float(len(y_test))
    print('predicting, classification error=%f' % error)

    print('AUC: %.4f' % metrics.roc_auc_score(y_test, pred))
    print('ACC: %.4f' % metrics.accuracy_score(y_test, y_pred))
    print('Recall: %.4f' % metrics.recall_score(y_test, y_pred))
    print('F1-score: %.4f' % metrics.f1_score(y_test, y_pred))
    print('Precision: %.4f' % metrics.precision_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))


def print_feature_important():
    xgb_model = xgb.Booster(model_file=model_file_path)
    importance = xgb_model.get_score(importance_type='gain')
    sorted_importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)
    print('feature importances[gain]:')
    print(sorted_importance)

    # The DMatrix was built from a bare numpy array, so the keys are 'f0', 'f1', ...;
    # map them back to the column names in feature_list by index.
    feature_importance_pair = [(feature_list[int(name[1:])], round(score, 2))
                               for name, score in sorted_importance]
    for pair in feature_importance_pair:
        print('Feature:\t{}\t{}'.format(*pair))


if __name__ == "__main__":
    X_train, X_test, y_train, y_test = get_dataset(data_path)
    train(X_train, y_train, X_test, y_test)
    evaluate(X_test, y_test)
    print_feature_important()
