1. Model pipeline breakdown
# -*- coding: utf-8 -*-
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics
from woe.eval import eval_segment_metrics

feature_list = ['fea1', 'fea2', 'fea3']
LABEL = 'is_later_30d_loss'
drop_cols = ['uid', LABEL, 'pt']
keep_cols = ['uid', 'score']
# Threshold chosen from the segment report
THRESHOLD = 0.5034
# Data to be scored, and where to write the predictions
data_input_path = '../data/raw_data.txt'
data_output_path = '../data/predict_data.csv'
model_file_path = 'model.txt'
# Sensible defaults; rarely need changing
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 6,        # 32
    'max_bin': 10,          # 16
    'learning_rate': 0.01,  # 0.005
    # 'feature_fraction': 0.9,
    # 'bagging_fraction': 0.8,
    # 'bagging_freq': 5,
    # 'max_depth': 5,
    # 'min_data_in_leaf': 6,
    'is_unbalance': False,
    'verbose': 0
}
if __name__ == "__main__":
    train_data_path = '../data/train.txt'
    test_data_path = '../data/test.txt'
    # Load the train and test sets
    X_tr, y_tr, X_te, y_te = get_dataset(train_data_path, test_data_path)
    # Train the model
    train(X_tr, y_tr, X_te, y_te)
    # Evaluate the model
    eval(X_te, y_te)
    # Print feature importances
    feature_names = X_tr.columns.tolist()
    print_important_feature(feature_names)
    # Score new data
    predict()
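The params above are sensible starting points rather than a tuned configuration. A minimal sketch, assuming the same train.txt layout as get_dataset, of using lgb.cv to sanity-check num_boost_round and the leaf settings before committing to a full run:

# Hedged cross-validation check; reuses params, drop_cols and LABEL from above.
tr = pd.read_csv('../data/train.txt', sep='\t')
cv_set = lgb.Dataset(tr.drop(columns=drop_cols), label=tr[LABEL])
# 5-fold stratified CV; stops adding rounds when validation AUC plateaus.
cv_results = lgb.cv(params, cv_set, num_boost_round=2000, nfold=5,
                    stratified=True, early_stopping_rounds=100)
# The result key is 'auc-mean' in LightGBM < 4.0 ('valid auc-mean' in 4.x).
print('best rounds = %d, cv auc = %.4f'
      % (len(cv_results['auc-mean']), cv_results['auc-mean'][-1]))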
1.1 Loading the data
1.1.1 Loading the train and test sets
def get_dataset(train_data_path, test_data_path):
    tr = pd.read_csv(train_data_path, sep='\t')
    te = pd.read_csv(test_data_path, sep='\t')
    # Row counts and number of positives in each set
    print('tr = %d, tr_pos_num = %d' % (len(tr), len(tr[tr[LABEL] == 1])))
    print('te = %d, te_pos_num = %d' % (len(te), len(te[te[LABEL] == 1])))
    y_tr = tr.loc[:, LABEL]
    y_te = te.loc[:, LABEL]
    tr.drop(labels=drop_cols, axis=1, inplace=True)
    te.drop(labels=drop_cols, axis=1, inplace=True)
    # Or keep only the whitelisted features:
    # tr = tr.loc[:, feature_list]
    # te = te.loc[:, feature_list]
    X_tr, X_te = tr, te
    return X_tr, y_tr, X_te, y_te
1.1.2 Adding sample weights
# Sample weights: recalled positives get the neg/pos ratio,
# recalled negatives get twice that ratio
tr['weight'] = 1
te['weight'] = 1
neg_pos_ratio = (len(tr) - sum(tr[LABEL])) / sum(tr[LABEL])
tr.loc[(tr[LABEL] == 1) & (tr['is_recall'] == 1), 'weight'] = neg_pos_ratio
tr.loc[(tr[LABEL] == 0) & (tr['is_recall'] == 1), 'weight'] = neg_pos_ratio * 2
tr.loc[(tr[LABEL] == 0) & (tr['is_recall'] == 0), 'weight'] = 1
weights = tr['weight']
trainset = lgb.Dataset(data=X_tr, label=y_tr, weight=weights)
testset = lgb.Dataset(data=X_te, label=y_te)
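The weights above encode a business rule around is_recall. For plain class imbalance, a hedged alternative is sklearn's compute_sample_weight, which applies the same neg/pos scaling to every positive:

from sklearn.utils.class_weight import compute_sample_weight

# 'balanced' weights each class by n_samples / (n_classes * class_count),
# so positives end up weighted by roughly the neg/pos ratio.
weights = compute_sample_weight(class_weight='balanced', y=y_tr)
trainset = lgb.Dataset(data=X_tr, label=y_tr, weight=weights)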
1.1.3 Splitting into train and test sets
def split_dataset(data_path):
    data = pd.read_csv(data_path, sep='\t')
    # fillna(inplace=True) returns None and cannot be chained,
    # so impute and round before converting to arrays
    X = data[feature_list].fillna(0).round(decimals=2).values
    y = data[LABEL].values
    # Three-way split variant (train/val/test):
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify=y)
    # X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=0, stratify=y_test)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
    return X_tr, y_tr, X_te, y_te
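stratify=y keeps the positive rate consistent across the splits; a quick sketch to verify, assuming the names above:

# All three rates should agree up to rounding.
print('full: %.4f, train: %.4f, test: %.4f' % (y.mean(), y_tr.mean(), y_te.mean()))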
1.2 Model training
def train(X_tr, y_tr, X_te, y_te):
    trainset = lgb.Dataset(data=X_tr, label=y_tr)
    testset = lgb.Dataset(data=X_te, label=y_te)
    # testset = lgb.Dataset(X_te, y_te, reference=trainset)
    lgb_model = lgb.train(params,
                          train_set=trainset,
                          valid_sets=[trainset, testset],
                          num_boost_round=2000,  # 200
                          early_stopping_rounds=100)
    # sklearn-wrapper equivalent (takes raw arrays, not lgb.Dataset):
    # clf = lgb.LGBMClassifier(n_estimators=24)
    # clf.fit(X_tr, y_tr, eval_set=[(X_tr, y_tr), (X_te, y_te)], eval_metric='auc')
    # clf.best_score_['valid_1']['auc']
    print(lgb_model.best_iteration)  # best iteration found by early stopping
    lgb_model.save_model(model_file_path)
Training with categorical features
def train(X_tr, y_tr, X_te, y_te):
    # category_feature_list holds the names of the categorical columns
    trainset = lgb.Dataset(data=X_tr, label=y_tr, categorical_feature=category_feature_list)
    testset = lgb.Dataset(X_te, y_te, reference=trainset, categorical_feature=category_feature_list)
    lgb_model = lgb.train(params,
                          train_set=trainset,
                          valid_sets=[trainset, testset],
                          categorical_feature=category_feature_list,
                          num_boost_round=2000,
                          early_stopping_rounds=100)
    lgb_model.save_model(model_file_path)
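category_feature_list is assumed to hold the names of the categorical columns. LightGBM wants those columns integer-coded or typed as pandas 'category'; a minimal preparation sketch (the column name is a placeholder):

# Hypothetical categorical column; replace with the real names.
category_feature_list = ['fea3']
for col in category_feature_list:
    # 'category' dtype lets LightGBM split on raw categories without one-hot encoding
    X_tr[col] = X_tr[col].astype('category')
    X_te[col] = X_te[col].astype('category')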
1.3 Model evaluation and segment extraction
def eval(X, y):
    gbm = lgb.Booster(model_file=model_file_path)
    # Generate the segment report
    eval_segment_metrics(target=y.values,
                         segment_cnt=50,
                         predict_proba=gbm.predict(X),
                         out_path='aa_segment_' + LABEL + 'v8.xlsx')
    pred = gbm.predict(X, num_iteration=gbm.best_iteration)
    # pred holds probabilities, so threshold before comparing to labels
    y_pred = (pred >= THRESHOLD).astype(int)
    print('predicting, classification error=%f' % (y_pred != y.values).mean())
    print('ACC: %.4f' % metrics.accuracy_score(y, y_pred))
    print('AUC: %.4f' % metrics.roc_auc_score(y, pred))
    print('Recall: %.4f' % metrics.recall_score(y, y_pred))
    print('F1-score: %.4f' % metrics.f1_score(y, y_pred))
    print('Precision: %.4f' % metrics.precision_score(y, y_pred))
    print(metrics.confusion_matrix(y, y_pred))
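eval_segment_metrics comes from an internal woe package, so its report format is not reproduced here. A hedged pandas sketch of the idea behind picking THRESHOLD from segments: bucket the scores into 50 quantiles and read off the positive rate per bucket (the 0.5 target below is a placeholder):

df = pd.DataFrame({'y': y.values, 'score': pred})
df['segment'] = pd.qcut(df['score'], q=50, duplicates='drop')
# Positive rate per score bucket, ordered from low to high scores.
seg = df.groupby('segment')['y'].agg(['count', 'mean'])
# The lowest bucket clearing the target precision suggests a threshold.
print(seg[seg['mean'] >= 0.5].head(1))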
1.4 Printing feature importance
def print_important_feature(feature_names):
    gbm = lgb.Booster(model_file=model_file_path)
    importances = pd.Series(gbm.feature_importance('gain'),
                            index=feature_names).sort_values(ascending=False)
    # Show all rows; otherwise long feature lists get truncated
    pd.set_option('display.max_rows', None)
    print(importances)
    # importance_list = list(gbm.feature_importance())
    # feature_importance_pair = [(fe, round(im, 2)) for fe, im in zip(feature_names, importance_list)]
    # feature_importance_pair = sorted(feature_importance_pair, key=lambda x: x[1], reverse=True)
    # print(feature_importance_pair)

# Or, with the sklearn wrapper (clf = lgb.LGBMClassifier(...)):
# name = clf.booster_.feature_name()
# importance = clf.feature_importances_
# feature_importance = pd.DataFrame({'name': name, 'importance': importance}
#                                   ).sort_values(by='importance', ascending=False)
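For a visual version of the same ranking, lgb.plot_importance renders the booster's importances as a bar chart (matplotlib is imported in the consolidated listing below):

import matplotlib.pyplot as plt

gbm = lgb.Booster(model_file=model_file_path)
# Gain-based ranking, top 20 features only.
lgb.plot_importance(gbm, importance_type='gain', max_num_features=20)
plt.tight_layout()
plt.savefig('feature_importance.png')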
1.5 Model scoring
def predict():
    gbm = lgb.Booster(model_file=model_file_path)
    test_data = pd.read_csv(data_input_path, sep='\t')
    # Score on the feature columns only, but keep the full frame so the
    # keep_cols ('uid', 'score') are still available afterwards
    test_data['score'] = gbm.predict(test_data[feature_list])
    result = test_data.loc[test_data['score'] >= THRESHOLD, keep_cols]
    print('-----rows above threshold: ' + str(len(result)))
    result.to_csv(data_output_path, index=False, header=False, sep='\t', float_format='%.4f')
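If raw_data.txt does not fit in memory, a hedged chunked variant of predict() (the chunk size is arbitrary):

def predict_chunked(chunksize=100000):
    gbm = lgb.Booster(model_file=model_file_path)
    first = True
    for chunk in pd.read_csv(data_input_path, sep='\t', chunksize=chunksize):
        chunk['score'] = gbm.predict(chunk[feature_list])
        result = chunk.loc[chunk['score'] >= THRESHOLD, keep_cols]
        # Write the first chunk, then append, so the output is one file.
        result.to_csv(data_output_path, index=False, header=False, sep='\t',
                      float_format='%.4f', mode='w' if first else 'a')
        first = False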
2. Consolidated code listing
# -*- coding: utf-8 -*-
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import datetime
import matplotlib.pyplot as plt
from woe.eval import eval_segment_metrics

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 6,
    'max_bin': 10,
    'learning_rate': 0.01,
    'is_unbalance': False,
    'verbose': 0
}
feature_list = ["chat_7d_cnt", "chat_cnt_self_expression", "chat_cnt_text"]
drop_cols = ["uid", "is_later_30d_loss", "pt"]
keep_cols = ['uid', 'score']
LABEL = 'is_later_30d_loss'
# Threshold chosen from the segment report
THRESHOLD = 0.5034
# Data to be scored, and where to write the predictions
data_input_path = '../data/raw_data.txt'
data_output_path = '../data/predict_data.csv'
model_file_path = 'model.txt'
def get_dataset(train_data_path, test_data_path):
    tr = pd.read_csv(train_data_path, sep='\t')
    te = pd.read_csv(test_data_path, sep='\t')
    # Row counts and number of positives in each set
    print('tr = %d, tr_pos_num = %d' % (len(tr), len(tr[tr[LABEL] == 1])))
    print('te = %d, te_pos_num = %d' % (len(te), len(te[te[LABEL] == 1])))
    y_tr = tr.loc[:, LABEL]
    y_te = te.loc[:, LABEL]
    tr.drop(labels=drop_cols, axis=1, inplace=True)
    te.drop(labels=drop_cols, axis=1, inplace=True)
    X_tr, X_te = tr, te
    return X_tr, y_tr, X_te, y_te
def train(X_tr, y_tr, X_te, y_te):
    trainset = lgb.Dataset(data=X_tr, label=y_tr)
    # trainset = lgb.Dataset(data=X_tr, label=y_tr, weight=weights)
    testset = lgb.Dataset(data=X_te, label=y_te)
    lgb_model = lgb.train(params,
                          train_set=trainset,
                          valid_sets=[trainset, testset],
                          num_boost_round=2000,
                          early_stopping_rounds=100)
    lgb_model.save_model(model_file_path)
def eval(X, y):
    gbm = lgb.Booster(model_file=model_file_path)
    # Generate the segment report
    eval_segment_metrics(target=y.values,
                         segment_cnt=50,
                         predict_proba=gbm.predict(X),
                         out_path='aa_segment_' + LABEL + 'kk.xlsx')
    pred = gbm.predict(X, num_iteration=gbm.best_iteration)
    # pred holds probabilities, so threshold before comparing to labels
    y_pred = (pred >= THRESHOLD).astype(int)
    print('predicting, classification error=%f' % (y_pred != y.values).mean())
    print('ACC: %.4f' % metrics.accuracy_score(y, y_pred))
    print('AUC: %.4f' % metrics.roc_auc_score(y, pred))
    print('Recall: %.4f' % metrics.recall_score(y, y_pred))
    print('F1-score: %.4f' % metrics.f1_score(y, y_pred))
    print('Precision: %.4f' % metrics.precision_score(y, y_pred))
    print(metrics.confusion_matrix(y, y_pred))
def print_important_feature(feature_names):
    gbm = lgb.Booster(model_file=model_file_path)
    # gbm.feature_importance('split') gives split counts instead of gain
    importances = pd.Series(gbm.feature_importance('gain'),
                            index=feature_names).sort_values(ascending=False)
    print(importances)
def predict():
    gbm = lgb.Booster(model_file=model_file_path)
    test_data = pd.read_csv(data_input_path, sep='\t')
    # Keep the full frame so keep_cols ('uid', 'score') remain available
    test_data['score'] = gbm.predict(test_data[feature_list])
    result = test_data.loc[test_data['score'] >= THRESHOLD, keep_cols]
    print('-----rows above threshold: ' + str(len(result)))
    result.to_csv(data_output_path, index=False, header=False, sep='\t', float_format='%.4f')
if __name__ == "__main__":
    train_data_path = '../data/train.txt'
    test_data_path = '../data/test.txt'
    # Load the train and test sets
    X_tr, y_tr, X_te, y_te = get_dataset(train_data_path, test_data_path)
    # Train the model
    train(X_tr, y_tr, X_te, y_te)
    # Evaluate the model
    eval(X_te, y_te)
    # Print feature importances
    feature_names = X_tr.columns.tolist()
    print_important_feature(feature_names)
    # Score new data
    predict()