I'm referring to the following code:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import ParameterGrid, train_test_split
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import json
import joblib
import seaborn as sns
from scipy.stats import ks_2samp
# ======================
# 1. Special-value handling (optimized)
# ======================
def create_special_flags(X, special_values):
"""为含特殊值的变量创建标识特征,并将特殊值替换为np.nan"""
X_processed = X.copy()
# 添加特殊值标识特征
for col, values in special_values.items():
if col in X.columns:
# 创建特殊值标识列
X_processed[f"{col}_IS_SPECIAL"] = X[col].isin(values).astype(int)
# 替换特殊值为nan(XGBoost自动处理缺失值)
X_processed[col] = X[col].replace(values, np.nan)
return X_processed
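# Hypothetical usage example (the column name and codes below are placeholders):
#   X_flagged = create_special_flags(X, {'income': [-999, -888]})
#   -> adds an 'income_IS_SPECIAL' 0/1 column and NaNs out -999/-888 in 'income'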
# ======================
# 2. Evaluation metrics (y = 1 marks a bad account)
# ======================
def calculate_ks(y_true, y_prob):
"""计算KS值"""
df = pd.DataFrame({'y_true': y_true, 'y_prob': y_prob})
df = df.sort_values('y_prob')
df['cum_bad'] = df['y_true'].cumsum() / df['y_true'].sum() # y=1为不良
df['cum_good'] = (1 - df['y_true']).cumsum() / (1 - df['y_true']).sum()
return np.max(df['cum_bad'] - df['cum_good'])
def calculate_ar(y_true, y_prob):
"""计算AR值(Accuracy Ratio)"""
auc = roc_auc_score(y_true, y_prob)
return 2 * auc - 1
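# AR equals the Gini coefficient; e.g., AUC = 0.75 gives AR = 2 * 0.75 - 1 = 0.5.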
def calculate_psi(expected, actual, buckets=10):
"""计算PSI(Population Stability Index)"""
# 分箱
breakpoints = np.linspace(0, 1, buckets + 1)[1:-1]
expected_bins = np.histogram(expected, bins=breakpoints)[0] / len(expected)
actual_bins = np.histogram(actual, bins=breakpoints)[0] / len(actual)
# 避免0值
expected_bins = np.where(expected_bins == 0, 0.0001, expected_bins)
actual_bins = np.where(actual_bins == 0, 0.0001, actual_bins)
# 计算PSI
psi = np.sum((actual_bins - expected_bins) * np.log(actual_bins / expected_bins))
return psi
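# Common rule of thumb for reading PSI: < 0.1 stable, 0.1-0.25 moderate drift, > 0.25 unstable.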
# ======================
# 3. Imbalanced-data handling (optimized)
# ======================
def apply_sampling(X, y, method):
"""处理样本不平衡(y=1为不良样本,需要增加)"""
# 计算目标负样本比例(10%)
target_minority_ratio = 0.10
n_minority_target = int(len(y) * target_minority_ratio)
if method == 'smote':
sampler = SMOTE(sampling_strategy={1: n_minority_target}, random_state=42)
elif method == 'adasyn':
sampler = ADASYN(sampling_strategy={1: n_minority_target}, random_state=42)
elif method == 'undersample':
# 保留所有负样本,下采样正样本
n_minority = y.sum()
n_majority_target = int(n_minority * (1 - target_minority_ratio) / target_minority_ratio)
sampler = RandomUnderSampler(sampling_strategy={0: n_majority_target}, random_state=42)
elif method == 'smoteenn':
sampler = SMOTEENN(sampling_strategy={1: n_minority_target}, random_state=42)
elif method == 'weighted':
# 样本权重方案
weight_ratio = len(y[y==0]) / len(y[y==1])
sample_weights = np.where(y == 1, weight_ratio, 1)
return X, y, sample_weights
else: # raw
return X, y, None
X_res, y_res = sampler.fit_resample(X, y)
return X_res, y_res, None
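# Hypothetical usage: X_bal, y_bal, w = apply_sampling(X, y, 'smote')
# Caution: SMOTE/ADASYN/SMOTEENN reject NaN inputs, so either resample before
# create_special_flags replaces special codes with np.nan, or impute first.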
# ======================
# 4. XGBoost parameter grid (optimized)
# ======================
def get_param_grid(y_train):
    good_count = max((np.asarray(y_train) == 0).sum(), 1)  # y = 0 is good
    bad_count = max((np.asarray(y_train) == 1).sum(), 1)   # y = 1 is bad; guards avoid division by zero
return {
'objective': ['binary:logistic'],
'eval_metric': ['auc'],
'eta': [0.01, 0.05, 0.1],
'max_depth': [3, 5, 7],
'min_child_weight': [1, 3, 5],
'subsample': [0.6, 0.8, 1.0],
'colsample_bytree': [0.3, 0.5, 0.7],
'gamma': [0, 0.1, 0.2],
        'scale_pos_weight': [1, good_count/bad_count, good_count/bad_count*2],  # n_good / n_bad upweights the bad class
'lambda': [0.1, 1, 10],
'alpha': [0, 0.1, 1],
'n_estimators': [200, 500, 1000],
        'tree_method': ['hist']  # add 'gpu_hist' here only if a GPU-enabled build is available
}
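# NOTE: the ten 3-value lists above multiply out to 3**10 = 59,049 combinations;
# crossed with six sampling methods that is ~354k model fits, which is why a full
# run takes so long. Trim the lists (or use the preview mode at the end) for a quick pass.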
# ======================
# 5. Model training and evaluation (optimized)
# ======================
def train_and_evaluate(X_train, y_train, X_val, y_val, params, sample_weights=None):
    dtrain = xgb.DMatrix(X_train, label=y_train, weight=sample_weights)
    dval = xgb.DMatrix(X_val, label=y_val)
    params = dict(params)
    # xgb.train ignores the sklearn-style n_estimators key; map it to num_boost_round
    num_boost_round = int(params.pop('n_estimators', 1000))
    evals_result = {}
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
evals=[(dtrain, 'train'), (dval, 'val')],
early_stopping_rounds=50,
evals_result=evals_result,
verbose_eval=False
)
return model, evals_result
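# With early_stopping_rounds set, the returned Booster carries best_iteration /
# best_score, and evals_result holds the per-round train/val AUC for learning curves.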
# ======================
# 6. Scorecard table generation (optimized)
# ======================
def create_score_table(y_true, y_prob, n_bins=20):
"""生成评分卡分段表"""
df = pd.DataFrame({'y_true': y_true, 'y_prob': y_prob})
df = df.sort_values('y_prob', ascending=False) # 高风险到低风险
# 等频分箱
df['bin'] = pd.qcut(df['y_prob'], q=n_bins, duplicates='drop')
total_samples = len(df)
total_bad = df['y_true'].sum()
total_good = total_samples - total_bad
bin_stats = df.groupby('bin').agg(
total=('y_true', 'count'),
bad=('y_true', 'sum'), # y=1为不良
good=('y_true', lambda x: len(x)-sum(x)) # y=0为优良
).reset_index()
# 计算各种比率
bin_stats['bad_rate'] = bin_stats['bad'] / bin_stats['total']
bin_stats['cum_total'] = bin_stats['total'].cumsum()
bin_stats['cum_bad'] = bin_stats['bad'].cumsum()
bin_stats['cum_good'] = bin_stats['good'].cumsum()
bin_stats['cum_total_pct'] = bin_stats['cum_total'] / total_samples
bin_stats['cum_bad_pct'] = bin_stats['cum_bad'] / total_bad
bin_stats['cum_good_pct'] = bin_stats['cum_good'] / total_good
bin_stats['reject_rate'] = bin_stats['cum_bad'] / bin_stats['cum_total'] # 累计拒绝率
bin_stats['lift'] = bin_stats['bad_rate'] / (total_bad/total_samples) # 提升度
return bin_stats
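# A binned approximation of KS can be read straight off this table:
#   ks = (bin_stats['cum_bad_pct'] - bin_stats['cum_good_pct']).abs().max()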
# ======================
# 7. Scorecard visualization (new)
# ======================
def plot_score_card(bin_stats, title, save_path=None):
"""可视化评分卡结果"""
fig, ax1 = plt.subplots(figsize=(12, 8))
# 累计不良率曲线
ax1.plot(bin_stats.index, bin_stats['cum_bad_pct'], 'b-', label='累计不良捕获率')
ax1.plot(bin_stats.index, bin_stats['cum_total_pct'], 'g-', label='累计样本占比')
ax1.set_xlabel('分箱 (高风险->低风险)')
ax1.set_ylabel('比例')
ax1.set_title(f'{title} - 评分卡分析')
# 不良率曲线
ax2 = ax1.twinx()
ax2.plot(bin_stats.index, bin_stats['bad_rate'], 'r-', label='分箱不良率')
ax2.set_ylabel('不良率')
# 添加参考线
ax1.axhline(y=0.8, color='gray', linestyle='--', alpha=0.5)
ax1.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5)
# 图例
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines + lines2, labels + labels2, loc='upper left')
plt.tight_layout()
if save_path:
plt.savefig(save_path)
plt.close()
# ======================
# 8. Main training flow (optimized)
# ======================
def main_training_flow(X_train, y_train, X_test, y_test, X_oot, y_oot, special_values):
    # Special-value handling
    X_train_proc = create_special_flags(X_train, special_values)
    X_test_proc = create_special_flags(X_test, special_values)
    X_oot_proc = create_special_flags(X_oot, special_values)
    # Sampling methods to compare
    sampling_methods = {
        'raw': None,
        'smote': 'smote',
        'adasyn': 'adasyn',
        'undersample': 'undersample',
        'smoteenn': 'smoteenn',
        'weighted': 'weighted'
    }
    all_results = []
    param_grid = get_param_grid(y_train)
    for method_name, method in sampling_methods.items():
        print(f"\n=== Sampling method: {method_name} ===")
        # Rebalance the training data
        if method:
            X_res, y_res, sample_weights = apply_sampling(X_train_proc, y_train, method)
        else:
            X_res, y_res, sample_weights = X_train_proc.copy(), y_train.copy(), None
        # Carve a validation set out of the (resampled) training data; the weight
        # vector must be split alongside X and y, or the DMatrix lengths mismatch
        if sample_weights is not None:
            X_tr, X_val, y_tr, y_val, w_tr, _ = train_test_split(
                X_res, y_res, sample_weights, test_size=0.2, random_state=42, stratify=y_res)
        else:
            X_tr, X_val, y_tr, y_val = train_test_split(
                X_res, y_res, test_size=0.2, random_state=42, stratify=y_res)
            w_tr = None
        # Grid search over all parameter combinations
        for params in ParameterGrid(param_grid):
            print(f"Training with params: {params}")
            # Train with early stopping on the validation split
model, evals_result = train_and_evaluate(
X_tr, y_tr,
X_val, y_val,
params,
                sample_weights=w_tr
)
            # Metrics on an arbitrary dataset, using a 0.5 probability cut-off
            def evaluate(X, y):
                y_prob = model.predict(xgb.DMatrix(X))
                y_pred = (y_prob > 0.5).astype(int)
                return {
                    'auc': roc_auc_score(y, y_prob),
                    'ks': calculate_ks(y, y_prob),
                    'ar': calculate_ar(y, y_prob),
                    'accuracy': accuracy_score(y, y_pred),
                    'precision': precision_score(y, y_pred, pos_label=1, zero_division=0),
                    'recall': recall_score(y, y_pred, pos_label=1, zero_division=0),
                    'f1': f1_score(y, y_pred, pos_label=1, zero_division=0)
                }
            # Evaluate on every split
train_metrics = evaluate(X_tr, y_tr)
val_metrics = evaluate(X_val, y_val)
test_metrics = evaluate(X_test_proc, y_test)
oot_metrics = evaluate(X_oot_proc, y_oot)
            # Stability: PSI between train and OOT score distributions
y_prob_train = model.predict(xgb.DMatrix(X_tr))
y_prob_oot = model.predict(xgb.DMatrix(X_oot_proc))
psi = calculate_psi(y_prob_train, y_prob_oot)
result = {
'sampling_method': method_name,
**params,
**{'train_'+k: v for k,v in train_metrics.items()},
**{'val_'+k: v for k,v in val_metrics.items()},
**{'test_'+k: v for k,v in test_metrics.items()},
**{'oot_'+k: v for k,v in oot_metrics.items()},
'psi': psi,
'best_iteration': model.best_iteration
}
all_results.append(result)
            # Persist incrementally so progress survives an interrupted run
            result_df = pd.DataFrame(all_results)
            result_df.to_csv('xgb_results_all.csv', index=False)
            # Progress report every 10 combinations
            if len(all_results) % 10 == 0:
                print(f"Completed {len(all_results)} parameter combinations")
return pd.DataFrame(all_results)
# ======================
# 9. Result analysis and visualization (optimized)
# ======================
def analyze_results(results_df, X_train, y_train, X_test, y_test, X_oot, y_oot, special_values):
    # Select best models by test KS, test AR, and a KS/stability blend
    results_df['stability'] = 1 - abs(results_df['test_ks'] - results_df['oot_ks'])
    results_df['combined_score'] = results_df['test_ks'] * 0.7 + results_df['stability'] * 0.3
    best_by_ks = results_df.loc[results_df['test_ks'].idxmax()]
    best_by_ar = results_df.loc[results_df['test_ar'].idxmax()]
    best_by_stability = results_df.loc[results_df['combined_score'].idxmax()]
    # Retrain the final model on the full DEV sample (train + test)
    def train_final_model(params, best_iteration, X_train, y_train, X_test, y_test, sampling_method):
        X_full = pd.concat([X_train, X_test])
        y_full = pd.Series(np.concatenate([y_train, y_test]))
        sample_weights = None
        if sampling_method == 'weighted':
            _, _, sample_weights = apply_sampling(X_full, y_full, 'weighted')
        elif sampling_method != 'raw':
            X_full, y_full, _ = apply_sampling(X_full, y_full, sampling_method)
        dtrain = xgb.DMatrix(X_full, label=y_full, weight=sample_weights)
        return xgb.train(
            params,
            dtrain,
            num_boost_round=int(best_iteration * 1.1)  # small buffer over the tuned round count
        )
    # Produce artifacts for each selected model
    output = {}
    param_keys = set(get_param_grid(pd.Series([0, 1])).keys())  # dummy labels; only the keys matter
    for name, best in [('ks', best_by_ks), ('ar', best_by_ar), ('stability', best_by_stability)]:
        params = {k: v for k, v in best.items() if k in param_keys}
        params.pop('n_estimators', None)  # xgb.train takes num_boost_round instead
        sampling_method = best['sampling_method']
        # Special-value handling
        X_train_proc = create_special_flags(X_train, special_values)
        X_test_proc = create_special_flags(X_test, special_values)
        X_oot_proc = create_special_flags(X_oot, special_values)
        # Train the final model
        model = train_final_model(
            params, int(best['best_iteration']),
            X_train_proc, y_train,
            X_test_proc, y_test,
            sampling_method
        )
        # Save the model in XGBoost's native JSON format
model.save_model(f'best_model_by_{name}.json')
        # Feature importance by total gain
importance = model.get_score(importance_type='gain')
importance_df = pd.DataFrame({
'feature': list(importance.keys()),
'importance': list(importance.values())
}).sort_values('importance', ascending=False)
importance_df.to_csv(f'feature_importance_{name}.csv', index=False)
        # Build score-band tables for each dataset
score_tables = {}
for data_name, X, y in [
('train', X_train, y_train),
('test', X_test, y_test),
('oot', X_oot, y_oot)
]:
X_processed = create_special_flags(X, special_values)
y_prob = model.predict(xgb.DMatrix(X_processed))
score_table = create_score_table(y, y_prob)
score_tables[data_name] = score_table
            # Save the table
            score_table.to_csv(f'score_table_{name}_{data_name}.csv', index=False)
            # Plot the scorecard
            plot_score_card(
                score_table,
                f'{name.upper()} model - {data_name} set',
                f'score_card_{name}_{data_name}.png'
            )
            # Plot predicted-probability distributions by class
            plt.figure(figsize=(12, 8))
            y_arr = np.asarray(y)
            sns.kdeplot(y_prob[y_arr == 0], label='Good (y=0)', fill=True, alpha=0.3)
            sns.kdeplot(y_prob[y_arr == 1], label='Bad (y=1)', fill=True, alpha=0.3)
            plt.title(f'Predicted probability distribution on {data_name} ({name.upper()} model)')
            plt.xlabel('Predicted probability')
            plt.ylabel('Density')
            plt.legend()
            plt.savefig(f'prob_dist_{name}_{data_name}.png')
            plt.close()
        output[name] = {
            'params': params,
            'sampling_method': sampling_method,
            # Headline metrics pulled from the grid-search results row
            'metrics': {m: float(best[m]) for m in ['test_ks', 'test_ar', 'oot_ks', 'psi']},
            'score_tables': {k: v.to_dict(orient='records') for k, v in score_tables.items()},
            'feature_importance': importance_df.to_dict(orient='list')
        }
    # Save the consolidated results (default=str covers Interval/numpy types)
    with open('final_results.json', 'w') as f:
        json.dump(output, f, indent=2, default=str)
    # Emit the model comparison report
generate_model_comparison(output)
return output
# ======================
# 10. Model comparison report (new)
# ======================
def generate_model_comparison(output):
    """Build a side-by-side comparison of the selected models."""
    comparison = []
    for name, results in output.items():
        # Headline metrics stored by analyze_results (the score tables themselves
        # carry no 'ks'/'ar'/'psi' columns, so they cannot be read from there)
        metrics = results['metrics']
        comparison.append({
            'model_name': name,
            'sampling_method': results['sampling_method'],
            'test_ks': metrics['test_ks'],
            'test_ar': metrics['test_ar'],
            'oot_ks': metrics['oot_ks'],
            'psi': metrics['psi'],
            'stability': 1 - abs(metrics['test_ks'] - metrics['oot_ks']),
            'top_features': ', '.join(results['feature_importance']['feature'][:5])
        })
    # Save the comparison report
    comparison_df = pd.DataFrame(comparison)
    comparison_df.to_csv('model_comparison.csv', index=False)
    # Visual comparison (plot.bar creates its own figure, so size it there)
    ax = comparison_df.set_index('model_name')[['test_ks', 'oot_ks', 'stability']].plot.bar(figsize=(12, 8))
    ax.set_title('Model performance comparison')
    ax.set_ylabel('Score')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('model_comparison.png')
    plt.close()
# ======================
# Run the main flow
# ======================
if __name__ == "__main__":
    # Assumes the data has already been loaded, e.g.:
    # X_train, y_train, X_test, y_test, X_oot, y_oot = load_data()
    # special_values = {...}
    results = main_training_flow(
        X_train, y_train,
        X_test, y_test,
        X_oot, y_oot,
        special_values
    )
    final_results = analyze_results(
        results,
        X_train, y_train,
        X_test, y_test,
        X_oot, y_oot,
        special_values
    )
But I found that a complete run takes a very long time and bugs remain, so I want to add a preview version: a simplified but end-to-end run of the code to confirm that it is correct. The error I hit most often is `X_oot is not defined`.
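A minimal preview/smoke-test sketch along those lines, assuming the functions above sit in the same module. The synthetic data, the `f0`..`f9` column names, and the `-999` special code are placeholders, not part of the real pipeline:

# --- Preview / smoke-test mode: run the whole pipeline once on tiny data ---
from sklearn.datasets import make_classification

def make_preview_data(n=2000, seed=42):
    """Synthetic imbalanced data (~5% bad) shaped like the real inputs."""
    X, y = make_classification(n_samples=n, n_features=10,
                               weights=[0.95, 0.05], random_state=seed)
    return pd.DataFrame(X, columns=[f'f{i}' for i in range(10)]), pd.Series(y)

def run_preview():
    global get_param_grid
    X_train, y_train = make_preview_data(seed=1)
    X_test, y_test = make_preview_data(seed=2)
    X_oot, y_oot = make_preview_data(seed=3)
    special_values = {'f0': [-999]}  # placeholder special code
    # Collapse every hyperparameter list to its first value: one combination
    # per sampling method instead of ~59k, so the run finishes in minutes.
    full_grid = get_param_grid
    get_param_grid = lambda y: {k: v[:1] for k, v in full_grid(y).items()}
    try:
        results = main_training_flow(X_train, y_train, X_test, y_test,
                                     X_oot, y_oot, special_values)
        analyze_results(results, X_train, y_train, X_test, y_test,
                        X_oot, y_oot, special_values)
    finally:
        get_param_grid = full_grid  # restore the full grid

# run_preview()  # call this instead of the __main__ block for a quick end-to-end check

If every CSV, JSON, and plot file appears without an exception, the plumbing is sound, and the remaining slowness is just grid size.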