# ========== 模型训练函数 ==========
def train_model(
    model_name: str,
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: np.ndarray,
    y_test: np.ndarray,
    xgb_param_overrides: dict = None
) -> np.ndarray:
    """
    Unified model-training entry point.

    Supported models: 'XGBoost', 'MLP', 'TabNet'.

    :param model_name: one of the supported model names above
    :param X_train: training feature frame (columns are feature names)
    :param X_test: test feature frame (same columns as X_train)
    :param y_train: integer-encoded training labels
    :param y_test: integer-encoded test labels
    :param xgb_param_overrides: optional dict merged over the default
        XGBoost params — lets a caller sweep hyperparameters (e.g. a
        grid/random search over eta, max_depth, subsample) without
        editing this function. Ignored for non-XGBoost models.
    :return: predicted integer class labels for the test set
    :raises ValueError: if model_name is not supported
    """
    # 1. Shared preprocessing (scaling/encoding done by the project helper)
    X_tr, X_te, y_tr, y_te = preprocess_data(X_train, X_test, y_train, y_test)
    # 2. Dispatch on model type
    if model_name == "XGBoost":
        # Convert to XGBoost's native matrix format
        dtrain = xgb.DMatrix(X_tr, label=y_tr, feature_names=X_train.columns.tolist())
        dtest = xgb.DMatrix(X_te, label=y_te, feature_names=X_test.columns.tolist())
        # Native-API training parameters.
        # NOTE: the previous version put 'n_estimators',
        # 'early_stopping_rounds' and 'random_state' in this dict —
        # the native xgb.train API silently ignores all three
        # ('n_estimators'/'random_state' are sklearn-wrapper names, and
        # early stopping only works as a kwarg together with an eval set).
        params = {
            "objective": "multi:softmax",
            "num_class": len(np.unique(y_tr)),
            "max_depth": 6,
            "eta": 0.2,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "gamma": 0.1,
            "min_child_weight": 3,
            "lambda": 1,
            "alpha": 0,
            "eval_metric": ["merror", "mlogloss"],
            "tree_method": "hist",
            "seed": 42  # native-API equivalent of random_state
        }
        if xgb_param_overrides:
            params.update(xgb_param_overrides)
        # evals + early_stopping_rounds actually enable early stopping.
        # NOTE(review): stopping on the test fold leaks test information
        # into training — consider a validation split carved from the
        # training apps instead; confirm whether this is acceptable here.
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=100,
            evals=[(dtest, "eval")],
            early_stopping_rounds=10,
            verbose_eval=False
        )
        return model.predict(dtest).astype(int)
    elif model_name == "MLP":
        # Build the Keras model via the project factory
        model = ModelFactory.create_model(model_name, X_tr.shape[1], len(np.unique(y_tr)))
        # Stop when validation loss plateaus, keeping the best weights
        early_stopping = EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True
        )
        model.fit(
            X_tr, y_tr,
            epochs=MLP_EPOCHS,
            batch_size=MLP_BATCH,
            validation_data=(X_te, y_te),
            callbacks=[early_stopping],
            verbose=0
        )
        # Class probabilities -> hard labels
        y_pred_prob = model.predict(X_te, verbose=0)
        return np.argmax(y_pred_prob, axis=1)
    elif model_name == "TabNet":
        # TabNet requires float32 features and int64 labels
        X_tr = X_tr.astype(np.float32)
        X_te = X_te.astype(np.float32)
        y_tr = y_tr.astype(np.int64)
        y_te = y_te.astype(np.int64)
        model = ModelFactory.create_model(model_name, X_tr.shape[1], len(np.unique(y_tr)))
        model.fit(
            X_train=X_tr,
            y_train=y_tr,
            eval_set=[(X_te, y_te)],
            max_epochs=TABNET_EPOCHS,
            batch_size=TABNET_BATCH,
            patience=TABNET_PATIENCE,
            eval_metric=['accuracy'],
        )
        return model.predict(X_te)
    else:
        raise ValueError(f"不支持的模型类型: {model_name}")
# ========== 交叉验证框架 ==========
def run_core_app_cv(
    models_to_run: list,
    X: pd.DataFrame,
    y_encoded: np.ndarray,
    df: pd.DataFrame,
    selected_features: list,
    label_encoder: LabelEncoder
) -> dict:
    """
    Leave-one-core-app-out cross-validation.

    For every requested model, each unique 'app_core' in turn becomes
    the held-out test app while all remaining apps form the training set.

    :param models_to_run: model names understood by train_model
    :param X: full feature frame (superset of selected_features)
    :param y_encoded: integer-encoded labels aligned with df
    :param df: metadata frame carrying the 'app_core' grouping column
    :param selected_features: feature columns to train on
    :param label_encoder: fitted encoder used to decode predictions
    :return: {model_name: {"true": [...], "pred": [...], "cores": [...], "indices": [...]}}
    """
    core_apps = df['app_core'].unique()
    n_apps = len(core_apps)
    all_results = {}
    for model_name in models_to_run:
        print(f"\n{'='*60}\n开始 {model_name} 模型交叉验证\n{'='*60}")
        fold_results = {"true": [], "pred": [], "cores": [], "indices": []}
        for fold_no, held_out in enumerate(core_apps, 1):
            print(f"\n{'='*20} {model_name} - 第 {fold_no}/{n_apps} 个App: {held_out} {'='*20}")
            # One app held out as the test fold; everything else trains
            is_train = df['app_core'] != held_out
            is_test = df['app_core'] == held_out
            X_train = X[selected_features][is_train].copy()
            X_test = X[selected_features][is_test].copy()
            y_train = y_encoded[is_train]
            y_test = y_encoded[is_test]
            print(f"训练集: {X_train.shape[0]}样本, 测试集: {X_test.shape[0]}样本")
            y_pred = train_model(model_name, X_train, X_test, y_train, y_test)
            # Accumulate decoded labels plus provenance for later analysis
            fold_results["true"].extend(label_encoder.inverse_transform(y_test))
            fold_results["pred"].extend(label_encoder.inverse_transform(y_pred))
            fold_results["cores"].extend([held_out] * len(y_test))
            fold_results["indices"].extend(df[is_test].index.tolist())
        all_results[model_name] = fold_results
    return all_results
# ========== 结果保存与分析 ==========
def save_results(model_name: str, results: dict, save_dir: str):
    """
    Persist one model's CV results and generate visualizations.

    Writes four artifacts into save_dir:
      * {model}_sample_results.csv        — per-sample predictions
      * {model}_app_accuracy.csv          — per-app accuracy summary
      * {model}_confusion_matrix.png      — confusion-matrix plot
      * {model}_classification_report.csv — sklearn classification report

    NOTE(review): reads module-level globals `df`, `category_mapping`
    and `encoded_classes` — confirm they are defined before calling.

    :param model_name: model identifier used in output file names
    :param results: dict with 'true', 'pred', 'cores', 'indices' lists
    :param save_dir: existing directory that receives the output files
    :return: dict with overall accuracy, app-level accuracy frame and
             the confusion-matrix image path
    """
    # Build the per-sample result frame from the original metadata rows
    df_res = df.loc[results["indices"], ['app_name', 'app_core', 'second_id']].reset_index(drop=True)
    df_res['true_label'] = [category_mapping[str(c).zfill(3)] for c in results["true"]]
    df_res['pred_label'] = [category_mapping[str(c).zfill(3)] for c in results["pred"]]
    df_res['correct'] = (np.array(results["true"]) == np.array(results["pred"])).astype(int)
    df_res.rename(columns={'second_id': 'true_second_id'}, inplace=True)
    # Persist sample-level results (BOM'd UTF-8 so Excel shows Chinese correctly)
    sample_path = os.path.join(save_dir, f'{model_name}_sample_results.csv')
    df_res.to_csv(sample_path, index=False, encoding='utf-8-sig')
    print(f"已保存样本结果到: {sample_path}")
    # App-level accuracy: count/correct/mean over each held-out app
    app_acc = df_res.groupby('app_core').agg(
        total=('correct', 'count'),
        correct=('correct', 'sum'),
        accuracy=('correct', 'mean')
    ).reset_index()
    app_acc['accuracy'] = app_acc['accuracy'].round(4)
    app_path = os.path.join(save_dir, f'{model_name}_app_accuracy.csv')
    # FIX: was 'utf_8_sig' — normalized to the same alias used above
    app_acc.to_csv(app_path, index=False, encoding='utf-8-sig')
    print(f"已保存App准确率到: {app_path}")
    # Confusion matrix over the full (fixed) class set
    class_names = [category_mapping[c] for c in encoded_classes]
    cm = confusion_matrix(results["true"], results["pred"], labels=encoded_classes)
    fig, ax = plt.subplots(figsize=(12, 10))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(cmap='Blues', values_format='d', ax=ax, colorbar=False)
    plt.title(f'{model_name}混淆矩阵', fontsize=14)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    cm_path = os.path.join(save_dir, f'{model_name}_confusion_matrix.png')
    plt.savefig(cm_path, bbox_inches='tight', dpi=300)
    plt.close()  # release the figure so repeated calls don't leak memory
    print(f"已保存混淆矩阵图到: {cm_path}")
    # Per-class precision/recall/F1 report
    report = classification_report(
        results["true"],
        results["pred"],
        target_names=class_names,
        output_dict=True
    )
    report_df = pd.DataFrame(report).transpose()
    report_path = os.path.join(save_dir, f'{model_name}_classification_report.csv')
    # FIX: was 'utf_8_sig' — normalized to the same alias used above
    report_df.to_csv(report_path, encoding='utf-8-sig')
    print(f"已保存分类报告到: {report_path}")
    # Headline metric returned to the caller
    overall_acc = accuracy_score(results["true"], results["pred"])
    return {
        'model': model_name,
        'overall_accuracy': overall_acc,
        'app_accuracy': app_acc,
        'confusion_matrix_path': cm_path
    }
# ========== 主执行流程 ==========
if __name__ == "__main__":
    print("\n" + "="*60)
    print("开始执行核心App交叉验证流程")
    print("="*60 + "\n")
    # Step 1: feature selection, persisted one feature name per line
    selected_features = perform_feature_selection(X, y_encoded, groups=df['app_core'])
    feature_file = os.path.join(RESULTS_DIR, 'selected_features.txt')
    with open(feature_file, 'w') as fh:
        fh.writelines(f"{feat}\n" for feat in selected_features)
    print("特征已保存至 selected_features.txt")
    # Step 2: leave-one-app-out cross-validation for every selected model
    all_results = run_core_app_cv(
        models_to_run=SELECTED_MODELS,
        X=X,
        y_encoded=y_encoded,
        df=df,
        selected_features=selected_features,
        label_encoder=label_encoder
    )
    # Step 3: persist each model's results and report headline metrics
    final_metrics = {}
    for name, res in all_results.items():
        print(f"\n正在处理{name}的结果分析...")
        summary = save_results(name, res, RESULTS_DIR)
        final_metrics[name] = summary['overall_accuracy']
        print(f"[{name}] 总体准确率: {summary['overall_accuracy']:.4f}")
        print(f"应用级准确率分布:\n{summary['app_accuracy'][['app_core', 'accuracy']]}")
# TODO: For the XGBoost model, find the optimal hyperparameters through
# systematic testing (e.g. a grid/random search over eta, max_depth,
# subsample, colsample_bytree) instead of the current fixed values.