# Task 2: Multi-classifier comparison on 41-dimensional features (10-fold cross-validation)
# Evaluation metrics: Accuracy, Precision, Recall, F1-Score (macro-averaged)
# Uses source-domain data with 10-fold CV to avoid bias from a single train/test split
# New in this version: saves the trained Random Forest model for use in Task 3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
import os
import joblib  # newly added: used to persist the trained model and metadata
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
sns.set_style("whitegrid")
# ====================== Main program ======================
print("Loading and processing feature data...")
df = pd.read_csv('extracted_features_with_domain.csv')
source_data = df[df['domain'] == 'source'].copy()
print(f"Number of source-domain samples: {len(source_data)}")
X = source_data.drop(columns=['filename', 'label', 'domain']).values
y = source_data['label'].values
labels = sorted(np.unique(y))
class_names = ['Normal', 'Outer Race', 'Inner Race', 'Ball']
# Standardization (only used by models that require scaled inputs)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Define the classifiers
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM_RBF": SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42),
    "MLP": MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=500, alpha=1e-4,
                         batch_size=32, early_stopping=True, random_state=42)
}
# Stratified 10-fold cross-validation
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results_summary = []
print("\n" + "=" * 60)
print("Evaluating all classifiers with 10-Fold Cross Validation...")
print("=" * 60)
for name, model in models.items():
    print(f"\nEvaluating {name}...")
    # Use the standardized features only for models that need scaling
    X_use = X_scaled if name in ["SVM_RBF", "MLP"] else X
    all_y_true, all_y_pred = [], []
    for train_idx, test_idx in cv.split(X_use, y):
        X_train_fold, X_test_fold = X_use[train_idx], X_use[test_idx]
        y_train_fold, y_test_fold = y[train_idx], y[test_idx]
        model.fit(X_train_fold, y_train_fold)
        y_pred_fold = model.predict(X_test_fold)
        all_y_true.extend(y_test_fold)
        all_y_pred.extend(y_pred_fold)
    # Overall accuracy across all folds
    acc = accuracy_score(all_y_true, all_y_pred)
    # Classification report (macro-averaged)
    report = classification_report(all_y_true, all_y_pred, target_names=class_names,
                                   labels=labels, output_dict=True)
    precision_macro = report['macro avg']['precision']
    recall_macro = report['macro avg']['recall']
    f1_macro = report['macro avg']['f1-score']
    # Store the results for the summary table
    results_summary.append({
        'Model': name,
        'Accuracy': acc,
        'Precision': precision_macro,
        'Recall': recall_macro,
        'F1-Score': f1_macro
    })
    # Print the summary metrics
    print(f"{name} 10-Fold CV results:")
    print(f"  Accuracy  = {acc:.3f}")
    print(f"  Precision = {precision_macro:.3f}")
    print(f"  Recall    = {recall_macro:.3f}")
    print(f"  F1-Score  = {f1_macro:.3f}")
    # Detailed classification report
    print("\nClassification Report:")
    print(classification_report(all_y_true, all_y_pred, target_names=class_names, labels=labels))
    # Confusion matrix heatmap
    cm = confusion_matrix(all_y_true, all_y_pred, labels=labels)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title(f'Confusion Matrix - {name} (10-Fold CV)')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.tight_layout()
    plt.savefig(f'cm_{name.replace(" ", "_")}_10fold.png', dpi=150)
    plt.show()
# ====================== Summary comparison table ======================
print("\n" + "=" * 60)
print("Overall performance comparison of all models (10-Fold CV)")
print("=" * 60)
summary_df = pd.DataFrame(results_summary).round(3)
print(summary_df.to_string(index=False))
# Save the summary as CSV
summary_df.to_csv('model_comparison_summary_10fold_updated.csv', index=False)
print("Summary saved to: model_comparison_summary_10fold_updated.csv")
# ====================== Grouped bar chart of the four metrics ======================
metrics_plot = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
colors = ['#4E79A7', '#F28E2B', '#E15759', '#76B7B2']  # one distinct color per metric
x_pos = np.arange(len(models))
fig, ax = plt.subplots(figsize=(10, 6))
width = 0.2
for i, metric in enumerate(metrics_plot):
    values = summary_df[metric].values
    bars = ax.bar(x_pos + i * width, values, width, label=metric, color=colors[i % len(colors)], alpha=0.8)
    # Add a value label above each bar
    for bar, val in zip(bars, values):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01, f"{val:.3f}",
                ha='center', va='bottom', fontsize=9, fontweight='bold')
ax.set_xlabel('Model')
ax.set_ylabel('Score')
ax.set_title('Model Comparison on Source Domain (41D Features) - 10-Fold CV\n'
             'Metrics: Accuracy, Precision, Recall, F1-Score')
ax.set_xticks(x_pos + width * 1.5)
ax.set_xticklabels(summary_df['Model'])
ax.set_ylim(0, 1.05)  # leave headroom so value labels near 1.0 are not clipped
ax.legend()
ax.grid(True, axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.savefig('model_performance_comparison_4metrics.png', dpi=150)
plt.show()
# ====================== Feature importance (Random Forest only) ======================
if 'Random Forest' in models:
    # Retrain a Random Forest on the full source-domain data
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X, y)  # fit on all source-domain samples
    # New: save the model and its metadata for Task 3
    model_dir = "saved_models"
    os.makedirs(model_dir, exist_ok=True)  # create the output directory if needed
    # Save the trained model
    joblib.dump(
        rf_model,
        os.path.join(model_dir, "random_forest_model.pkl")
    )
    # Save metadata (feature names, class names, labels, etc.)
    metadata = {
        'feature_names': df.columns.drop(['filename', 'label', 'domain']).tolist(),
        'class_names': class_names,
        'labels': sorted(np.unique(y)),
        'preprocessing': 'none'  # the model was trained on unscaled features
    }
    joblib.dump(
        metadata,
        os.path.join(model_dir, "random_forest_metadata.pkl")
    )
    print("Model and metadata saved to the saved_models/ directory")
    # Plot feature importances
    feat_importance = rf_model.feature_importances_
    feature_names = df.columns.drop(['filename', 'label', 'domain'])
    indices = np.argsort(feat_importance)[::-1][:20]
    plt.figure(figsize=(10, 6))
    plt.barh([feature_names[i] for i in indices[::-1]], feat_importance[indices][::-1], color='steelblue')
    plt.xlabel('Feature Importance (Gini)')
    plt.title('Top 20 Important Features - Random Forest (Full Training Set)')
    plt.gca().invert_yaxis()
    plt.grid(True, axis='x', linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.savefig('rf_feature_importance_10fold.png', dpi=150)
    plt.show()
print("\nAll tasks completed. See the generated figures and CSV files.")
One of the outputs of this code is the figure "Top 20 Important Features - Random Forest (Full Training Set)". I would like to build on that figure and produce a two-dimensional version that looks nicer and more polished. Which part of the original code generates the existing one-dimensional bar chart, and what new code should I add, without modifying the existing code, to generate such a two-dimensional figure?
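The one-dimensional bar chart comes from the block at the end of the script, inside "if 'Random Forest' in models:", starting at "feat_importance = rf_model.feature_importances_" and ending with the plt.show() that saves rf_feature_importance_10fold.png. Below is a minimal sketch of one possible two-dimensional view, meant to be appended after that block: it scatters the source-domain samples over the two most important features found by the Random Forest, colored by class. It assumes rf_model, X, y, feature_names, class_names, and labels are already in scope from the script above, and reuses the script's own label-to-name ordering; the names top2, f1_idx, and f2_idx are new, illustrative variables, not part of the original script.

# --- Sketch: 2D scatter of samples in the space of the two most important features ---
# Assumes the script above has already run (rf_model, X, y, feature_names,
# class_names, labels all exist); top2, f1_idx, f2_idx are illustrative names.
top2 = np.argsort(rf_model.feature_importances_)[::-1][:2]  # indices of the two strongest features
f1_idx, f2_idx = int(top2[0]), int(top2[1])
plt.figure(figsize=(8, 6))
for lab, cname in zip(labels, class_names):
    mask = (y == lab)                                  # rows belonging to this class
    plt.scatter(X[mask, f1_idx], X[mask, f2_idx], s=25, alpha=0.7, label=cname)
plt.xlabel(feature_names[f1_idx])
plt.ylabel(feature_names[f2_idx])
plt.title('Samples in the Space of the Two Most Important Features (Random Forest)')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.savefig('rf_top2_feature_scatter.png', dpi=150)
plt.show()

If a different kind of "2D" figure is intended (for example, per-class importance as a heatmap), the same anchor point applies: add the new plotting code after the existing feature-importance block rather than changing it.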