```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import time
# 1. Generate a synthetic binary-classification dataset.
#    Of the 10 features, 8 are informative and 2 are redundant; the fixed
#    seed keeps the generated data reproducible between runs.
dataset_options = dict(
    n_samples=1000,
    n_features=10,
    n_informative=8,
    n_redundant=2,
    n_clusters_per_class=1,
    random_state=42,
)
X, y = make_classification(**dataset_options)

# 2. Print basic facts about the dataset: size, width and class balance.
n_rows, n_cols = X.shape
class0_count = (y == 0).sum()
class1_count = (y == 1).sum()
print("=== 数据集基本信息 ===")
print(f"样本数量: {n_rows}")
print(f"特征数量: {n_cols}")
print(f"类别分布: 类别0={class0_count}, 类别1={class1_count}")
# 3. Split into training (80%) and test (20%) sets.
#    `stratify=y` keeps the class proportions the same in both splits.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts
# 4. Standardize the features.
#    The scaler is fitted on the training split only and then applied to
#    both splits, so no statistics from the test set leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# 5. Configure the random-forest classifier (not fitted here).
rf_settings = {
    "n_estimators": 100,
    "max_depth": 10,
    "min_samples_split": 5,
    "min_samples_leaf": 2,
    "random_state": 42,
}
model = RandomForestClassifier(**rf_settings)
# 6. Record performance metrics while the forest grows.
#    Each "epoch" evaluates a forest with 10 more trees than the previous
#    one, capped at 100 trees (a simulation of incremental learning).
train_losses = []      # training error rate (1 - accuracy) per epoch
test_accuracies = []   # test-set accuracy per epoch
epochs = 50
print("\n开始训练...")
fitted_trees = None    # tree count of the most recently fitted forest
for epoch in range(1, epochs + 1):
    n_trees = min(10 * epoch, 100)
    if n_trees != fitted_trees:
        # Refit only when the forest size changes. With a fixed seed and
        # fixed data, refitting the same size reproduces the same model,
        # so once the cap is reached the previous metrics are reused
        # instead of retraining an identical forest every epoch.
        temp_model = RandomForestClassifier(
            n_estimators=n_trees,
            max_depth=10,
            random_state=42,
        )
        temp_model.fit(X_train_scaled, y_train)
        fitted_trees = n_trees
        # Predict on both splits and derive the tracked metrics.
        y_train_pred = temp_model.predict(X_train_scaled)
        y_test_pred = temp_model.predict(X_test_scaled)
        train_loss = 1 - accuracy_score(y_train, y_train_pred)
        test_acc = accuracy_score(y_test, y_test_pred)
    train_losses.append(train_loss)
    test_accuracies.append(test_acc)
    if epoch % 10 == 0:
        # Report the total from `epochs` so the message stays correct if
        # the epoch count is changed.
        print(f"Epoch [{epoch}/{epochs}] | Train Error: {train_loss:.4f} | Test Acc: {test_acc:.4f}")
    time.sleep(0.01)  # small artificial delay so progress is easier to watch
# 7. Fit the final model and predict on the held-out test set.
#    Reuse the estimator configured in step 5 (`model`) so that its
#    hyperparameters actually take effect, instead of building a second
#    forest that silently falls back to the library defaults.
final_model = model
final_model.fit(X_train_scaled, y_train)
y_final_pred = final_model.predict(X_test_scaled)

# 8. Print the headline evaluation metrics.
print("\n=== 最终模型评估结果 ===")
print(f"测试集准确率: {accuracy_score(y_test, y_final_pred):.4f}")
# average='binary' reports the scores for the positive class (label 1) only.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_final_pred, average='binary'
)
print(f"精确率: {precision:.4f} | 召回率: {recall:.4f} | F1分数: {f1:.4f}")

# 9. Per-class classification report.
print("\n分类报告:")
print(classification_report(y_test, y_final_pred, target_names=['Negative', 'Positive']))
# 10. Plot the training-process curves side by side.
#     Each series is plotted against 1-based epoch numbers so the x axis
#     matches the epoch numbering used during training (a bare
#     `plot(values)` would start the axis at 0).
fig, (error_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(14, 6))
curve_panels = (
    (error_ax, train_losses, 'Training Error', 'red',
     'Error Rate', 'Training Error Over Epochs'),
    (accuracy_ax, test_accuracies, 'Test Accuracy', 'blue',
     'Accuracy', 'Test Accuracy Over Epochs'),
)
for ax, series, label, color, y_label, title in curve_panels:
    epoch_axis = range(1, len(series) + 1)
    ax.plot(epoch_axis, series, label=label, color=color)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()
# 11. Confusion-matrix heatmap for the final predictions.
#     Rows are true labels and columns are predicted labels.
cm = confusion_matrix(y_test, y_final_pred)
class_ticks = ['0', '1']
heatmap_style = dict(
    annot=True,   # write the count inside every cell
    fmt="d",      # counts are integers
    cmap="Blues",
    xticklabels=class_ticks,
    yticklabels=class_ticks,
)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, **heatmap_style)
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title('Confusion Matrix')
plt.show()
# 12. Bar chart of the model's feature importances, most important first.
importances = final_model.feature_importances_
indices = np.argsort(importances)[::-1]  # feature indices, descending importance
# Show at most the top 10 features. Deriving the count from the data keeps
# the bar positions and heights the same length when the model has fewer
# than 10 features.
top_k = min(10, len(indices))
top_indices = indices[:top_k]
bar_positions = range(top_k)
plt.figure(figsize=(10, 6))
plt.title("Feature Importances")
plt.bar(bar_positions, importances[top_indices], color='skyblue', edgecolor='black')
plt.xticks(bar_positions, [f'F{idx}' for idx in top_indices])
plt.tight_layout()
plt.show()
print("可视化完成!")
```