import os
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import (roc_curve, auc, classification_report,
confusion_matrix, accuracy_score)
from sklearn.linear_model import LogisticRegressionCV
from sklearn import svm, naive_bayes
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint, uniform
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
# Matplotlib font setup so CJK (Chinese) axis labels and titles render correctly.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly with CJK fonts
# Custom green -> yellow -> red colormap for risk-level visualisations.
risk_cmap = LinearSegmentedColormap.from_list("risk_cmap", ["#4daf4a", "#ffff33", "#e41a1c"])
# Input/output locations (Windows-specific absolute paths).
outPath = r'D:\PyCharmproject\PychaemProject1\out'
savePath = r'D:\PyCharmproject\PychaemProject1\save'
rfPath = os.path.join(outPath, 'merged.csv')          # full feature table ('rf' presumably radiomics — TODO confirm)
cliPath = os.path.join(outPath, 'cli.csv')            # clinical data table
rfsPath = os.path.join(outPath, 'selected_rf.csv')    # header row = selected 'rf' feature names
clisPath = os.path.join(outPath, 'selected_cli.csv')  # header row = selected clinical feature names
uniPath = os.path.join(outPath, 'uni.csv')            # merged modelling table (written below)
uniprobPath = os.path.join(outPath, 'uniprob.csv')    # merged table + predicted probabilities
colrfsPath = os.path.join(savePath, 'colrfs_ndarr.jlb')    # persisted 'rf' column-name array
colclisPath = os.path.join(savePath, 'colclis_ndarr.jlb')  # persisted clinical column-name array
stdPath = os.path.join(savePath, 'std_uni.jlb')   # persisted fitted StandardScaler
unimPath = os.path.join(savePath, 'mdl_uni.jlb')  # persisted final model
# ---- Data preparation ------------------------------------------------------
# Selected feature names come from the selected_* CSV headers; an ID column is
# appended to each list so the two tables can be joined.
col_rfs = np.append(pd.read_csv(rfsPath).columns.values, 'ID')
col_clis = np.append(pd.read_csv(clisPath).columns.values, '序号')
# Persist the column-name arrays for reuse at inference time.
# NOTE(review): [:-2] drops the just-appended ID column plus the last original
# column — presumably a label column at the end of each selected CSV; confirm.
joblib.dump(col_rfs[:-2], colrfsPath)
joblib.dump(col_clis[:-2], colclisPath)
# Load only the selected columns from the full tables.
df_rfs = pd.read_csv(rfPath)[col_rfs]
df_clis = pd.read_csv(cliPath)[col_clis]
# Inner-join the two tables on patient identifier ('ID' vs '序号').
df_uni = pd.merge(df_rfs, df_clis, left_on='ID', right_on='序号')
del df_uni['Enlarge']
del df_uni['ID']
df_uni.dropna(axis=0, inplace=True)  # drop any row with a missing value
df_uni.to_csv(uniPath, index=False)
# ---- Features and target ---------------------------------------------------
# Target: '血肿扩大' (hematoma expansion) — treated as binary downstream.
y = df_uni['血肿扩大'].values
del df_uni['血肿扩大']
del df_uni['序号']
X = df_uni.values
features = df_uni.columns.values  # feature names aligned with X's columns
print(f"目标变量形状: {y.shape}")
print(f"特征变量形状: {X.shape}")
# Standardise features to zero mean / unit variance, and persist the fitted
# scaler so the identical transform can be reapplied at inference time.
stdsc = StandardScaler()
X_std = stdsc.fit_transform(X)
joblib.dump(stdsc, stdPath)
# Hold out 20% of the samples as a test set; seed fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_std, y, test_size=0.2, random_state=2022)
print(f'训练集大小: {len(X_train)}')
print(f'测试集大小: {len(X_test)}')
# ---- Baseline classifiers --------------------------------------------------
# Logistic regression with built-in cross-validated regularisation search.
lr = LogisticRegressionCV(cv=5,
                          scoring='roc_auc',
                          max_iter=10000,
                          class_weight='balanced',
                          verbose=1,
                          random_state=2022)
lr.fit(X_train, y_train)
print(f'最佳正则化参数: {lr.C_}')
# RBF-kernel SVM; probability=True enables predict_proba for the ROC curves.
svm_rbf = svm.SVC(C=1,
                  kernel='rbf',
                  probability=True,
                  class_weight='balanced',
                  verbose=True,
                  random_state=2022)
svm_rbf.fit(X_train, y_train)
# k-nearest neighbours with default settings.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
# Gaussian naive Bayes with default settings.
gnb = naive_bayes.GaussianNB()
gnb.fit(X_train, y_train)
# ---- Random forest tuned with randomized search ----------------------------
rfc = RandomForestClassifier(
    bootstrap=True,
    oob_score=True,  # out-of-bag score as a free validation signal
    random_state=2022,
    n_jobs=-1,
    class_weight='balanced'
)
# Search space: integer ranges via randint, a continuous max_samples range via
# uniform(loc=0.7, scale=0.2) i.e. [0.7, 0.9), plus small categorical choices.
param_dist_rfc = {
    'n_estimators': randint(low=50, high=200),
    'max_depth': randint(low=3, high=10),
    'min_samples_split': randint(low=5, high=30),
    'min_samples_leaf': randint(low=2, high=8),
    'max_features': ['sqrt', 'log2', 0.3, 0.5, 0.7],
    'max_samples': uniform(loc=0.7, scale=0.2),
    'criterion': ['gini', 'entropy'],
    'min_weight_fraction_leaf': [0.0, 0.01, 0.02]
}
# 50 random candidates, 3-fold CV, ROC-AUC as the selection metric.
optimizer_rfc = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=param_dist_rfc,
    n_iter=50,
    cv=3,
    scoring='roc_auc',
    verbose=5,
    random_state=2022,
    n_jobs=-1
).fit(X_train, y_train)
best_rfc = optimizer_rfc.best_estimator_
print("随机森林最佳参数组合:", optimizer_rfc.best_params_)
print("随机森林最佳交叉验证AUC:", optimizer_rfc.best_score_)
# ---- XGBoost tuned with an exhaustive grid search --------------------------
# random_state is pinned to 2022 for consistency with every other model in
# this file, making the search reproducible.
xgbcf = xgb.XGBClassifier(objective='binary:logistic', booster='gbtree',
                          random_state=2022)
# 2^6 = 64 candidate configurations over the key regularisation knobs.
param_grid_xgb = {
    'learning_rate': [0.01, 0.05],
    'gamma': [0.1, 0.3],
    'max_depth': [3, 5],
    'subsample': [0.6, 0.8],
    'colsample_bytree': [0.6, 0.8],
    'min_child_weight': [1, 2]
}
# n_jobs=-1 parallelises the 64 x 3-fold fits across all cores.
optimizer_xgb = GridSearchCV(xgbcf,
                             param_grid=param_grid_xgb,
                             cv=3,
                             scoring='roc_auc',
                             verbose=10,
                             n_jobs=-1).fit(X_train, y_train)
# Report the winning configuration, mirroring the random-forest section above.
print("XGBoost最佳参数组合:", optimizer_xgb.best_params_)
print("XGBoost最佳交叉验证AUC:", optimizer_xgb.best_score_)
# ---- Feature importances from the two tree ensembles -----------------------
def _print_feature_importances(title, importances):
    """Print every feature ranked by descending importance for one model.

    :param title: model name used in the section header
    :param importances: 1-D array aligned with the global ``features`` array
    """
    order = np.argsort(importances)[::-1]  # indices, most important first
    print(f"\n{title}特征重要性:")
    for rank, idx in enumerate(order, start=1):
        print(f"{rank:2d}) {features[idx]:<60} {importances[idx]:.6f}")


# Same report for both tuned ensembles (previously two duplicated loops).
_print_feature_importances("随机森林", optimizer_rfc.best_estimator_.feature_importances_)
_print_feature_importances("XGBoost", optimizer_xgb.best_estimator_.feature_importances_)
# ---- Model registry for evaluation and plotting ----------------------------
# The three lists are index-aligned and always consumed together via zip().
model_names = ['LR', 'SVM', 'KNN', 'GNB', 'RF', 'XGB']
model_instances = [
    lr, svm_rbf, knn, gnb, optimizer_rfc.best_estimator_,
    optimizer_xgb.best_estimator_
]
plot_colors = [
    'orange', 'gold', 'mediumseagreen', 'steelblue', 'mediumpurple', 'crimson'
]
# ---- ROC curves on the training set ----------------------------------------
plt.figure(figsize=(12, 10), dpi=300)
for name, instance, color in zip(model_names, model_instances, plot_colors):
    # Positive-class probabilities drive the ROC curve.
    prob_train = instance.predict_proba(X_train)[:, 1]
    fpr, tpr, _ = roc_curve(y_train, prob_train, pos_label=1)
    train_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=3, color=color,
             label=f'{name} (AUC={train_auc:.3f})')
# Diagonal = no-skill reference.
plt.plot([0, 1], [0, 1], '--', lw=3, color='grey')
plt.axis('square')
plt.xlabel('假阳性率 (FPR)', fontsize=14)
plt.ylabel('真阳性率 (TPR)', fontsize=14)
plt.title('训练集ROC曲线', fontsize=18)
plt.legend(loc='lower right', fontsize=12)
plt.grid(alpha=0.3)
plt.savefig(os.path.join(outPath, 'train_roc_curves.svg'), format='svg')
plt.show()
# ---- ROC curves and metrics on the held-out test set -----------------------
plt.figure(figsize=(12, 10), dpi=300)
test_aucs = []
test_accuracies = []
for (name, instance, color) in zip(model_names, model_instances, plot_colors):
    y_test_preds = instance.predict(X_test)
    y_test_predprob = instance.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_test_predprob, pos_label=1)
    roc_auc = auc(fpr, tpr)
    accuracy = accuracy_score(y_test, y_test_preds)
    test_aucs.append(roc_auc)
    test_accuracies.append(accuracy)
    # Per-model precision/recall/F1 ('稳定' = stable, '扩大' = expansion).
    print(f'\n=========={name}模型评估==========')
    print(classification_report(y_test, y_test_preds, target_names=['稳定', '扩大']))
    print(f'准确率: {accuracy:.4f}')
    print(f'AUC值: {roc_auc:.4f}')
    plt.plot(fpr,
             tpr,
             lw=3,
             label=f'{name} (AUC={roc_auc:.3f})',
             color=color)
# Diagonal = no-skill reference.
plt.plot([0, 1], [0, 1], '--', lw=3, color='grey')
plt.axis('square')
plt.xlabel('假阳性率 (FPR)', fontsize=14)
plt.ylabel('真阳性率 (TPR)', fontsize=14)
plt.title('测试集ROC曲线', fontsize=18)
plt.legend(loc='lower right', fontsize=12)
plt.grid(alpha=0.3)
plt.savefig(os.path.join(outPath, 'test_roc_curves.svg'), format='svg')
plt.show()
# ---- Test-set accuracy comparison (bar chart) ------------------------------
plt.figure(figsize=(10, 6), dpi=300)
positions = np.arange(len(model_names))
plt.bar(positions, test_accuracies, color=plot_colors, alpha=0.7)
plt.xticks(positions, model_names, fontsize=12)
plt.ylabel('准确率', fontsize=14)
plt.title('各模型在测试集上的准确率对比', fontsize=16)
plt.ylim(0, 1.0)
# Print each model's accuracy just above its bar.
for idx, acc in enumerate(test_accuracies):
    plt.text(idx, acc + 0.01, f'{acc:.3f}', ha='center', fontsize=10)
plt.grid(axis='y', alpha=0.3)
plt.savefig(os.path.join(outPath, 'model_accuracy_comparison.svg'), format='svg')
plt.show()
# ---- Confusion matrices (raw counts) for all six models --------------------
fig, axes = plt.subplots(2, 3, figsize=(18, 12), dpi=300)
axes = axes.flatten()
for i, (name, instance, color) in enumerate(zip(model_names, model_instances, plot_colors)):
    y_test_preds = instance.predict(X_test)
    cm = confusion_matrix(y_test, y_test_preds)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[i],
                cbar=False, annot_kws={"size": 12})
    axes[i].set_title(f'{name}模型混淆矩阵 (数量)', fontsize=14)
    axes[i].set_xlabel('预测标签', fontsize=12)
    axes[i].set_ylabel('真实标签', fontsize=12)
    axes[i].set_xticklabels(['稳定', '扩大'])
    axes[i].set_yticklabels(['稳定', '扩大'])
plt.tight_layout()
plt.savefig(os.path.join(outPath, 'confusion_matrices.svg'), format='svg')
plt.show()
# ---- Persist the chosen final model (the tuned random forest) --------------
chosen_model = optimizer_rfc.best_estimator_
joblib.dump(chosen_model, unimPath)
print(f"最佳模型已保存至: {unimPath}")
# ====================== Risk stratification system ======================
def create_risk_stratification(y_true, y_prob, n_bins=5, strategy='quantile'):
    """
    Build a risk-stratification table from predicted probabilities.

    :param y_true: array-like of true binary labels (1 = event)
    :param y_prob: array-like of predicted positive-class probabilities
    :param n_bins: number of risk groups
    :param strategy: binning strategy ('quantile' = equal-frequency,
                     'uniform' = equal-width)
    :return: (df_risk, risk_stats) — per-sample assignments, and a per-group
             summary sorted from highest to lowest risk score
    """
    # Accept plain Python sequences as well as ndarrays (reshape below
    # requires an ndarray).
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    # Discretise probabilities into ordinal risk groups 0 .. n_bins-1.
    discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy=strategy)
    risk_groups = discretizer.fit_transform(y_prob.reshape(-1, 1)).flatten().astype(int)
    # NOTE(review): with duplicate quantile edges KBinsDiscretizer may emit
    # fewer than n_bins distinct groups; label lookup stays in range either way.
    group_labels = [f'第{i + 1}风险组' for i in range(n_bins)]
    df_risk = pd.DataFrame({
        'True': y_true,
        'Prob': y_prob,
        'Risk_Group': risk_groups
    })
    # Map the ordinal group id to its human-readable label.
    df_risk['Risk_Label'] = df_risk['Risk_Group'].apply(lambda x: group_labels[x])
    # Per-group summary: sample count, observed event rate, probability range.
    risk_stats = df_risk.groupby('Risk_Label').agg(
        Sample_Count=('Prob', 'count'),
        Actual_Risk=('True', lambda x: f"{x.mean() * 100:.1f}%"),
        Avg_Probability=('Prob', lambda x: f"{x.mean():.3f}"),
        Min_Probability=('Prob', 'min'),
        Max_Probability=('Prob', 'max'),
        Risk_Score=('Risk_Group', 'mean')  # mean ordinal bin id = risk score
    ).reset_index()
    # Human-readable probability range per group.
    risk_stats['Risk_Score_Range'] = risk_stats.apply(
        lambda row: f"{row['Min_Probability']:.3f}-{row['Max_Probability']:.3f}", axis=1)
    # Highest-risk group first, so callers can use .iloc[0] for "highest risk".
    risk_stats = risk_stats.sort_values('Risk_Score', ascending=False)
    return df_risk, risk_stats
# Score every sample (train + test) with the chosen model.
full_probs = chosen_model.predict_proba(X_std)[:, 1]
# Five equal-frequency (quantile) risk groups.
df_risk, risk_stats = create_risk_stratification(y, full_probs, n_bins=5, strategy='quantile')
# Attach per-sample risk info back onto the saved modelling table.
# NOTE(review): relies on uni.csv preserving row order so df_uniprob and
# df_risk share the same RangeIndex — confirm if uni.csv is edited externally.
df_uniprob = pd.read_csv(uniPath)
df_uniprob['Prob'] = full_probs
df_uniprob['Risk_Group'] = df_risk['Risk_Group']
df_uniprob['Risk_Label'] = df_risk['Risk_Label']
# Persist the table with risk-stratification columns included.
df_uniprob.to_csv(uniprobPath, index=False)
# ---- Risk stratification dashboard (2x2 figure) ----------------------------
plt.figure(figsize=(16, 12), dpi=300)
plt.suptitle('血肿扩大风险分层分析', fontsize=20)
# 1. Observed hematoma-expansion rate per risk group.
plt.subplot(2, 2, 1)
# Actual_Risk is stored as a percent string; convert back to a 0-1 float here.
ax1 = sns.barplot(x='Risk_Label', y='Actual_Risk',
                  data=risk_stats.assign(Actual_Risk=risk_stats['Actual_Risk'].str.rstrip('%').astype('float') / 100),
                  palette=risk_cmap(np.linspace(0, 1, len(risk_stats))),
                  saturation=0.8)
plt.title('各风险组实际血肿扩大率', fontsize=16)
plt.xlabel('风险组别', fontsize=14)
plt.ylabel('血肿扩大比例', fontsize=14)
plt.ylim(0, 1.0)
# Percentage labels above each bar.
for p in ax1.patches:
    height = p.get_height()
    ax1.annotate(f'{height:.2%}',
                 (p.get_x() + p.get_width() / 2., height),
                 ha='center', va='center',
                 xytext=(0, 10),
                 textcoords='offset points',
                 fontsize=12)
# 2. Predicted-probability distribution within each risk group.
plt.subplot(2, 2, 2)
sns.boxplot(x='Risk_Label', y='Prob', data=df_risk,
            palette=risk_cmap(np.linspace(0, 1, len(risk_stats))),
            showfliers=False)
plt.title('各风险组预测概率分布', fontsize=16)
plt.xlabel('风险组别', fontsize=14)
plt.ylabel('预测概率', fontsize=14)
plt.ylim(0, 1.0)
# 3. Feature profile of the highest-risk group.
plt.subplot(2, 2, 3)
high_risk_group = df_risk[df_risk['Risk_Group'] == df_risk['Risk_Group'].max()]
# NOTE(review): indexing df_uniprob with df_risk's index assumes both share
# the same RangeIndex (true when uni.csv preserved row order) — confirm.
high_risk_features = df_uniprob.loc[high_risk_group.index, features].mean()
all_features = df_uniprob[features].mean()
# Ratio of high-risk-group mean to the overall mean; top 10 most elevated.
feature_ratio = (high_risk_features / all_features).sort_values(ascending=False)[:10]
sns.barplot(x=feature_ratio.values, y=feature_ratio.index,
            palette='rocket_r', saturation=0.8)
plt.title('高风险组特征值与总体平均值的比率(Top 10)', fontsize=16)
plt.xlabel('特征值比率(高风险组/总体)', fontsize=14)
plt.ylabel('特征', fontsize=14)
plt.xlim(0.8, max(feature_ratio.values) * 1.1)
# Reference line at ratio == 1 (no difference from the overall mean).
plt.axvline(1.0, color='red', linestyle='--', alpha=0.7)
# 4. Relationship between risk score and observed expansion rate.
plt.subplot(2, 2, 4)
# Bin probabilities into 20 equal-width intervals for a finer-grained score.
score_bins = pd.cut(df_risk['Prob'], bins=20)
# observed=False keeps all 20 categorical bins (current pandas default,
# made explicit to silence the deprecation warning).
score_stats = df_risk.groupby(score_bins, observed=False).agg(
    Risk_Score=('Prob', 'mean'),
    Actual_Risk=('True', 'mean')
).reset_index()
# Empty bins yield NaN means which would break the quadratic fit in regplot
# and make np.corrcoef return NaN below — drop them.
score_stats = score_stats.dropna(subset=['Risk_Score', 'Actual_Risk'])
sns.regplot(x='Risk_Score', y='Actual_Risk', data=score_stats,
            scatter_kws={'s': 80, 'alpha': 0.7},
            line_kws={'color': 'red', 'lw': 2},
            order=2)  # quadratic polynomial fit
plt.title('风险评分与血肿扩大概率关系', fontsize=16)
plt.xlabel('风险评分', fontsize=14)
plt.ylabel('实际血肿扩大概率', fontsize=14)
plt.ylim(0, 1.0)
plt.grid(alpha=0.3)
# Annotate the Pearson correlation between mean score and observed risk.
corr_coef = np.corrcoef(score_stats['Risk_Score'], score_stats['Actual_Risk'])[0, 1]
plt.annotate(f'相关系数: {corr_coef:.3f}', xy=(0.05, 0.9),
             xycoords='axes fraction', fontsize=14)
plt.tight_layout(rect=[0, 0, 1, 0.96])  # leave room for the suptitle
plt.savefig(os.path.join(outPath, 'risk_stratification_analysis.svg'), format='svg')
plt.show()
# ---- Highest-risk group summary --------------------------------------------
# risk_stats is sorted by Risk_Score descending, so iloc[0] is the top group.
high_risk_label = risk_stats.iloc[0]['Risk_Label']
high_risk_group = df_uniprob[df_uniprob['Risk_Label'] == high_risk_label]
high_risk_rate = high_risk_group['血肿扩大'].mean()
print(f"\n高风险组分析({high_risk_label}):")
print(f"患者数量: {len(high_risk_group)}")
print(f"血肿扩大比例: {high_risk_rate * 100:.1f}%")
print(f"平均预测概率: {high_risk_group['Prob'].mean():.3f}")
print(f"预测概率范围: {high_risk_group['Prob'].min():.3f} - {high_risk_group['Prob'].max():.3f}")
# Persist the per-group summary table.
risk_stats.to_csv(os.path.join(outPath, 'risk_statistics.csv'), index=False)
print(f"\n风险分层统计已保存至: {os.path.join(outPath, 'risk_statistics.csv')}")
# ---- Predicted-probability histogram with risk-group boundaries ------------
plt.figure(figsize=(14, 8), dpi=300)
probs_e = df_uniprob['Prob'][df_uniprob['血肿扩大'] == 1]  # expansion group
probs_s = df_uniprob['Prob'][df_uniprob['血肿扩大'] == 0]  # stable group
# Group boundaries = per-group minimum probabilities; the smallest one is the
# global minimum rather than a boundary, so it is skipped.
risk_bins = risk_stats['Min_Probability'].values
risk_bins = np.sort(risk_bins)[1:]
plt.hist(probs_e,
         bins=50,
         color='r',
         alpha=0.7,
         label='血肿扩大组')
plt.hist(probs_s,
         bins=50,
         color='g',
         alpha=0.7,
         label='血肿稳定组')
# Vertical dashed lines at each risk-group boundary.
for bin_val in risk_bins:
    plt.axvline(x=bin_val, color='blue', linestyle='--', alpha=0.7)
plt.title('预测概率分布与风险分界', fontsize=18)
plt.xlabel('预测概率', fontsize=14)
plt.ylabel('样本数量', fontsize=14)
plt.legend(loc='upper right', fontsize=12)
plt.grid(alpha=0.3)
# Label interior groups midway between consecutive boundaries...
for i, (min_val, max_val) in enumerate(zip(risk_bins[:-1], risk_bins[1:])):
    plt.annotate(f'风险组 {i + 2}',
                 xy=((min_val + max_val) / 2, plt.ylim()[1] * 0.8),
                 ha='center', fontsize=10, color='blue')
# ...and the first group (from 0) and last group (up to 1) at the edges.
plt.annotate('风险组 1',
             xy=(risk_bins[0] / 2, plt.ylim()[1] * 0.8),
             ha='center', fontsize=10, color='blue')
plt.annotate(f'风险组 {len(risk_bins) + 1}',
             xy=((risk_bins[-1] + 1) / 2, plt.ylim()[1] * 0.8),
             ha='center', fontsize=10, color='blue')
plt.savefig(os.path.join(outPath, 'probability_distribution.svg'), format='svg')
plt.show()
# ---- Clinical decision curve analysis (DCA) --------------------------------
def plot_decision_curve(y_true, y_prob, label='模型'):
    """
    Plot a clinical decision curve (net benefit vs. threshold probability).

    :param y_true: array-like of true binary labels (1 = event)
    :param y_prob: array-like of predicted positive-class probabilities
    :param label: legend label for the curve
    :return: (thresholds, net_benefits) used for the plotted curve
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    n = len(y_true)  # invariant: hoisted out of the loop
    # Stop short of 1.0: the weight t/(1-t) diverges at t == 1, which
    # previously produced a NaN/inf final point on the curve.
    thresholds = np.linspace(0, 1, 100, endpoint=False)
    net_benefits = []
    for threshold in thresholds:
        y_pred = (y_prob >= threshold).astype(int)
        # Count TP/FP directly with numpy; unlike confusion_matrix().ravel()
        # this cannot break when predictions collapse to a single class.
        tp = int(np.sum((y_pred == 1) & (y_true == 1)))
        fp = int(np.sum((y_pred == 1) & (y_true == 0)))
        # Net benefit = TP rate minus FP rate weighted by the threshold odds.
        net_benefit = (tp / n) - (fp / n) * (threshold / (1 - threshold))
        net_benefits.append(net_benefit)
    plt.plot(thresholds, net_benefits, label=label, lw=2)
    # Zero-benefit reference line.
    plt.axhline(y=0, color='black', linestyle='--', lw=1)
    return thresholds, net_benefits
plt.figure(figsize=(12, 8), dpi=300)
plot_decision_curve(y, full_probs, '风险评分模型')
# Reference strategies: treat nobody (net benefit 0 everywhere)...
plt.plot([0, 1], [0, 0], 'k--', label='不干预策略')
# NOTE(review): the "treat everyone" line below is a simplified placeholder,
# not the true treat-all curve (prevalence - (1-prevalence) * t/(1-t)) — confirm
# whether this simplification is acceptable for publication.
plt.plot([0, 1], [0, 0.1], 'k:', label='干预所有策略') # 简化示例
plt.xlabel('风险阈值概率', fontsize=14)
plt.ylabel('净收益', fontsize=14)
plt.title('临床决策曲线分析 (DCA)', fontsize=18)
plt.legend(loc='upper right', fontsize=12)
plt.grid(alpha=0.3)
plt.xlim(0, 1)
plt.ylim(-0.1, 0.3)  # clamp the y-range for readability
plt.savefig(os.path.join(outPath, 'decision_curve_analysis.svg'), format='svg')
plt.show()
print("风险分层系统构建完成,所有结果已保存至输出目录")
# NOTE(review): stray non-code line — original text "把生成的图片变成svg格式"
# ("save the generated images as SVG"). Left as a comment because as bare code
# it would raise a NameError; all figures above are already saved in SVG format.