import os
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import (roc_curve, auc, classification_report,
confusion_matrix, accuracy_score)
from sklearn.linear_model import LogisticRegressionCV
from sklearn import svm, naive_bayes
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint, uniform  # parameter distributions for randomized search
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns  # confusion-matrix heatmaps
# Configure CJK-capable fonts so the Chinese column/feature names render in plots
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly with these fonts
outPath = r'D:\PyCharmproject\PychaemProject1\out'
savePath = r'D:\PyCharmproject\PychaemProject1\save'
rfPath = os.path.join(outPath, 'merged.csv')
cliPath = os.path.join(outPath, 'cli.csv')
rfsPath = os.path.join(outPath, 'selected_rf.csv')
clisPath = os.path.join(outPath, 'selected_cli.csv')
uniPath = os.path.join(outPath, 'uni.csv')
uniprobPath = os.path.join(outPath, 'uniprob.csv')
colrfsPath = os.path.join(savePath, 'colrfs_ndarr.jlb')
colclisPath = os.path.join(savePath, 'colclis_ndarr.jlb')
stdPath = os.path.join(savePath, 'std_uni.jlb')
unimPath = os.path.join(savePath, 'mdl_uni.jlb')
# Data preparation
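# The selected-feature CSVs hold the chosen column names; the merge keys
# ('ID' for the radiomics table, '序号' for the clinical table) are appended
# so the two sources can be joined. The [:-2] slices below persist feature
# names only, dropping the appended key plus the CSV's trailing column
# (presumably the target) for reuse at inference time.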
col_rfs = np.append(pd.read_csv(rfsPath).columns.values, 'ID')
col_clis = np.append(pd.read_csv(clisPath).columns.values, '序号')
joblib.dump(col_rfs[:-2], colrfsPath)
joblib.dump(col_clis[:-2], colclisPath)
df_rfs = pd.read_csv(rfPath)[col_rfs]
df_clis = pd.read_csv(cliPath)[col_clis]
df_uni = pd.merge(df_rfs, df_clis, left_on='ID', right_on='序号')
del df_uni['Enlarge']
del df_uni['ID']
df_uni.dropna(axis=0, inplace=True)
df_uni.to_csv(uniPath, index=False)
# Features and target
y = df_uni['血肿扩大'].values
del df_uni['血肿扩大']
del df_uni['序号']
X = df_uni.values
features = df_uni.columns.values
print(f"Target shape: {y.shape}")
print(f"Feature matrix shape: {X.shape}")
# Standardize features
stdsc = StandardScaler().fit(X)
joblib.dump(stdsc, stdPath)
X_std = stdsc.transform(X)
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_std,
y,
test_size=0.2,
random_state=2022)
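# (With the class imbalance implied by class_weight='balanced' below, passing
# stratify=y above would keep the class ratio consistent across the split.)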
print(f'Training set size: {len(X_train)}')
print(f'Test set size: {len(X_test)}')
# Model training
# Logistic regression (built-in CV selects the regularization strength)
lr = LogisticRegressionCV(cv=5,
scoring='roc_auc',
max_iter=10000,
class_weight='balanced',
verbose=1,
random_state=2022).fit(X_train, y_train)
print(f'Best regularization parameter C: {lr.C_}')
# SVM
svm_rbf = svm.SVC(C=1,
kernel='rbf',
probability=True,
class_weight='balanced',
verbose=True,
random_state=2022).fit(X_train, y_train)
# KNN
knn = KNeighborsClassifier().fit(X_train, y_train)
# Gaussian naive Bayes
gnb = naive_bayes.GaussianNB().fit(X_train, y_train)
# Random forest (tuned with randomized search)
rfc = RandomForestClassifier(
bootstrap=True,
oob_score=True,
random_state=2022,
n_jobs=-1,
class_weight='balanced'
)
# Random forest search space
param_dist_rfc = {
'n_estimators': randint(low=50, high=200),
'max_depth': randint(low=3, high=10),
'min_samples_split': randint(low=5, high=30),
'min_samples_leaf': randint(low=2, high=8),
'max_features': ['sqrt', 'log2', 0.3, 0.5, 0.7],
    'max_samples': uniform(loc=0.7, scale=0.2),  # samples uniformly from [0.7, 0.9]
'criterion': ['gini', 'entropy'],
'min_weight_fraction_leaf': [0.0, 0.01, 0.02]
}
# Randomized search over the space above
optimizer_rfc = RandomizedSearchCV(
estimator=rfc,
param_distributions=param_dist_rfc,
n_iter=50,
cv=3,
scoring='roc_auc',
verbose=5,
random_state=2022,
n_jobs=-1
).fit(X_train, y_train)
best_rfc = optimizer_rfc.best_estimator_
print("Best random forest parameters:", optimizer_rfc.best_params_)
print("Best random forest cross-validated AUC:", optimizer_rfc.best_score_)
# XGBoost (tuned with grid search)
xgbcf = xgb.XGBClassifier(objective='binary:logistic', booster='gbtree')
param_grid_xgb = {
'learning_rate': [0.01, 0.05],
'gamma': [0.1, 0.3],
'max_depth': [3, 5],
'subsample': [0.6, 0.8],
'colsample_bytree': [0.6, 0.8],
'min_child_weight': [1, 2]
}
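# (If the classes are imbalanced, scale_pos_weight could also be added to this grid.)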
optimizer_xgb = GridSearchCV(xgbcf,
param_grid=param_grid_xgb,
cv=3,
scoring='roc_auc',
verbose=10).fit(X_train, y_train)
# Feature importances (random forest)
importances = best_rfc.feature_importances_
indices = np.argsort(importances)[::-1]
print("\nRandom forest feature importances:")
for f in range(X.shape[1]):
    print(f"{f + 1:2d}) {features[indices[f]]:<60} {importances[indices[f]]:.6f}")
# Feature importances (XGBoost)
importances = optimizer_xgb.best_estimator_.feature_importances_
indices = np.argsort(importances)[::-1]
print("\nXGBoost feature importances:")
for f in range(X.shape[1]):
    print(f"{f + 1:2d}) {features[indices[f]]:<60} {importances[indices[f]]:.6f}")
# Model evaluation and visualization
model_names = ['LR', 'SVM', 'KNN', 'GNB', 'RF', 'XGB']
model_instances = [
    lr, svm_rbf, knn, gnb, best_rfc,
optimizer_xgb.best_estimator_
]
plot_colors = [
'orange', 'gold', 'mediumseagreen', 'steelblue', 'mediumpurple', 'crimson'
]
# 1. ROC curves on the training set
plt.figure(figsize=(12, 10), dpi=300)
for (name, instance, color) in zip(model_names, model_instances, plot_colors):
    y_train_predprob = instance.predict_proba(X_train)[:, 1]
    fpr, tpr, _ = roc_curve(y_train, y_train_predprob, pos_label=1)
    plt.plot(fpr,
             tpr,
             lw=3,
             label=f'{name} (AUC={auc(fpr, tpr):.3f})',
             color=color)
plt.plot([0, 1], [0, 1], '--', lw=3, color='grey')
plt.axis('square')
plt.xlabel('False positive rate (FPR)', fontsize=14)
plt.ylabel('True positive rate (TPR)', fontsize=14)
plt.title('ROC curves (training set)', fontsize=18)
plt.legend(loc='lower right', fontsize=12)
plt.grid(alpha=0.3)
plt.show()
# 2. ROC curves on the test set
plt.figure(figsize=(12, 10), dpi=300)
test_aucs = []
test_accuracies = []
for (name, instance, color) in zip(model_names, model_instances, plot_colors):
    y_test_preds = instance.predict(X_test)
    y_test_predprob = instance.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_test_predprob, pos_label=1)
    roc_auc = auc(fpr, tpr)
    accuracy = accuracy_score(y_test, y_test_preds)
    test_aucs.append(roc_auc)
    test_accuracies.append(accuracy)
    print(f'\n========== {name} model evaluation ==========')
    print(classification_report(y_test, y_test_preds,
                                target_names=['Stable', 'Expansion']))
    print(f'Accuracy: {accuracy:.4f}')
    print(f'AUC: {roc_auc:.4f}')
    plt.plot(fpr,
             tpr,
             lw=3,
             label=f'{name} (AUC={roc_auc:.3f})',
             color=color)
plt.plot([0, 1], [0, 1], '--', lw=3, color='grey')
plt.axis('square')
plt.xlabel('False positive rate (FPR)', fontsize=14)
plt.ylabel('True positive rate (TPR)', fontsize=14)
plt.title('ROC curves (test set)', fontsize=18)
plt.legend(loc='lower right', fontsize=12)
plt.grid(alpha=0.3)
plt.show()
# 3. Accuracy comparison across models
plt.figure(figsize=(10, 6), dpi=300)
x = np.arange(len(model_names))
plt.bar(x, test_accuracies, color=plot_colors, alpha=0.7)
plt.xticks(x, model_names, fontsize=12)
plt.ylabel('Accuracy', fontsize=14)
plt.title('Test-set accuracy by model', fontsize=16)
plt.ylim(0, 1.0)
for i, v in enumerate(test_accuracies):
    plt.text(i, v + 0.01, f'{v:.3f}', ha='center', fontsize=10)
plt.grid(axis='y', alpha=0.3)
plt.show()
# 4. Confusion matrices (raw counts)
fig, axes = plt.subplots(2, 3, figsize=(18, 12), dpi=300)
axes = axes.flatten()
# One confusion matrix per model (the color list is not needed here)
for i, (name, instance) in enumerate(zip(model_names, model_instances)):
    y_test_preds = instance.predict(X_test)
    cm = confusion_matrix(y_test, y_test_preds)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[i],
                cbar=False, annot_kws={"size": 12})
    axes[i].set_title(f'{name} confusion matrix (counts)', fontsize=14)
    axes[i].set_xlabel('Predicted label', fontsize=12)
    axes[i].set_ylabel('True label', fontsize=12)
    axes[i].set_xticklabels(['Stable', 'Expansion'])
    axes[i].set_yticklabels(['Stable', 'Expansion'])
plt.tight_layout()
plt.show()
# Choose the final model and save it
chosen_model = best_rfc
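# NOTE: the tuned random forest is hard-coded as the final model here; if
# another classifier posted a higher test AUC above, swap it in before saving.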
# Write per-patient predicted probabilities
df_uniprob = pd.read_csv(uniPath)
# Score all patients in one pass; selecting by the saved feature names keeps
# the column order identical to training and avoids fragile positional slicing.
X_all = stdsc.transform(df_uniprob[features].values)
df_uniprob['Prob'] = chosen_model.predict_proba(X_all)[:, 1]
df_uniprob.info()
df_uniprob.to_csv(uniprobPath, index=False)
# Histogram of predicted probabilities by outcome group
probs_e = df_uniprob['Prob'][df_uniprob['血肿扩大'] == 1]
probs_s = df_uniprob['Prob'][df_uniprob['血肿扩大'] == 0]
plt.figure(figsize=(12, 8), dpi=300)
plt.hist(probs_e,
         range=[0, 1],
         bins=20,
         color='r',
         alpha=0.7,
         label='Expansion group')
plt.hist(probs_s,
         range=[0, 1],
         bins=20,
         color='g',
         alpha=0.7,
         label='Stable group')
plt.title('Distribution of predicted probabilities', fontsize=18)
plt.xlabel('Predicted probability', fontsize=14)
plt.ylabel('Number of patients', fontsize=14)
plt.legend(loc='upper right', fontsize=12)
plt.grid(alpha=0.3)
plt.show()
# Save the final model
joblib.dump(chosen_model, unimPath)
print(f"Final model saved to: {unimPath}")
Modify the code above to meet two requirements:
1. Tie in with the probability output: show how high the predicted probabilities of the high-risk patients actually are, ideally with a visualization.
2. Optimize the scoring system so that the scores of the different risk groups are clearly separated.
In short, the current code does binary classification against a threshold, but the follow-up task is multi-level: stratify patients into tiers by risk score, and within each tier measure the probability that the target (血肿扩大, hematoma expansion) is actually true. How should the algorithm be adjusted? For the multi-level split, use equal-frequency binning into five levels. Sketches addressing both points follow.
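A minimal sketch for requirement 2, building on the df_uniprob frame produced above. The RiskLevel column name and the five level labels are illustrative choices, not part of the original pipeline; note also that Prob is scored on the data the model was trained on, so the per-tier rates will look optimistic (cross-validated predictions, e.g. via sklearn.model_selection.cross_val_predict, would give honest estimates).

# Equal-frequency binning of the predicted probability into five risk levels:
# pd.qcut places roughly 20% of the patients in each tier.
risk_labels = ['very low', 'low', 'moderate', 'high', 'very high']
# If heavily tied probabilities collapse the quantile edges, pass
# duplicates='drop' and shorten risk_labels to match the surviving bins.
df_uniprob['RiskLevel'] = pd.qcut(df_uniprob['Prob'], q=5, labels=risk_labels)
# Per-tier sample count, number of expansions, and observed expansion rate;
# the last column is the per-tier P(血肿扩大 = 1) the request asks for.
risk_summary = (df_uniprob.groupby('RiskLevel', observed=True)['血肿扩大']
                .agg(n='count', expansions='sum', expansion_rate='mean'))
print(risk_summary)

Because the bins are equal-frequency, group separation shows up in expansion_rate rather than in group sizes: a rate that climbs monotonically from 'very low' to 'very high' is the sign that the score stratifies patients well.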
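And a companion sketch for requirement 1: visualize how high the probabilities run in each tier next to the observed expansion rate, then report the top tier explicitly. Variable names (level_stats, high) and the plot text are again my own.

# Mean predicted probability versus observed expansion rate per risk level.
level_stats = (df_uniprob.groupby('RiskLevel', observed=True)
               .agg(mean_prob=('Prob', 'mean'),
                    expansion_rate=('血肿扩大', 'mean')))
x = np.arange(len(level_stats))
width = 0.35
plt.figure(figsize=(10, 6), dpi=300)
plt.bar(x - width / 2, level_stats['mean_prob'], width,
        label='Mean predicted probability', color='steelblue', alpha=0.8)
plt.bar(x + width / 2, level_stats['expansion_rate'], width,
        label='Observed expansion rate', color='crimson', alpha=0.8)
plt.xticks(x, level_stats.index, fontsize=12)
plt.ylabel('Probability / rate', fontsize=14)
plt.title('Predicted probability vs. observed expansion rate by risk level', fontsize=16)
plt.legend(fontsize=12)
plt.grid(axis='y', alpha=0.3)
plt.show()
# Spell out the highest-risk tier: how many patients, how high their
# predicted probabilities run, and how often expansion actually occurred.
high = df_uniprob[df_uniprob['RiskLevel'] == 'very high']
print(f"Very-high-risk tier: n={len(high)}, predicted probability "
      f"{high['Prob'].min():.3f}-{high['Prob'].max():.3f}, "
      f"observed expansion rate {high['血肿扩大'].mean():.3f}")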