@创建于:2023.02.19
@修改于:2023.02.19
1、理论计算方法
代码如下:
from sklearn.metrics import roc_curve, roc_auc_score
# `label` holds each sample's ground-truth class (0 or 1) and `pred_prob` holds
# the model's predicted probability for each sample. Neither is defined in this
# snippet, so both must already exist before these lines run.
# roc_curve is called with class 1 as the positive label; its third return
# value is discarded.
FPR, TPR, _ = roc_curve(label, pred_prob, pos_label = 1)
# AUC computed from the same labels and scores.
AUC = roc_auc_score(label, pred_prob)
from scipy.stats import norm
import numpy as np
def AUC_CI(auc, label, alpha = 0.05):
    """Return a normal-approximation confidence interval for an AUC value.

    The standard error uses the closed-form Hanley & McNeil approximation,
    which depends only on the AUC point estimate and on the number of
    positive and negative samples.

    Args:
        auc: AUC point estimate.
        label: Ground-truth labels; entries equal to 1 are counted as
            positives and entries equal to 0 as negatives.
        alpha: Significance level; the interval covers ``1 - alpha``.

    Returns:
        A ``(lower, upper)`` tuple, clamped to the valid AUC range [0, 1].

    Raises:
        ValueError: If ``label`` contains no positives or no negatives.
    """
    label = np.array(label)  # accept lists and other sequences as well
    n1 = int(np.sum(label == 1))
    n2 = int(np.sum(label == 0))
    if n1 == 0 or n2 == 0:
        # Without both classes n1 * n2 is zero and the standard error is
        # undefined; fail loudly instead of returning NaN bounds.
        raise ValueError(
            "label must contain at least one positive (1) and one "
            "negative (0) sample"
        )

    q1 = auc / (2 - auc)
    q2 = (2 * auc ** 2) / (1 + auc)
    variance = (
        auc * (1 - auc)
        + (n1 - 1) * (q1 - auc ** 2)
        + (n2 - 1) * (q2 - auc ** 2)
    ) / (n1 * n2)
    se = np.sqrt(variance)

    confidence_level = 1 - alpha
    z_lower, z_upper = norm.interval(confidence_level)
    # The symmetric normal interval can spill outside [0, 1] for AUC values
    # near the boundary or for small samples, so clamp it.
    lowerb = max(0.0, auc + z_lower * se)
    upperb = min(1.0, auc + z_upper * se)
    return (lowerb, upperb)
import matplotlib.pyplot as plt
def plot_AUC(ax, FPR, TPR, AUC, CI, label):
    """Draw one ROC curve on ``ax`` and return ``ax``.

    The legend entry has the form ``"<label>: <AUC> (<low>-<high>)"`` with
    the AUC and both interval bounds rounded to three decimals.

    Args:
        ax: Object exposing ``plot(x, y, label=...)``.
        FPR: X values passed to ``ax.plot``.
        TPR: Y values passed to ``ax.plot``.
        AUC: AUC value shown in the legend text.
        CI: Indexable pair; ``CI[0]`` and ``CI[1]`` are the interval bounds.
        label: Curve name; converted with ``str`` for the legend text.

    Returns:
        The same ``ax`` that was passed in.
    """
    ci_low = round(CI[0], 3)
    ci_high = round(CI[1], 3)
    legend_text = f"{str(label)}: {round(AUC, 3)} ({ci_low}-{ci_high})"
    ax.plot(FPR, TPR, label=legend_text)
    return ax
Python - matplotlib - ROC曲线(Receiver Operating Characteristic curve)
2、对训练数据集采样
def bootstrap_auc(clf, X_train, y_train, X_test, y_test, nsamples=1000, alpha=0.05):
    """Percentile bootstrap interval for test AUC, resampling the training set.

    For each replicate the training rows are resampled with replacement,
    ``clf`` is refitted on the resample, and the AUC of its predictions on
    the fixed test set is recorded.

    Args:
        clf: Estimator providing ``fit(X, y)`` and ``predict_proba(X)``;
            column 1 of ``predict_proba`` is used as the score.
        X_train: Training features; must support ``.shape`` and ``.iloc``.
        y_train: Training labels; must support ``.iloc``.
        X_test: Test features passed to ``clf.predict_proba``.
        y_test: Test labels (Series, array or list).
        nsamples: Number of bootstrap replicates.
        alpha: Significance level; the returned percentiles are
            ``100 * alpha / 2`` and ``100 - 100 * alpha / 2``
            (2.5 and 97.5 for the default).

    Returns:
        Array with the lower and upper percentile of the bootstrap AUCs.
    """
    n_train = X_train.shape[0]
    # Convert once, outside the loop. np.asarray also accepts plain lists and
    # avoids calling .ravel() on a pandas Series directly.
    y_true = np.asarray(y_test).ravel()

    auc_values = []
    for _ in range(nsamples):
        # Row positions drawn with replacement, same size as the training set.
        idx = np.random.randint(n_train, size=n_train)
        clf.fit(X_train.iloc[idx], y_train.iloc[idx])
        pred = clf.predict_proba(X_test)[:, 1]
        auc_values.append(roc_auc_score(y_true, np.asarray(pred).ravel()))

    tail = 100 * alpha / 2
    return np.percentile(auc_values, (tail, 100 - tail))
# 也可以改用所有 AUC 值的均值 ± 2 倍标准差(2*sigma)作为近似的 95% 置信区间
3、对测试数据集采样
def bootstrap_auc(y, pred, classes, bootstraps = 100, fold_size = 1000):
    """Bootstrap the AUC by resampling the evaluated (test) samples.

    For each class, positives and negatives are resampled separately with
    replacement so that every replicate keeps the observed prevalence, and
    the AUC of each replicate is stored.

    Args:
        y: Ground-truth labels (1 = positive, 0 = negative). Either 1-D, or
            2-D with one column per entry of ``classes``.
        pred: Predicted scores with the same layout as ``y``.
        classes: Sequence of classes; only its length is used. With 1-D
            inputs the same labels and scores are reused for every class.
        bootstraps: Number of replicates per class.
        fold_size: Approximate number of samples drawn per replicate.

    Returns:
        Array of shape ``(len(classes), bootstraps)`` holding the AUC of
        every replicate.

    Raises:
        ValueError: If a class column has no positive or no negative sample.
    """
    # Local import: no module-level pandas import is visible in this file.
    import pandas as pd

    y = np.asarray(y)
    pred = np.asarray(pred)
    statistics = np.zeros((len(classes), bootstraps))

    for c in range(len(classes)):
        # Use column c for 2-D inputs; 1-D inputs are shared by every class.
        y_c = y[:, c] if y.ndim == 2 else y
        pred_c = pred[:, c] if pred.ndim == 2 else pred

        # Build the frame directly rather than filling an empty one via .loc.
        df = pd.DataFrame({'y': y_c, 'pred': pred_c})
        df_pos = df[df['y'] == 1]
        df_neg = df[df['y'] == 0]
        if df_pos.empty or df_neg.empty:
            raise ValueError(
                "bootstrap_auc needs at least one positive and one negative "
                "sample for class index {}".format(c)
            )

        # Sample sizes do not change between replicates, so compute them once.
        prevalence = len(df_pos) / len(df)
        n_pos = int(fold_size * prevalence)
        n_neg = int(fold_size * (1 - prevalence))

        for i in range(bootstraps):
            pos_sample = df_pos.sample(n=n_pos, replace=True)
            neg_sample = df_neg.sample(n=n_neg, replace=True)
            y_sample = np.concatenate(
                [pos_sample['y'].values, neg_sample['y'].values]
            )
            pred_sample = np.concatenate(
                [pos_sample['pred'].values, neg_sample['pred'].values]
            )
            statistics[c, i] = roc_auc_score(y_sample, pred_sample)
    return statistics
# Toy example: four samples, two positives and two negatives.
y = np.array([1, 1, 0, 0])
scores = np.array([0.1, 0.4, 0.35, 0.8])
statistics = bootstrap_auc(y, scores, [0, 1])

# Summarise the bootstrap AUCs along axis 1 (one value per class row).
for caption, reduce_fn in (("均值:", np.mean), ("最大值:", np.max), ("最小值:", np.min)):
    print(caption, reduce_fn(statistics, axis=1))
# 输出示例(随机采样,每次运行结果略有不同):
# 均值: [0.24828344 0.2484288 ]
# 最大值: [0.281912 0.286216]
# 最小值: [0.204516 0.205208]