import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
# Set the default sans-serif font to SimHei and turn off the Unicode minus
# glyph to fix how minus signs are displayed.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Load the air-quality workbook, turn every 0 value into NaN, and then drop
# each row that contains at least one missing value.
data = (
    pd.read_excel(r'C:\Users\冯雪玲\Desktop\北京市空气质量数据.xlsx')
    .replace(0, np.nan)
    .dropna()
)
# Build the binary target column: the two clean quality grades map to 0 and
# the four pollution grades map to 1.
data['有无污染'] = data['质量等级'].map(
    {
        **dict.fromkeys(('优', '良'), 0),
        **dict.fromkeys(('轻度污染', '中度污染', '重度污染', '严重污染'), 1),
    }
)
# Split into the label vector (the binary pollution flag) and the feature
# matrix (the six pollutant columns listed below).
Y = data['有无污染']
X = data[['PM2.5', 'PM10', 'SO2', 'CO', 'NO2', 'O3']]
# Model training and evaluation.
# GaussianNB is imported here because the imports at the top of the file do
# not bring it into scope; without this line the next statement raises
# NameError.
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes classifier and a logistic regression classifier
# on the same features and labels.
modelNB = GaussianNB()
modelNB.fit(X, Y)
modelLR = LogisticRegression()
modelLR.fit(X, Y)
# Print the naive Bayes classification report. The predictions are made on
# the same X the model was fitted on, so these are training-set metrics.
print('朴素贝叶斯模型结果:\n', classification_report(Y, modelNB.predict(X)))
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve
# --------------------- Plotting ---------------------
# average_precision_score is imported here because the existing imports only
# provide roc_curve, auc and precision_recall_curve.
from sklearn.metrics import average_precision_score

# One figure with two side-by-side panels: ROC on the left, P-R on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

# Predicted probability of the second class column for each model, computed
# once and reused by both panels.
prob_nb = modelNB.predict_proba(X)[:, 1]
prob_lr = modelLR.predict_proba(X)[:, 1]

# ROC curves, with the diagonal drawn as a reference line.
fpr, tpr, _ = roc_curve(Y, prob_nb, pos_label=1)
fpr1, tpr1, _ = roc_curve(Y, prob_lr, pos_label=1)
axes[0].plot(fpr, tpr, color='r', label=f'朴素贝叶斯 ROC (AUC={auc(fpr, tpr):.3f})')
axes[0].plot(fpr1, tpr1, color='blue', label=f'逻辑回归 ROC (AUC={auc(fpr1, tpr1):.3f})')
axes[0].plot([0, 1], [0, 1], linestyle='--', color='navy')
axes[0].set(xlabel='假正率 (FPR)', ylabel='真正率 (TPR)', title='ROC曲线对比')
axes[0].legend()

# Precision-recall curves. The legend reports AP, so the value comes from
# average_precision_score rather than the trapezoidal area auc(recall,
# precision), which is a different quantity.
pre, rec, _ = precision_recall_curve(Y, prob_nb, pos_label=1)
pre1, rec1, _ = precision_recall_curve(Y, prob_lr, pos_label=1)
ap_nb = average_precision_score(Y, prob_nb, pos_label=1)
ap_lr = average_precision_score(Y, prob_lr, pos_label=1)
axes[1].plot(rec, pre, color='r', label=f'朴素贝叶斯 (AP={ap_nb:.3f})')
axes[1].plot(rec1, pre1, color='blue', label=f'逻辑回归 (AP={ap_lr:.3f})')
axes[1].set(xlabel='召回率 (Recall)', ylabel='精确率 (Precision)', title='P-R曲线对比')
axes[1].legend()

plt.tight_layout()
# The stray text that was fused onto this call made the file a SyntaxError.
plt.show()