import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, auc, precision_recall_curve, roc_curve
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.naive_bayes import GaussianNB
# Set the default sans-serif font to SimHei and fix minus-sign rendering
# in a single rcParams update.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Load the data.
data = pd.read_excel(r'C:\Users\冯雪玲\Desktop\北京市空气质量数据.xlsx')
data = data.replace(0, np.nan)  # treat every 0 (in any column) as a missing value
data = data.dropna()  # drop rows that contain a missing value
# Binary target: 0 for the '优'/'良' grades, 1 for every pollution grade.
# The mapping is defined and applied exactly once (it used to be computed twice).
quality_mapping = {'优': 0, '良': 0, '轻度污染': 1, '中度污染': 1, '重度污染': 1, '严重污染': 1}
pollution_flag = data['质量等级'].map(quality_mapping)
# Series.map yields NaN for any grade that is not a key of the mapping; drop
# those rows so the classifiers never receive NaN labels, and keep the label
# column as an integer dtype.
has_known_grade = pollution_flag.notna()
data = data.loc[has_known_grade].copy()
data['有无污染'] = pollution_flag[has_known_grade].astype('int64')
# 2. Feature matrix and target vector.
X = data.loc[:, ['PM2.5', 'PM10', 'SO2', 'CO', 'NO2', 'O3']]  # pollutant concentration columns
y = data.loc[:, '有无污染']  # binary pollution label built above
# 3. Model training: fit both classifiers on the full feature matrix X and
# target y. `fit` returns the estimator itself, so construction and fitting
# are chained.
modelNB = GaussianNB().fit(X, y)
# max_iter is raised to 1000 to help the solver converge.
modelLR = LogisticRegression(max_iter=1000).fit(X, y)
# 4. Model evaluation: print a classification report for each model.
# NOTE(review): predictions are made on X/y, the same data the models were
# fit on in this script, so these are in-sample scores.
nb_report = classification_report(y, modelNB.predict(X))
print("贝叶斯分类器报告:\n", nb_report)
lr_report = classification_report(y, modelLR.predict(X))
print("\n逻辑回归分类器报告:\n", lr_report)
# 5. Visual comparison of the two models (plotting parameters corrected).
# One row, two panels: ROC curves on the left axis (axes[0]); axes[1] is
# used by the PR-curve code that follows.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# ROC curves: score each sample with the predicted probability of class 1.
nb_scores = modelNB.predict_proba(X)[:, 1]
lr_scores = modelLR.predict_proba(X)[:, 1]
fpr, tpr, _ = roc_curve(y, nb_scores, pos_label=1)
fpr1, tpr1, _ = roc_curve(y, lr_scores, pos_label=1)
roc_ax = axes[0]
roc_ax.plot(fpr, tpr, color='r', label=f'贝叶斯 (AUC={auc(fpr,tpr):.4f})')
roc_ax.plot(fpr1, tpr1, 'b--', label=f'逻辑回归 (AUC={auc(fpr1,tpr1):.4f})')
# Diagonal reference line from (0, 0) to (1, 1).
roc_ax.plot([0, 1], [0, 1], 'k--')
# Small margins around the unit square so the curves are not clipped at the edges.
roc_ax.set_xlim([-0.01, 1.01])
roc_ax.set_ylim([-0.01, 1.01])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC曲线对比')
# Precision-recall curves, again scored with the predicted probability of class 1.
nb_pos_proba = modelNB.predict_proba(X)[:, 1]
lr_pos_proba = modelLR.predict_proba(X)[:, 1]
precision, recall, _ = precision_recall_curve(y, nb_pos_proba, pos_label=1)
precision1, recall1, _ = precision_recall_curve(y, lr_pos_proba, pos_label=1)
# The legend label reports the accuracy of the naive Bayes predictions on X/y.
nb_pr_label = f'贝叶斯 (准确率={accuracy_score(y, modelNB.predict(X)):.3f})'
pr_ax = axes[1]
pr_ax.plot(recall, precision, 'r', label=nb_pr_label)
axes[1].plot(recall1, precision1, 'b--',
label=f'逻辑回归 (准确率={accuracy_score(y