# Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier,
                              AdaBoostClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.patches import Patch
import warnings
warnings.filterwarnings('ignore')
# Configure fonts so matplotlib renders Chinese text
plt.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # display the minus sign correctly
# Try to import optional libraries
try:
    from xgboost import XGBClassifier
    XGB_AVAILABLE = True
except ImportError:
    print("XGBoost 不可用,跳过 XGBoost 模型")
    XGB_AVAILABLE = False

try:
    from lightgbm import LGBMClassifier
    LGBM_AVAILABLE = True
except ImportError:
    print("LightGBM 不可用,跳过 LightGBM 模型")
    LGBM_AVAILABLE = False

try:
    import shap
    SHAP_AVAILABLE = True
except ImportError:
    print("SHAP 不可用,跳过 SHAP 解释")
    SHAP_AVAILABLE = False
# Define Prof. Zhang Jiquan's (张继权) four-factor theory function
def apply_four_factor_theory(df):
    # Check which of the expected PFAS columns are present
    required_cols = ['PFBA', 'PFPeA', 'PFHxA', 'PFHpA', 'PFOA', 'PFNA', 'PFDA', 'PFBS', 'PFHxS', 'PFOS']
    available_cols = [col for col in required_cols if col in df.columns]
    if len(available_cols) < 4:
        print(f"警告: 只有 {len(available_cols)} 个PFAS列可用,可能需要调整四因子计算")

    # Short-term acid factor (PFBA, PFPeA, PFHxA)
    short_term_cols = [col for col in ['PFBA', 'PFPeA', 'PFHxA'] if col in df.columns]
    if short_term_cols:
        df['Short_term_acid_factor'] = df[short_term_cols].mean(axis=1, skipna=True)
    else:
        df['Short_term_acid_factor'] = 0

    # Long-term acid factor (PFHpA, PFOA, PFNA, PFDA)
    long_term_cols = [col for col in ['PFHpA', 'PFOA', 'PFNA', 'PFDA'] if col in df.columns]
    if long_term_cols:
        df['Long_term_acid_factor'] = df[long_term_cols].mean(axis=1, skipna=True)
    else:
        df['Long_term_acid_factor'] = 0

    # Sulfonate factor (PFBS, PFHxS, PFOS)
    sulfonate_cols = [col for col in ['PFBS', 'PFHxS', 'PFOS'] if col in df.columns]
    if sulfonate_cols:
        df['Sulfonate_factor'] = df[sulfonate_cols].mean(axis=1, skipna=True)
    else:
        df['Sulfonate_factor'] = 0

    # Exposure factor (total PFAS concentration)
    all_pfas_cols = [col for col in required_cols if col in df.columns]
    if all_pfas_cols:
        df['Exposure_factor'] = df[all_pfas_cols].sum(axis=1, skipna=True)
    else:
        df['Exposure_factor'] = 0

    return df
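# A quick sanity check of the four-factor computation on a toy frame with
# illustrative values (the _toy name is hypothetical and unused below):
_toy = pd.DataFrame({'PFBA': [1.0, 2.0], 'PFOA': [3.0, 4.0],
                     'PFHxS': [5.0, 6.0], 'PFOS': [7.0, 8.0]})
_toy = apply_four_factor_theory(_toy)
print(_toy[['Short_term_acid_factor', 'Long_term_acid_factor',
            'Sulfonate_factor', 'Exposure_factor']])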
# Custom table-printing helper
def print_table(data, headers=None, title=None):
    if title:
        print(f"\n{title}")
        print("=" * 60)
    if headers:
        # Print the header row
        header_line = " | ".join(f"{h:>15}" for h in headers)
        print(header_line)
        print("-" * len(header_line))
    # Print the data rows
    for row in data:
        if isinstance(row, (list, tuple)):
            row_line = " | ".join(f"{str(item):>15}" for item in row)
        else:
            # Handle DataFrame rows
            row_line = " | ".join(f"{str(row[col]):>15}" for col in headers)
        print(row_line)
# Try multiple encodings when reading the file
def read_csv_with_encodings(file_path):
    # Encodings commonly seen in Chinese environments
    encodings = ['utf-8', 'gbk', 'gb2312', 'gb18030', 'latin1', 'cp936']
    for encoding in encodings:
        try:
            print(f"尝试使用 {encoding} 编码读取文件...")
            df = pd.read_csv(file_path, encoding=encoding)
            # Drop rows and columns that are entirely empty
            df = df.dropna(how='all', axis=0)
            df = df.dropna(how='all', axis=1)
            print(f"成功使用 {encoding} 编码读取文件")
            return df
        except UnicodeDecodeError:
            continue
        except Exception as e:
            print(f"使用 {encoding} 编码时出错: {e}")
            continue
    # If every encoding failed, fall back to ignoring undecodable bytes.
    # pandas >= 1.3 takes encoding_errors; read_csv has no plain errors kwarg.
    try:
        print("尝试使用错误处理方式读取文件...")
        df = pd.read_csv(file_path, encoding='utf-8', encoding_errors='ignore')
        df = df.dropna(how='all', axis=0)
        df = df.dropna(how='all', axis=1)
        print("使用错误处理方式成功读取文件")
        return df
    except Exception as e:
        print(f"所有读取尝试都失败: {e}")
        return None
# Load the CSV file, skipping rows and columns that are entirely empty
file_path = r'E:\pycharm\meta\整合数据.csv'  # raw string for the Windows path
try:
    # Try several encodings to read the file
    df = read_csv_with_encodings(file_path)
    if df is None:
        print("无法读取文件,请检查文件路径和格式")
        raise SystemExit  # not caught by the except Exception below
    print(f"数据形状: {df.shape}")
    print(f"列名: {df.columns.tolist()}")
except Exception as e:
    print(f"读取文件时出错: {e}")
    raise SystemExit
# Data preprocessing
# Check whether the PFAS columns exist; otherwise look for similarly named columns
expected_features = ['PFBA', 'PFPeA', 'PFHxA', 'PFHpA', 'PFOA', 'PFNA', 'PFDA', 'PFBS', 'PFHxS', 'PFOS']
available_features = []
for feature in expected_features:
    if feature in df.columns:
        available_features.append(feature)
    else:
        # Look for a column whose name contains the expected string
        matching_cols = [col for col in df.columns if feature.lower() in col.lower()]
        if matching_cols:
            available_features.extend(matching_cols)
            print(f"使用 '{matching_cols[0]}' 替代 '{feature}'")
# If no PFAS features were found, fall back to all numeric columns
if not available_features:
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    available_features = numeric_cols
    print(f"未找到PFAS特征,使用所有数值列: {available_features}")
print(f"可用的PFAS特征: {available_features}")
# Check for the target column
target_column = '城市'
if target_column not in df.columns:
    # Look for a plausible target column
    possible_targets = [col for col in df.columns if
                        any(word in col for word in ['城市', '地区', '区域', '地点', 'city', 'region'])]
    if possible_targets:
        target_column = possible_targets[0]
        print(f"使用 '{target_column}' 作为目标变量")
    else:
        # Otherwise use the first non-numeric column as the target
        non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns
        if len(non_numeric_cols) > 0:
            target_column = non_numeric_cols[0]
            print(f"使用 '{target_column}' 作为目标变量")
        else:
            # If there is no non-numeric column, use the last column as the target
            target_column = df.columns[-1]
            print(f"使用最后一列 '{target_column}' 作为目标变量")
# Handle missing values (median imputation)
for feature in available_features:
    if feature in df.columns:
        df[feature] = pd.to_numeric(df[feature], errors='coerce')
        df[feature] = df[feature].fillna(df[feature].median())
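# Note: imputing with the full-data median before the train/test split leaks
# test-set statistics into training. A minimal leakage-free sketch, assuming
# scikit-learn's SimpleImputer; the pipeline below is illustrative and is not
# wired into the rest of this script:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

leakage_free_preprocess = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),  # medians learned per training fold
    ('scale', StandardScaler()),
])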
# Apply Prof. Zhang Jiquan's four-factor theory
df = apply_four_factor_theory(df)
# Add the four factors to the feature list
features = available_features + ['Short_term_acid_factor', 'Long_term_acid_factor',
                                 'Sulfonate_factor', 'Exposure_factor']
print(f"最终使用的特征: {features}")
# Encode the target variable (multi-class)
le = LabelEncoder()
df[target_column] = le.fit_transform(df[target_column].fillna('Unknown'))
class_names = le.classes_
# Split the dataset
X = df[features]
y = df[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
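# A stratified split keeps class proportions consistent between train and test,
# which matters if the city classes are imbalanced. Minimal sketch (requires at
# least two samples per class; the *_s variables are illustrative and unused):
if pd.Series(y).value_counts().min() >= 2:
    X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)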
print(f"训练集大小: {X_train.shape}“)
print(f"测试集大小: {X_test.shape}”)
print(f"目标变量类别数: {len(np.unique(y))}")
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
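# A minimal persistence sketch, assuming joblib (installed with scikit-learn);
# saving the fitted scaler lets new samples be transformed identically at
# prediction time (the filename is illustrative):
import joblib
joblib.dump(scaler, 'scaler.joblib')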
# PCA dimensionality reduction (retain 95% of the variance)
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
print(f"PCA降维后特征数: {X_train_pca.shape[1]}")
# ========== Correlation analysis ==========
print("\n进行相关性分析...")
# Compute the correlation matrix between features
correlation_matrix = X.corr()
# Plot the correlation heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('特征相关性热图')
plt.tight_layout()
plt.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()
# Find highly correlated feature pairs
high_corr_pairs = []
for i in range(len(correlation_matrix.columns)):
    for j in range(i + 1, len(correlation_matrix.columns)):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:  # correlation threshold
            high_corr_pairs.append((
                correlation_matrix.columns[i],
                correlation_matrix.columns[j],
                correlation_matrix.iloc[i, j]
            ))
if high_corr_pairs:
    print("高度相关的特征对:")
    for pair in high_corr_pairs:
        print(f"  {pair[0]} 和 {pair[1]}: {pair[2]:.3f}")
else:
    print("没有发现高度相关的特征对(相关系数>0.8)")
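# A minimal de-correlation sketch: drop the second feature of each highly
# correlated pair (illustrative only; the analysis below keeps the full set):
_to_drop = {pair[1] for pair in high_corr_pairs}
X_decorrelated = X.drop(columns=list(_to_drop))
print(f"Shape after dropping correlated features: {X_decorrelated.shape}")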
# ========== Meta-analysis: data distribution visualization ==========
print("\n进行Meta分析...")
# 1. Target variable distribution
plt.figure(figsize=(10, 6))
df[target_column].value_counts().plot(kind='bar')
plt.title('目标变量分布')
plt.xlabel('类别')
plt.ylabel('样本数量')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('target_distribution.png', dpi=300, bbox_inches='tight')
plt.show()
# 2. Four-factor distributions
four_factor_cols = ['Short_term_acid_factor', 'Long_term_acid_factor', 'Sulfonate_factor', 'Exposure_factor']
if all(col in df.columns for col in four_factor_cols):
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    axes = axes.ravel()
    for i, col in enumerate(four_factor_cols):
        axes[i].hist(df[col], bins=30, alpha=0.7, color='skyblue', edgecolor='black')
        axes[i].set_title(f'{col} 分布')
        axes[i].set_xlabel(col)
        axes[i].set_ylabel('频率')
    plt.tight_layout()
    plt.savefig('four_factor_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()
# 3. Feature box plots
plt.figure(figsize=(15, 10))
X.boxplot()
plt.title('特征箱线图')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('feature_boxplot.png', dpi=300, bbox_inches='tight')
plt.show()
# 4. PCA visualization
plt.figure(figsize=(10, 8))
if X_train_pca.shape[1] >= 2:
    scatter = plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1], c=y_train, cmap='viridis', alpha=0.7)
    plt.colorbar(scatter)
    plt.xlabel('第一主成分')
    plt.ylabel('第二主成分')
    plt.title('PCA降维可视化')
    plt.tight_layout()
    plt.savefig('pca_visualization.png', dpi=300, bbox_inches='tight')
    plt.show()
# 5. Cumulative explained variance
plt.figure(figsize=(10, 6))
pca_full = PCA().fit(X_train_scaled)
plt.plot(np.cumsum(pca_full.explained_variance_ratio_))
plt.xlabel('主成分数量')
plt.ylabel('累计方差解释率')
plt.title('PCA累计方差解释率')
plt.grid(True)
plt.tight_layout()
plt.savefig('pca_explained_variance.png', dpi=300, bbox_inches='tight')
plt.show()
# ========== Model training and evaluation ==========
# Define the machine-learning models
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42, probability=True),
    'KNN': KNeighborsClassifier(),
    'NaiveBayes': GaussianNB(),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'MLP': MLPClassifier(max_iter=1000, random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'ExtraTrees': ExtraTreesClassifier(random_state=42)
}
# Add optional models
if XGB_AVAILABLE:
    # use_label_encoder is deprecated in recent XGBoost releases and is omitted here
    models['XGBoost'] = XGBClassifier(eval_metric='mlogloss', random_state=42)
if LGBM_AVAILABLE:
    models['LightGBM'] = LGBMClassifier(random_state=42)
print(f"将训练 {len(models)} 个模型")
# Train and evaluate the models
results = []
cv_results = []  # stores cross-validation results
model_objects = {}  # stores the fitted model objects
print("开始训练模型...")
for name, model in models.items():
    print(f"训练 {name}...")
    try:
        # Fit the model
        model.fit(X_train_pca, y_train)
        model_objects[name] = model
        # Predict and evaluate
        y_pred = model.predict(X_test_pca)
        y_pred_proba = model.predict_proba(X_test_pca) if hasattr(model, "predict_proba") else None
        acc = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        # Cross-validation
        cv_scores = cross_val_score(model, X_train_pca, y_train, cv=5, scoring='accuracy')
        cv_mean = cv_scores.mean()
        cv_std = cv_scores.std()
        results.append([
            name, acc,
            report['weighted avg']['precision'],
            report['weighted avg']['recall'],
            report['weighted avg']['f1-score'],
            cv_mean, cv_std
        ])
        cv_results.append({'model': name, 'scores': cv_scores})
    except Exception as e:
        print(f"训练模型 {name} 时出错: {e}")
        results.append([name, 0, 0, 0, 0, 0, 0])
# Build the results table
headers = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'CV Mean', 'CV Std']
results_df = pd.DataFrame(results, columns=headers)
results_df = results_df.sort_values('Accuracy', ascending=False)
print("\n模型性能排名:")
print_table(results_df.values.tolist(), headers=headers, title="模型性能比较")
# ========== Model performance visualization ==========
print("\n生成模型性能可视化...")
# 1. Model accuracy comparison
plt.figure(figsize=(12, 8))
models_names = results_df['Model']
accuracies = results_df['Accuracy']
cv_means = results_df['CV Mean']
cv_stds = results_df['CV Std']
x = np.arange(len(models_names))
width = 0.35
plt.bar(x - width / 2, accuracies, width, label='测试集准确率', alpha=0.7)
plt.bar(x + width / 2, cv_means, width, yerr=cv_stds, label='交叉验证准确率', alpha=0.7, capsize=5)
plt.xlabel('模型')
plt.ylabel('准确率')
plt.title('模型性能比较')
plt.xticks(x, models_names, rotation=45)
plt.legend()
plt.tight_layout()
plt.savefig('model_performance_comparison.png', dpi=300, bbox_inches='tight')
plt.show()
# 2. Cross-validation box plot
cv_df = pd.DataFrame({item['model']: item['scores'] for item in cv_results})
plt.figure(figsize=(12, 8))
cv_df.boxplot()
plt.title('模型交叉验证准确率分布')
plt.xticks(rotation=45)
plt.ylabel('准确率')
plt.tight_layout()
plt.savefig('cv_boxplot.png', dpi=300, bbox_inches='tight')
plt.show()
# 3. Detailed analysis of the best model
best_model_name = results_df.iloc[0]['Model']
best_model = model_objects[best_model_name]
print(f"\n对最佳模型 {best_model_name} 进行详细分析...")
# Confusion matrix
y_pred_best = best_model.predict(X_test_pca)
cm = confusion_matrix(y_test, y_pred_best)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.title(f'{best_model_name} 混淆矩阵')
plt.xlabel('预测标签')
plt.ylabel('真实标签')
plt.tight_layout()
plt.savefig(f'confusion_matrix_{best_model_name}.png', dpi=300, bbox_inches='tight')
plt.show()
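# roc_curve and auc are imported above but never used; a minimal one-vs-rest
# ROC sketch for the best model, assuming it exposes predict_proba and that
# there are at least three classes (label_binarize returns a single column
# for binary problems, which this sketch does not handle):
from sklearn.preprocessing import label_binarize
_classes = np.unique(y_train)
if hasattr(best_model, 'predict_proba') and len(_classes) >= 3:
    y_test_bin = label_binarize(y_test, classes=_classes)
    y_score = best_model.predict_proba(X_test_pca)
    plt.figure(figsize=(10, 8))
    for i in range(len(_classes)):
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
        plt.plot(fpr, tpr, label=f'{class_names[i]} (AUC = {auc(fpr, tpr):.2f})')
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title(f'{best_model_name} ROC (one-vs-rest)')
    plt.legend()
    plt.tight_layout()
    plt.savefig(f'roc_{best_model_name}.png', dpi=300, bbox_inches='tight')
    plt.show()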
# Learning curve
def plot_learning_curve(estimator, title, X, y, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure(figsize=(10, 6))
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="训练得分")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="交叉验证得分")
    plt.xlabel("训练样本数")
    plt.ylabel("得分")
    plt.title(title)
    plt.legend(loc="best")
    plt.tight_layout()
    # note: the filename relies on the global best_model_name
    plt.savefig(f'learning_curve_{best_model_name}.png', dpi=300, bbox_inches='tight')
    plt.show()
plot_learning_curve(best_model, f'{best_model_name} 学习曲线', X_train_pca, y_train, cv=5)
# Feature importances (if the model supports them)
if hasattr(best_model, 'feature_importances_'):
    importances = best_model.feature_importances_
    indices = np.argsort(importances)[::-1]
    plt.figure(figsize=(10, 8))
    plt.title(f"{best_model_name} 特征重要性")
    plt.bar(range(min(20, len(importances))), importances[indices[:20]])
    plt.xticks(range(min(20, len(importances))), [f'PC{i + 1}' for i in indices[:20]], rotation=45)
    plt.tight_layout()
    plt.savefig(f'feature_importance_{best_model_name}.png', dpi=300, bbox_inches='tight')
    plt.show()
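# For models without feature_importances_ (e.g. SVM, KNN, MLP), permutation
# importance is a model-agnostic alternative; a minimal sketch using
# scikit-learn's inspection module (printed rather than plotted):
from sklearn.inspection import permutation_importance
if not hasattr(best_model, 'feature_importances_'):
    perm = permutation_importance(best_model, X_test_pca, y_test,
                                  n_repeats=10, random_state=42)
    for i in np.argsort(perm.importances_mean)[::-1][:10]:
        print(f"PC{i + 1}: {perm.importances_mean[i]:.4f} +/- {perm.importances_std[i]:.4f}")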
# ========== SHAP feature analysis combined plots ==========
if SHAP_AVAILABLE:
    print("\n生成SHAP特征分析组合图...")
    # Generate the SHAP combined plots for the best model only
    if best_model_name in ['RandomForest', 'DecisionTree', 'GradientBoosting', 'XGBoost', 'LightGBM']:
        try:
            # Create the SHAP explainer
            explainer = shap.TreeExplainer(best_model)
            shap_values = explainer.shap_values(X_test_pca)
            # For multi-class problems, use the first class's SHAP values
            if isinstance(shap_values, list):
                shap_values_used = shap_values[0]
            else:
                shap_values_used = shap_values

            # Combined figure: global importance (left) vs. individual impact (right)
            fig = plt.figure(figsize=(20, 10))
            gs = gridspec.GridSpec(1, 2, width_ratios=[1, 1])

            # Left: global feature importance
            ax1 = plt.subplot(gs[0])
            shap.summary_plot(shap_values_used, X_test_pca, plot_type="bar",
                              feature_names=[f'PC{i + 1}' for i in range(X_test_pca.shape[1])],
                              show=False, max_display=15)
            ax1.set_title(f'{best_model_name} - SHAP全局特征重要性', fontsize=16, fontweight='bold')

            # Right: SHAP values for an individual sample
            ax2 = plt.subplot(gs[1])
            # Pick a representative sample (largest total absolute SHAP value)
            sample_idx = np.argmax(np.sum(np.abs(shap_values_used), axis=1))
            # Waterfall plot of the individual sample's SHAP values
            shap.waterfall_plot(
                explainer.expected_value[0] if isinstance(explainer.expected_value, list)
                else explainer.expected_value,
                shap_values_used[sample_idx],
                feature_names=[f'PC{i + 1}' for i in range(X_test_pca.shape[1])],
                show=False, max_display=15)
            ax2.set_title(f'{best_model_name} - 个体样本SHAP值分析\n(样本索引: {sample_idx})',
                          fontsize=16, fontweight='bold')
            plt.tight_layout()
            plt.savefig(f'shap_combined_{best_model_name}.png', dpi=300, bbox_inches='tight')
            plt.show()

            # Second combined figure: SHAP summary plot and dependence plot
            fig2 = plt.figure(figsize=(20, 10))
            gs2 = gridspec.GridSpec(1, 2, width_ratios=[1, 1])

            # Left: SHAP summary plot (feature values vs. SHAP values)
            ax3 = plt.subplot(gs2[0])
            shap.summary_plot(shap_values_used, X_test_pca,
                              feature_names=[f'PC{i + 1}' for i in range(X_test_pca.shape[1])],
                              show=False, max_display=15)
            ax3.set_title(f'{best_model_name} - SHAP摘要图', fontsize=16, fontweight='bold')

            # Right: SHAP dependence plot for the most important feature
            ax4 = plt.subplot(gs2[1])
            if hasattr(best_model, 'feature_importances_'):
                feature_importances = best_model.feature_importances_
                most_important_feature = np.argmax(feature_importances)
            else:
                # Without feature_importances_, rank by mean absolute SHAP value
                feature_importances = np.mean(np.abs(shap_values_used), axis=0)
                most_important_feature = np.argmax(feature_importances)
            shap.dependence_plot(most_important_feature, shap_values_used, X_test_pca,
                                 feature_names=[f'PC{i + 1}' for i in range(X_test_pca.shape[1])],
                                 show=False, ax=ax4)
            ax4.set_title(f'{best_model_name} - SHAP依赖图\n(最重要的特征: PC{most_important_feature + 1})',
                          fontsize=16, fontweight='bold')
            plt.tight_layout()
            plt.savefig(f'shap_detailed_{best_model_name}.png', dpi=300, bbox_inches='tight')
            plt.show()
            print(f"已保存 SHAP组合图 for {best_model_name}")
        except Exception as e:
            print(f"生成SHAP组合图失败 for {best_model_name}: {e}")
    else:
        print(f"最佳模型 {best_model_name} 不支持TreeExplainer,跳过SHAP组合图")
else:
    print("\nSHAP不可用,跳过SHAP组合图")
# ========== Four-factor table ==========
four_factor_cols = ['Short_term_acid_factor', 'Long_term_acid_factor', 'Sulfonate_factor', 'Exposure_factor']
if all(col in df.columns for col in four_factor_cols):
    # .copy() avoids pandas' SettingWithCopyWarning on the assignment below
    four_factor_table = df[[target_column] + four_factor_cols].head(10).copy()
    # Decode the target variable back to the original labels
    four_factor_table[target_column] = le.inverse_transform(four_factor_table[target_column])
    print_table(four_factor_table.values.tolist(),
                headers=[target_column] + four_factor_cols,
                title="四因子理论表格 (前10行)")
# ========== PCA component table ==========
pca_components = pd.DataFrame(pca.components_, columns=features)
# Prepend a PC label column so each row lines up with the 'PC' header below
pca_components.insert(0, 'PC', [f'PC{i + 1}' for i in range(len(pca_components))])
print("\nPCA主成分表格 (前5个主成分):")
print_table(pca_components.head().values.tolist(),
            headers=['PC'] + features,
            title="PCA主成分权重")
# ========== Save key results ==========
results_df.to_csv('model_results.csv', index=False)
print("\n模型结果已保存到 'model_results.csv'")
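# A minimal persistence sketch for the remaining fitted artifacts, assuming
# joblib as above (filenames are illustrative):
import joblib
joblib.dump(pca, 'pca.joblib')
joblib.dump(model_objects[best_model_name], f'model_{best_model_name}.joblib')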
# Report the best model (a new name avoids shadowing the fitted best_model)
best_row = results_df.iloc[0]
print(f"\n最佳模型: {best_row['Model']} (准确率: {best_row['Accuracy']:.4f})")
print("\n所有图表已生成完成!")