To compute pairwise correlations between features, use a correlation matrix: `corr_matrix = features_df.corr().abs().round(2)`.
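As a minimal illustration of that call (reusing the `features_df` name from the line above; the 0.8 threshold is an assumption that mirrors the screening step used later in this post), the matrix can also be screened for strongly correlated pairs:

```python
import numpy as np
import pandas as pd

# Absolute pairwise correlations, rounded for readability
corr_matrix = features_df.corr().abs().round(2)

# Keep only the upper triangle so each pair is reported once
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# Pairs whose |r| exceeds 0.8
high_corr = upper.stack().loc[lambda s: s > 0.8]
print(high_corr)
```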

```python
# Import required libraries
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.patches import Patch
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier,
                              AdaBoostClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

warnings.filterwarnings('ignore')

# Matplotlib settings for Chinese labels
plt.rcParams['font.sans-serif'] = ['SimHei']   # display Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False     # display minus signs correctly

# Optional libraries
try:
    from xgboost import XGBClassifier
    XGB_AVAILABLE = True
except ImportError:
    print("XGBoost 不可用,跳过 XGBoost 模型")
    XGB_AVAILABLE = False

try:
    from lightgbm import LGBMClassifier
    LGBM_AVAILABLE = True
except ImportError:
    print("LightGBM 不可用,跳过 LightGBM 模型")
    LGBM_AVAILABLE = False

try:
    import shap
    SHAP_AVAILABLE = True
except ImportError:
    print("SHAP 不可用,跳过 SHAP 解释")
    SHAP_AVAILABLE = False


# Four-factor theory (Prof. Zhang Jiquan): derive four composite PFAS factors
def apply_four_factor_theory(df):
    required_cols = ['PFBA', 'PFPeA', 'PFHxA', 'PFHpA', 'PFOA',
                     'PFNA', 'PFDA', 'PFBS', 'PFHxS', 'PFOS']
    available_cols = [col for col in required_cols if col in df.columns]
    if len(available_cols) < 4:
        print(f"警告: 只有 {len(available_cols)} 个PFAS列可用,可能需要调整四因子计算")

    # Short-chain acid factor (PFBA, PFPeA, PFHxA)
    short_term_cols = [col for col in ['PFBA', 'PFPeA', 'PFHxA'] if col in df.columns]
    df['Short_term_acid_factor'] = df[short_term_cols].mean(axis=1, skipna=True) if short_term_cols else 0

    # Long-chain acid factor (PFHpA, PFOA, PFNA, PFDA)
    long_term_cols = [col for col in ['PFHpA', 'PFOA', 'PFNA', 'PFDA'] if col in df.columns]
    df['Long_term_acid_factor'] = df[long_term_cols].mean(axis=1, skipna=True) if long_term_cols else 0

    # Sulfonate factor (PFBS, PFHxS, PFOS)
    sulfonate_cols = [col for col in ['PFBS', 'PFHxS', 'PFOS'] if col in df.columns]
    df['Sulfonate_factor'] = df[sulfonate_cols].mean(axis=1, skipna=True) if sulfonate_cols else 0

    # Exposure factor (total PFAS concentration)
    all_pfas_cols = [col for col in required_cols if col in df.columns]
    df['Exposure_factor'] = df[all_pfas_cols].sum(axis=1, skipna=True) if all_pfas_cols else 0
    return df


# Simple fixed-width table printer
def print_table(data, headers=None, title=None):
    if title:
        print(f"\n{title}")
        print("=" * 60)
    if headers:
        header_line = " | ".join(f"{h:>15}" for h in headers)
        print(header_line)
        print("-" * len(header_line))
    for row in data:
        if isinstance(row, (list, tuple)):
            print(" | ".join(f"{str(item):>15}" for item in row))
        else:  # DataFrame-like row
            print(" | ".join(f"{str(row[col]):>15}" for col in headers))


# Try several encodings that are common for Chinese CSV files
def read_csv_with_encodings(file_path):
    encodings = ['utf-8', 'gbk', 'gb2312', 'gb18030', 'latin1', 'cp936']
    for encoding in encodings:
        try:
            print(f"尝试使用 {encoding} 编码读取文件...")
            df = pd.read_csv(file_path, encoding=encoding)
            # Drop rows and columns that are entirely empty
            df = df.dropna(how='all', axis=0)
            df = df.dropna(how='all', axis=1)
            print(f"成功使用 {encoding} 编码读取文件")
            return df
        except UnicodeDecodeError:
            continue
        except Exception as e:
            print(f"使用 {encoding} 编码时出错: {e}")
            continue
    # Last resort: ignore undecodable bytes (pandas >= 1.3 uses encoding_errors)
    try:
        print("尝试使用错误处理方式读取文件...")
        df = pd.read_csv(file_path, encoding='utf-8', encoding_errors='ignore')
        df = df.dropna(how='all', axis=0)
        df = df.dropna(how='all', axis=1)
        print("使用错误处理方式成功读取文件")
        return df
    except Exception as e:
        print(f"所有读取尝试都失败: {e}")
        return None


# Load the CSV file, skipping completely empty rows and columns
file_path = r'E:\pycharm\meta\整合数据.csv'  # raw string for the Windows path
try:
    df = read_csv_with_encodings(file_path)
    if df is None:
        print("无法读取文件,请检查文件路径和格式")
        exit()
    print(f"数据形状: {df.shape}")
    print(f"列名: {df.columns.tolist()}")
except Exception as e:
    print(f"读取文件时出错: {e}")
    exit()

# ---------- Preprocessing ----------
# Look for the expected PFAS columns; fall back to fuzzy name matching
expected_features = ['PFBA', 'PFPeA', 'PFHxA', 'PFHpA', 'PFOA',
                     'PFNA', 'PFDA', 'PFBS', 'PFHxS', 'PFOS']
available_features = []
for feature in expected_features:
    if feature in df.columns:
        available_features.append(feature)
    else:
        matching_cols = [col for col in df.columns if feature.lower() in col.lower()]
        if matching_cols:
            available_features.extend(matching_cols)
            print(f"使用 '{matching_cols[0]}' 替代 '{feature}'")

# If no PFAS feature was found, use every numeric column
if not available_features:
    available_features = df.select_dtypes(include=[np.number]).columns.tolist()
    print(f"未找到PFAS特征,使用所有数值列: {available_features}")
print(f"可用的PFAS特征: {available_features}")

# Locate the target column
target_column = '城市'
if target_column not in df.columns:
    possible_targets = [col for col in df.columns
                        if any(word in col for word in ['城市', '地区', '区域', '地点', 'city', 'region'])]
    if possible_targets:
        target_column = possible_targets[0]
    else:
        non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns
        target_column = non_numeric_cols[0] if len(non_numeric_cols) > 0 else df.columns[-1]
    print(f"使用 '{target_column}' 作为目标变量")

# Fill missing feature values with the column median
for feature in available_features:
    if feature in df.columns:
        df[feature] = pd.to_numeric(df[feature], errors='coerce')
        df[feature] = df[feature].fillna(df[feature].median())

# Apply the four-factor theory and extend the feature list
df = apply_four_factor_theory(df)
features = available_features + ['Short_term_acid_factor', 'Long_term_acid_factor',
                                 'Sulfonate_factor', 'Exposure_factor']
print(f"最终使用的特征: {features}")

# Encode the (multi-class) target
le = LabelEncoder()
df[target_column] = le.fit_transform(df[target_column].fillna('Unknown'))
class_names = le.classes_

# Train/test split
X = df[features]
y = df[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"训练集大小: {X_train.shape}")
print(f"测试集大小: {X_test.shape}")
print(f"目标变量类别数: {len(np.unique(y))}")

# Standardize, then PCA keeping 95% of the variance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
print(f"PCA降维后特征数: {X_train_pca.shape[1]}")

# ========== Correlation analysis ==========
print("\n进行相关性分析...")
correlation_matrix = X.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('特征相关性热图')
plt.tight_layout()
plt.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

# Highly correlated feature pairs (|r| > 0.8)
high_corr_pairs = []
for i in range(len(correlation_matrix.columns)):
    for j in range(i + 1, len(correlation_matrix.columns)):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:
            high_corr_pairs.append((correlation_matrix.columns[i],
                                    correlation_matrix.columns[j],
                                    correlation_matrix.iloc[i, j]))
if high_corr_pairs:
    print("高度相关的特征对:")
    for pair in high_corr_pairs:
        print(f"  {pair[0]} 和 {pair[1]}: {pair[2]:.3f}")
else:
    print("没有发现高度相关的特征(相关系数>0.8)")

# ========== Meta analysis: distribution plots ==========
print("\n进行Meta分析...")

# 1. Target variable distribution
plt.figure(figsize=(10, 6))
df[target_column].value_counts().plot(kind='bar')
plt.title('目标变量分布')
plt.xlabel('类别')
plt.ylabel('样本数量')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('target_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# 2. Four-factor distributions
four_factor_cols = ['Short_term_acid_factor', 'Long_term_acid_factor', 'Sulfonate_factor', 'Exposure_factor']
if all(col in df.columns for col in four_factor_cols):
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    axes = axes.ravel()
    for i, col in enumerate(four_factor_cols):
        axes[i].hist(df[col], bins=30, alpha=0.7, color='skyblue', edgecolor='black')
        axes[i].set_title(f'{col} 分布')
        axes[i].set_xlabel(col)
        axes[i].set_ylabel('频率')
    plt.tight_layout()
    plt.savefig('four_factor_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()

# 3. Feature boxplot
plt.figure(figsize=(15, 10))
X.boxplot()
plt.title('特征箱线图')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('feature_boxplot.png', dpi=300, bbox_inches='tight')
plt.show()

# 4. PCA scatter plot (first two components)
if X_train_pca.shape[1] >= 2:
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1], c=y_train, cmap='viridis', alpha=0.7)
    plt.colorbar(scatter)
    plt.xlabel('第一主成分')
    plt.ylabel('第二主成分')
    plt.title('PCA降维可视化')
    plt.tight_layout()
    plt.savefig('pca_visualization.png', dpi=300, bbox_inches='tight')
    plt.show()

# 5. Cumulative explained variance
plt.figure(figsize=(10, 6))
pca_full = PCA().fit(X_train_scaled)
plt.plot(np.cumsum(pca_full.explained_variance_ratio_))
plt.xlabel('主成分数量')
plt.ylabel('累计方差解释率')
plt.title('PCA累计方差解释率')
plt.grid(True)
plt.tight_layout()
plt.savefig('pca_explained_variance.png', dpi=300, bbox_inches='tight')
plt.show()

# ========== Model training and evaluation ==========
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42, probability=True),
    'KNN': KNeighborsClassifier(),
    'NaiveBayes': GaussianNB(),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'MLP': MLPClassifier(max_iter=1000, random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'ExtraTrees': ExtraTreesClassifier(random_state=42),
}
if XGB_AVAILABLE:
    models['XGBoost'] = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
if LGBM_AVAILABLE:
    models['LightGBM'] = LGBMClassifier(random_state=42)
print(f"将训练 {len(models)} 个模型")

results = []
cv_results = []      # cross-validation scores per model
model_objects = {}   # fitted models

print("开始训练模型...")
for name, model in models.items():
    print(f"训练 {name}...")
    try:
        model.fit(X_train_pca, y_train)
        model_objects[name] = model

        y_pred = model.predict(X_test_pca)
        y_pred_proba = model.predict_proba(X_test_pca) if hasattr(model, "predict_proba") else None
        acc = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

        cv_scores = cross_val_score(model, X_train_pca, y_train, cv=5, scoring='accuracy')
        results.append([name, acc,
                        report['weighted avg']['precision'],
                        report['weighted avg']['recall'],
                        report['weighted avg']['f1-score'],
                        cv_scores.mean(), cv_scores.std()])
        cv_results.append({'model': name, 'scores': cv_scores})
    except Exception as e:
        print(f"训练模型 {name} 时出错: {e}")
        results.append([name, 0, 0, 0, 0, 0, 0])

# Results table sorted by accuracy
headers = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'CV Mean', 'CV Std']
results_df = pd.DataFrame(results, columns=headers).sort_values('Accuracy', ascending=False)
print("\n模型性能排名:")
print_table(results_df.values.tolist(), headers=headers, title="模型性能比较")

# ========== Model performance visualisation ==========
print("\n生成模型性能可视化...")

# 1. Test accuracy vs. cross-validation accuracy
plt.figure(figsize=(12, 8))
models_names = results_df['Model']
x = np.arange(len(models_names))
width = 0.35
plt.bar(x - width / 2, results_df['Accuracy'], width, label='测试集准确率', alpha=0.7)
plt.bar(x + width / 2, results_df['CV Mean'], width, yerr=results_df['CV Std'],
        label='交叉验证准确率', alpha=0.7, capsize=5)
plt.xlabel('模型')
plt.ylabel('准确率')
plt.title('模型性能比较')
plt.xticks(x, models_names, rotation=45)
plt.legend()
plt.tight_layout()
plt.savefig('model_performance_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# 2. Cross-validation boxplot
cv_df = pd.DataFrame({item['model']: item['scores'] for item in cv_results})
plt.figure(figsize=(12, 8))
cv_df.boxplot()
plt.title('模型交叉验证准确率分布')
plt.xticks(rotation=45)
plt.ylabel('准确率')
plt.tight_layout()
plt.savefig('cv_boxplot.png', dpi=300, bbox_inches='tight')
plt.show()

# 3. Detailed analysis of the best model
best_model_name = results_df.iloc[0]['Model']
best_model = model_objects[best_model_name]
print(f"\n对最佳模型 {best_model_name} 进行详细分析...")

# Confusion matrix
y_pred_best = best_model.predict(X_test_pca)
cm = confusion_matrix(y_test, y_pred_best)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title(f'{best_model_name} 混淆矩阵')
plt.xlabel('预测标签')
plt.ylabel('真实标签')
plt.tight_layout()
plt.savefig(f'confusion_matrix_{best_model_name}.png', dpi=300, bbox_inches='tight')
plt.show()

# Learning curve
def plot_learning_curve(estimator, title, X, y, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure(figsize=(10, 6))
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="训练得分")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="交叉验证得分")
    plt.xlabel("训练样本数")
    plt.ylabel("得分")
    plt.title(title)
    plt.legend(loc="best")
    plt.tight_layout()
    plt.savefig(f'learning_curve_{best_model_name}.png', dpi=300, bbox_inches='tight')
    plt.show()

plot_learning_curve(best_model, f'{best_model_name} 学习曲线', X_train_pca, y_train, cv=5)

# Feature importance (models that expose feature_importances_)
if hasattr(best_model, 'feature_importances_'):
    importances = best_model.feature_importances_
    indices = np.argsort(importances)[::-1]
    plt.figure(figsize=(10, 8))
    plt.title(f"{best_model_name} 特征重要性")
    plt.bar(range(min(20, len(importances))), importances[indices[:20]])
    plt.xticks(range(min(20, len(importances))), [f'PC{i + 1}' for i in indices[:20]], rotation=45)
    plt.tight_layout()
    plt.savefig(f'feature_importance_{best_model_name}.png', dpi=300, bbox_inches='tight')
    plt.show()

# ========== SHAP combined plots ==========
if SHAP_AVAILABLE:
    print("\n生成SHAP特征分析组合图...")
    # Only the best model gets SHAP plots, and only if it is tree-based
    if best_model_name in ['RandomForest', 'DecisionTree', 'GradientBoosting', 'XGBoost', 'LightGBM']:
        try:
            explainer = shap.TreeExplainer(best_model)
            shap_values = explainer.shap_values(X_test_pca)
            # For multi-class problems use the SHAP values of the first class
            shap_values_used = shap_values[0] if isinstance(shap_values, list) else shap_values
            pc_names = [f'PC{i + 1}' for i in range(X_test_pca.shape[1])]

            # Combined figure 1: global importance (left) and one sample's waterfall (right)
            fig = plt.figure(figsize=(20, 10))
            gs = gridspec.GridSpec(1, 2, width_ratios=[1, 1])
            ax1 = plt.subplot(gs[0])
            shap.summary_plot(shap_values_used, X_test_pca, plot_type="bar",
                              feature_names=pc_names, show=False, max_display=15)
            ax1.set_title(f'{best_model_name} - SHAP全局特征重要性', fontsize=16, fontweight='bold')

            ax2 = plt.subplot(gs[1])
            # Pick the sample with the largest total |SHAP|
            sample_idx = np.argmax(np.sum(np.abs(shap_values_used), axis=1))
            expected_value = (explainer.expected_value[0]
                              if isinstance(explainer.expected_value, (list, np.ndarray))
                              else explainer.expected_value)
            # Legacy waterfall call signature; newer shap versions expect an Explanation object instead
            shap.waterfall_plot(expected_value, shap_values_used[sample_idx],
                                feature_names=pc_names, show=False, max_display=15)
            ax2.set_title(f'{best_model_name} - 个体样本SHAP值分析\n(样本索引: {sample_idx})',
                          fontsize=16, fontweight='bold')
            plt.tight_layout()
            plt.savefig(f'shap_combined_{best_model_name}.png', dpi=300, bbox_inches='tight')
            plt.show()

            # Combined figure 2: summary plot (left) and dependence plot of the top feature (right)
            fig2 = plt.figure(figsize=(20, 10))
            gs2 = gridspec.GridSpec(1, 2, width_ratios=[1, 1])
            ax3 = plt.subplot(gs2[0])
            shap.summary_plot(shap_values_used, X_test_pca, feature_names=pc_names,
                              show=False, max_display=15)
            ax3.set_title(f'{best_model_name} - SHAP摘要图', fontsize=16, fontweight='bold')

            ax4 = plt.subplot(gs2[1])
            # Find the most important feature, from the model or from mean |SHAP|
            if hasattr(best_model, 'feature_importances_'):
                most_important_feature = np.argmax(best_model.feature_importances_)
            else:
                most_important_feature = np.argmax(np.mean(np.abs(shap_values_used), axis=0))
            shap.dependence_plot(most_important_feature, shap_values_used, X_test_pca,
                                 feature_names=pc_names, show=False, ax=ax4)
            ax4.set_title(f'{best_model_name} - SHAP依赖图\n(最重要的特征: PC{most_important_feature + 1})',
                          fontsize=16, fontweight='bold')
            plt.tight_layout()
            plt.savefig(f'shap_detailed_{best_model_name}.png', dpi=300, bbox_inches='tight')
            plt.show()
            print(f"已保存 SHAP 组合图: {best_model_name}")
        except Exception as e:
            print(f"生成SHAP组合图失败 for {best_model_name}: {e}")
    else:
        print(f"最佳模型 {best_model_name} 不支持TreeExplainer,跳过SHAP组合图")
else:
    print("\nSHAP不可用,跳过SHAP组合图")

# ========== Four-factor table ==========
if all(col in df.columns for col in four_factor_cols):
    four_factor_table = df[[target_column] + four_factor_cols].head(10).copy()
    # Decode the target back to the original labels
    four_factor_table[target_column] = le.inverse_transform(four_factor_table[target_column])
    print_table(four_factor_table.values.tolist(),
                headers=[target_column] + four_factor_cols,
                title="四因子理论表格 (前10行)")

# ========== PCA component table ==========
pca_components = pd.DataFrame(pca.components_, columns=features)
print("\nPCA主成分表格 (前5个主成分):")
pca_table = pca_components.head().copy()
pca_table.insert(0, 'PC', [f'PC{i + 1}' for i in range(len(pca_table))])
print_table(pca_table.values.tolist(), headers=['PC'] + features, title="PCA主成分权重")

# ========== Save results ==========
results_df.to_csv('model_results.csv', index=False)
print("\n模型结果已保存到 'model_results.csv'")

best_row = results_df.iloc[0]
print(f"\n最佳模型: {best_row['Model']} (准确率: {best_row['Accuracy']:.4f})")
print("\n所有图表已生成完成!")
```

Please refine and improve this code.
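One concrete, low-risk refinement is to stratify the train/test split and to evaluate each model with the scaler and PCA wrapped in a `Pipeline`, so preprocessing is re-fit inside every cross-validation fold and cannot leak information from the validation data. The sketch below assumes the `X`, `y`, and `models` objects defined above and that every class has at least two samples; it is one possible improvement, not the only one.

```python
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Stratified split keeps the class proportions of the multi-class target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for name, model in models.items():
    # Scaler and PCA are re-fit inside every fold, so nothing leaks from validation data
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('pca', PCA(n_components=0.95)),
                     ('clf', model)])
    scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring='accuracy')
    print(f"{name}: CV accuracy {scores.mean():.3f} ± {scores.std():.3f}")
```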
```python
# Imports
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import xgboost as xgb
import shap
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# Regression performance report
def performance_reg(model, X, y, metrics_name=None):
    y_pred = model.predict(X)
    if metrics_name:
        print(metrics_name, ":")
    print("Mean Squared Error (MSE): ", mean_squared_error(y, y_pred))
    print("Root Mean Squared Error (RMSE): ", np.sqrt(mean_squared_error(y, y_pred)))
    print("Mean Absolute Error (MAE): ", mean_absolute_error(y, y_pred))
    print("R2 Score: ", r2_score(y, y_pred))
    print("----------------------------")


# Data
data = pd.read_excel("BA建模数据集.xlsx")
data = data.dropna()
X = data.drop(['CA'], axis=1)
y = data['CA']
print("特征矩阵 X:\n", X.head())
print("\n目标变量 y:\n", y.head())

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2021, shuffle=True)

# XGBoost regressor
xgb_clf = xgb.XGBRegressor(objective='reg:squarederror',
                           booster='gbtree',
                           n_estimators=716,
                           max_depth=5,
                           reg_alpha=0.09,
                           min_child_weight=0.66,
                           colsample_bynode=0.65,
                           subsample=0.75,
                           learning_rate=0.164,
                           scale_pos_weight=1.073,
                           use_label_encoder=False)

# Train and evaluate
xgb_clf = xgb_clf.fit(X_train, y_train)
print("-----XGBOOST----")
performance_reg(xgb_clf, X_train, y_train, metrics_name='train')
performance_reg(xgb_clf, X_test, y_test, metrics_name='test')

# Export predictions
y_pred = xgb_clf.predict(X_test)
pd.DataFrame(y_pred, columns=['Prediction']).to_excel('predictions0910.xlsx', index=False)

# Correlation heatmap
plt.rcParams['font.family'] = ['serif', 'Microsoft YaHei']
corr_matrix = data[['RBC', 'MCV', 'PLT', 'ALP', 'Urea', 'UA', 'FBG', 'TC', 'TG',
                    'HDL', 'LDL', 'SBP', 'DBP']].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt='.2f', annot_kws={"size": 8})
plt.title('Correlation Heatmap')
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
plt.show()

# SHAP analysis
explainer = shap.TreeExplainer(xgb_clf)
shap_values = explainer.shap_values(X_test)
shap_df = pd.DataFrame(shap_values, columns=X_test.columns)
shap_df.to_csv('0924shap_values.csv', index=False)

# Global summary plots
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
shap.initjs()
max_display = len(X_test.columns)
shap.summary_plot(shap_values, X_test, max_display=max_display)
shap.summary_plot(shap_values, X_test)

# SHAP interaction values
shap_interaction_values = explainer.shap_interaction_values(X_test)
shap.summary_plot(shap_interaction_values, X_test)

# Mean |SHAP| ranking bar chart
abs_shap_values = np.abs(shap_values)
mean_abs_shap_values = np.mean(abs_shap_values, axis=0)
sorted_indices = np.argsort(mean_abs_shap_values)[::-1]
sorted_features = X_test.columns[sorted_indices]
sorted_mean_abs_shap_values = mean_abs_shap_values[sorted_indices]

plt.rcParams['font.family'] = ['serif', 'Microsoft YaHei']
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(range(len(sorted_features)), sorted_mean_abs_shap_values, align='center')
ax.set_yticks(range(len(sorted_features)))
ax.set_yticklabels(sorted_features)
ax.invert_yaxis()
plt.xlabel('mean(|SHAP value|)')
plt.ylabel('Features')
plt.title('SHAP MEAN')
# Annotate each bar with its value
for i, rect in enumerate(ax.patches):
    importance_value = round(sorted_mean_abs_shap_values[i], 5)
    ax.text(rect.get_width() + rect.get_x(), rect.get_y() + rect.get_height() / 2,
            importance_value, ha='left', va='center')
plt.tight_layout()
plt.show()

# SHAP dependence plots
shap.dependence_plot("X7", shap_values, X_test, interaction_index=None)
shap.dependence_plot("X4", shap_values, X_test, interaction_index=None)
shap.dependence_plot("X11", shap_values, X_test, interaction_index=None)
shap.dependence_plot("X5", shap_values, X_test, interaction_index=None)
shap.dependence_plot('X7', shap_values, X_test, interaction_index='X11')
shap.dependence_plot('X7', shap_values, X_test, interaction_index='X5')
shap.dependence_plot('X7', shap_values, X_test, interaction_index='X2')
shap.dependence_plot('X4', shap_values, X_test, interaction_index='X1')
print('end')
```

Based on this code, please add a plot comparing actual and predicted values.
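A minimal sketch of such a comparison is given below. It assumes the `xgb_clf`, `X_test`, and `y_test` objects from the script above; the two panels (a scatter against the y = x line and a sorted overlay of both series) are just one reasonable way to present the comparison.

```python
import numpy as np
import matplotlib.pyplot as plt

y_pred = xgb_clf.predict(X_test)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Left: predicted vs. actual scatter with the y = x reference line
axes[0].scatter(y_test, y_pred, alpha=0.6)
lims = [min(y_test.min(), y_pred.min()), max(y_test.max(), y_pred.max())]
axes[0].plot(lims, lims, 'r--', label='y = x')
axes[0].set_xlabel('Actual CA')
axes[0].set_ylabel('Predicted CA')
axes[0].set_title('Predicted vs. actual')
axes[0].legend()

# Right: both series sorted by the actual value, to show tracking behaviour
order = np.argsort(y_test.values)
axes[1].plot(y_test.values[order], label='Actual', marker='o', markersize=3, linewidth=1)
axes[1].plot(y_pred[order], label='Predicted', marker='x', markersize=3, linewidth=1)
axes[1].set_xlabel('Test samples (sorted by actual value)')
axes[1].set_ylabel('CA')
axes[1].set_title('Actual vs. predicted')
axes[1].legend()

plt.tight_layout()
plt.show()
```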
```python
import os
import json
import warnings
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from scipy import stats

warnings.filterwarnings('ignore')


class AirQualityDataAugmenter:
    """Air-quality data augmenter: expands an air-quality time series with several strategies."""

    def __init__(self, data_path):
        """data_path: path to the original Excel file."""
        self.data_path = data_path
        self.original_data = None
        self.enhanced_data = None
        self.load_data()

    def load_data(self):
        """Load and clean the original data."""
        print("正在加载原始数据...")
        self.original_data = pd.read_excel(self.data_path)
        # Make sure the date column is parsed as datetime
        self.original_data['Date'] = pd.to_datetime(self.original_data['Date'])
        # Handle missing values and outliers
        self.clean_data()
        print(f"原始数据形状: {self.original_data.shape}")
        print("数据列:", self.original_data.columns.tolist())
        print("\n前5行数据:")
        print(self.original_data.head())
        print("\n数据基本信息:")
        print(self.original_data.describe())

    def clean_data(self):
        """Fill missing values, fix negative/extreme values, and enforce PM10 >= PM2.5."""
        print("正在进行数据清洗...")
        missing_values = self.original_data.isnull().sum()
        if missing_values.sum() > 0:
            print(f"发现缺失值: {missing_values[missing_values > 0]}")
        numeric_cols = ['AQI', 'PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3_8h']
        # Fill missing pollutant values with the column median
        for col in numeric_cols:
            if self.original_data[col].isnull().sum() > 0:
                median_val = self.original_data[col].median()
                self.original_data[col].fillna(median_val, inplace=True)
                print(f"  使用中位数 {median_val:.2f} 填充 {col} 的缺失值")
        for col in numeric_cols:
            # Replace negative concentrations with the median
            negative_mask = self.original_data[col] < 0
            if negative_mask.sum() > 0:
                median_val = self.original_data[col].median()
                self.original_data.loc[negative_mask, col] = median_val
                print(f"  将 {col} 的 {negative_mask.sum()} 个负值替换为中位数 {median_val:.2f}")
            # Cap extreme values (3-sigma rule) at the 95th percentile
            mean_val = self.original_data[col].mean()
            std_val = self.original_data[col].std()
            upper_limit = mean_val + 3 * std_val
            extreme_mask = self.original_data[col] > upper_limit
            if extreme_mask.sum() > 0:
                p95 = self.original_data[col].quantile(0.95)
                self.original_data.loc[extreme_mask, col] = p95
                print(f"  将 {col} 的 {extreme_mask.sum()} 个极端值替换为95分位数 {p95:.2f}")
        # Enforce PM10 >= PM2.5
        pm_violations = self.original_data['PM10'] < self.original_data['PM2.5']
        if pm_violations.sum() > 0:
            self.original_data.loc[pm_violations, 'PM10'] = self.original_data.loc[pm_violations, 'PM2.5'] * 1.1
            print(f"  修正了 {pm_violations.sum()} 条 PM10 < PM2.5 的记录")
        print("数据清洗完成!")

    def add_time_features(self, df):
        """Add month/day/weekday/season features (used internally only)."""
        df_copy = df.copy()
        df_copy['month'] = df_copy['Date'].dt.month
        df_copy['day'] = df_copy['Date'].dt.day
        df_copy['day_of_week'] = df_copy['Date'].dt.dayofweek
        df_copy['is_weekend'] = (df_copy['day_of_week'] >= 5).astype(int)
        df_copy['season'] = (df_copy['month'] % 12 + 3) // 3  # 1: spring, 2: summer, 3: autumn, 4: winter
        # Make sure the time features contain no NaN
        for feature in ['month', 'day', 'day_of_week', 'is_weekend', 'season']:
            if df_copy[feature].isnull().sum() > 0:
                df_copy[feature].fillna(df_copy[feature].mode()[0], inplace=True)
        return df_copy

    def safe_monthly_stats(self, df, pollutant_cols):
        """Monthly mean/std per pollutant, falling back to global statistics for missing months."""
        full_months = pd.DataFrame({'month': range(1, 13)})
        monthly_means = df.groupby('month')[pollutant_cols].mean().reset_index()
        monthly_stds = df.groupby('month')[pollutant_cols].std().reset_index()
        # Merge against the full month index and fill the gaps with global statistics
        monthly_means = full_months.merge(monthly_means, on='month', how='left')
        monthly_stds = full_months.merge(monthly_stds, on='month', how='left')
        global_means = df[pollutant_cols].mean()
        global_stds = df[pollutant_cols].std()
        for col in pollutant_cols:
            monthly_means[col].fillna(global_means[col], inplace=True)
            monthly_stds[col].fillna(global_stds[col], inplace=True)
        monthly_means.set_index('month', inplace=True)
        monthly_stds.set_index('month', inplace=True)
        return monthly_means, monthly_stds

    def gaussian_noise_augmentation(self, df, num_copies=3, noise_level=0.05):
        """Gaussian-noise augmentation, shifting dates into the past."""
        augmented_dfs = []
        numeric_cols = ['AQI', 'PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3_8h']
        for i in range(num_copies):
            augmented_df = df.copy()
            for col in numeric_cols:
                std_dev = df[col].std() * noise_level
                noise = np.random.normal(0, std_dev, len(df))
                augmented_df[col] = np.maximum(df[col] + noise, 0)  # keep values non-negative
            # Shift dates backwards by whole years
            augmented_df['Date'] = df['Date'] - timedelta(days=365 * (i + 1))
            augmented_df = augmented_df[self.original_data.columns]
            augmented_dfs.append(augmented_df)
        return pd.concat(augmented_dfs, ignore_index=True)

    def seasonal_augmentation(self, df, seasonal_cycles=2):
        """Seasonal-pattern augmentation, shifting dates into the past."""
        augmented_dfs = []
        pollutant_cols = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3_8h']
        df_with_features = self.add_time_features(df)
        monthly_means, monthly_stds = self.safe_monthly_stats(df_with_features, pollutant_cols)
        for cycle in range(1, seasonal_cycles + 1):
            augmented_df = df.copy()
            augmented_df_with_features = self.add_time_features(augmented_df)
            for idx, row in augmented_df_with_features.iterrows():
                month = row['month']
                if pd.isna(month) or month < 1 or month > 12:
                    month = np.random.randint(1, 13)  # fall back to a random valid month
                for col in pollutant_cols:
                    try:
                        # Draw around the monthly mean, then blend with the current value
                        seasonal_factor = np.random.normal(monthly_means.loc[month, col],
                                                           monthly_stds.loc[month, col] * 0.3)
                        adjustment = 0.7 * row[col] + 0.3 * seasonal_factor
                        augmented_df.loc[idx, col] = max(adjustment, 0)
                    except KeyError:
                        # Fall back to the global mean if the month lookup fails
                        augmented_df.loc[idx, col] = df[col].mean()
            augmented_df['AQI'] = self.calculate_aqi_from_pm25(augmented_df['PM2.5'])
            augmented_df['Date'] = df['Date'] - timedelta(days=365 * cycle)
            augmented_df = augmented_df[self.original_data.columns]
            augmented_dfs.append(augmented_df)
        return pd.concat(augmented_dfs, ignore_index=True)

    def calculate_aqi_from_pm25(self, pm25_values):
        """Simplified AQI computed from PM2.5 only (piecewise linear segments)."""
        aqi_values = []
        for pm25 in pm25_values:
            if pd.isna(pm25):
                aqi_values.append(50)  # default value
                continue
            if pm25 <= 35:
                aqi = (pm25 / 35) * 50
            elif pm25 <= 75:
                aqi = 50 + (pm25 - 35) / 40 * 50
            elif pm25 <= 115:
                aqi = 100 + (pm25 - 75) / 40 * 50
            elif pm25 <= 150:
                aqi = 150 + (pm25 - 115) / 35 * 50
            elif pm25 <= 250:
                aqi = 200 + (pm25 - 150) / 100 * 100
            else:
                aqi = 300 + (pm25 - 250) / 150 * 100
            aqi_values.append(min(aqi, 500))  # AQI is capped at 500
        return np.array(aqi_values)

    def correlation_preserving_augmentation(self, df, num_copies=2):
        """Perturb samples in principal-component space so pollutant correlations are preserved."""
        augmented_dfs = []
        pollutant_cols = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3_8h']
        df_clean = df[pollutant_cols].dropna()
        if len(df_clean) < len(df):
            print(f"  警告: 丢弃了 {len(df) - len(df_clean)} 条包含NaN值的记录")
        if len(df_clean) < 2:
            print("  错误: 没有足够的数据计算相关性,跳过此增强")
            return df
        try:
            correlation_matrix = df_clean.corr()
            eigenvalues, eigenvectors = np.linalg.eig(correlation_matrix)
        except (np.linalg.LinAlgError, ValueError) as e:
            print(f"  计算相关性时出错: {e},跳过此增强")
            return df
        for copy_idx in range(num_copies):
            augmented_df = df.copy()
            for i in range(len(augmented_df)):
                original_values = augmented_df[pollutant_cols].iloc[i].values
                if np.any(pd.isna(original_values)):
                    continue
                try:
                    # Project into PC space, add eigenvalue-scaled noise, project back
                    pc_space = eigenvectors.T @ original_values
                    pc_noise = np.random.normal(0, 0.1, len(pc_space)) * np.sqrt(np.abs(eigenvalues))
                    perturbed_values = eigenvectors @ (pc_space + pc_noise)
                    perturbed_values = np.maximum(perturbed_values, 0)
                    if len(perturbed_values) > 1:
                        # Keep PM10 above PM2.5
                        perturbed_values[1] = max(perturbed_values[1], perturbed_values[0] * 1.1)
                    for j, col in enumerate(pollutant_cols):
                        augmented_df.loc[i, col] = perturbed_values[j]
                except Exception:
                    continue  # keep the original row if anything goes wrong
            augmented_df['AQI'] = self.calculate_aqi_from_pm25(augmented_df['PM2.5'])
            augmented_df['Date'] = df['Date'] - timedelta(days=365 * (copy_idx + 1))
            augmented_df = augmented_df[self.original_data.columns]
            augmented_dfs.append(augmented_df)
        return pd.concat(augmented_dfs, ignore_index=True)

    def trend_based_augmentation(self, df, trend_factors=[0.8, 1.0, 1.2]):
        """Scale pollutant levels by trend factors, shifting dates into the past."""
        augmented_dfs = []
        pollutant_cols = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3_8h']
        for factor in trend_factors:
            augmented_df = df.copy()
            augmented_df[pollutant_cols] = (df[pollutant_cols] * factor).clip(lower=0)
            augmented_df['PM10'] = np.maximum(augmented_df['PM10'], augmented_df['PM2.5'] * 1.1)
            augmented_df['AQI'] = self.calculate_aqi_from_pm25(augmented_df['PM2.5'])
            # Different backwards offsets for the different factors
            if factor < 1.0:
                date_offset = timedelta(days=365 * 2)
            elif factor > 1.0:
                date_offset = timedelta(days=365 * 3)
            else:
                date_offset = timedelta(days=0)
            augmented_df['Date'] = df['Date'] - date_offset
            augmented_df = augmented_df[self.original_data.columns]
            augmented_dfs.append(augmented_df)
        return pd.concat(augmented_dfs, ignore_index=True)

    def weather_pattern_augmentation(self, df, pattern_types=3):
        """Apply stylised weather patterns (stagnant, windy, rainy) to the pollutant levels."""
        augmented_dfs = []
        pollutant_cols = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3_8h']
        weather_patterns = [
            {'name': '静稳天气', 'factors': {'PM2.5': 1.3, 'PM10': 1.2, 'O3_8h': 0.8}},
            {'name': '大风天气', 'factors': {'PM2.5': 0.7, 'PM10': 0.9, 'O3_8h': 1.1}},
            {'name': '降水天气', 'factors': {'PM2.5': 0.6, 'PM10': 0.7, 'O3_8h': 0.9}},
        ]
        for i, pattern in enumerate(weather_patterns[:pattern_types]):
            augmented_df = df.copy()
            for pollutant, factor in pattern['factors'].items():
                augmented_df[pollutant] = df[pollutant] * factor
            augmented_df[pollutant_cols] = augmented_df[pollutant_cols].clip(lower=0)
            augmented_df['PM10'] = np.maximum(augmented_df['PM10'], augmented_df['PM2.5'] * 1.1)
            augmented_df['AQI'] = self.calculate_aqi_from_pm25(augmented_df['PM2.5'])
            augmented_df['Date'] = df['Date'] - timedelta(days=365 * (i + 1))
            augmented_df = augmented_df[self.original_data.columns]
            augmented_dfs.append(augmented_df)
        return pd.concat(augmented_dfs, ignore_index=True)

    def composite_augmentation(self, df, target_multiplier=10):
        """Alternate between strategies until the data set reaches the target size."""
        print(f"开始复合数据增强,目标扩充倍数: {target_multiplier}x")
        target_size = len(df) * target_multiplier
        enhanced_data = df.copy()
        round_num = 0
        while len(enhanced_data) < target_size and round_num < 20:  # hard cap to avoid endless loops
            round_num += 1
            print(f"\n第 {round_num} 轮增强...")
            try:
                # Alternate between the augmentation strategies
                if round_num % 4 == 0:
                    new_data = self.gaussian_noise_augmentation(enhanced_data, num_copies=1)
                elif round_num % 4 == 1:
                    new_data = self.correlation_preserving_augmentation(enhanced_data, num_copies=1)
                elif round_num % 4 == 2:
                    new_data = self.seasonal_augmentation(enhanced_data, seasonal_cycles=1)
                else:
                    new_data = self.weather_pattern_augmentation(enhanced_data, pattern_types=1)
                if len(new_data) == 0:
                    print("  新数据为空,跳过此轮")
                    continue
                # Merge and deduplicate on the date column
                before_merge = len(enhanced_data)
                enhanced_data = pd.concat([enhanced_data, new_data], ignore_index=True)
                enhanced_data = enhanced_data.drop_duplicates(subset=['Date'], keep='first')
                print(f"  当前数据量: {len(enhanced_data)} (本轮新增: {len(enhanced_data) - before_merge})")
            except Exception as e:
                print(f"  第 {round_num} 轮增强出错: {e}")
                continue
        print(f"增强完成,共进行 {round_num} 轮")
        return enhanced_data

    def validate_enhanced_data(self, original_df, enhanced_df):
        """Sanity checks: NaN counts, statistics, physical constraints, date range."""
        print("\n" + "=" * 50)
        print("增强数据质量验证")
        print("=" * 50)
        nan_counts = enhanced_df.isnull().sum()
        if nan_counts.sum() > 0:
            print(f"警告: 增强数据中存在NaN值: {nan_counts[nan_counts > 0]}")
        numeric_cols = ['AQI', 'PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3_8h']
        print("原始数据统计:")
        print(original_df[numeric_cols].describe())
        print("\n增强数据统计:")
        print(enhanced_df[numeric_cols].describe())
        pm_ratio_violations = len(enhanced_df[enhanced_df['PM10'] < enhanced_df['PM2.5']])
        negative_values = len(enhanced_df[(enhanced_df[numeric_cols] < 0).any(axis=1)])
        print("\n物理合理性检查:")
        print(f"PM10 < PM2.5 的违规数量: {pm_ratio_violations}")
        print(f"负值出现次数: {negative_values}")
        print("\n时间范围检查:")
        print(f"原始数据时间范围: {original_df['Date'].min()} 到 {original_df['Date'].max()}")
        print(f"增强数据时间范围: {enhanced_df['Date'].min()} 到 {enhanced_df['Date'].max()}")
        print(f"时间方向: {'向前增强' if enhanced_df['Date'].min() < original_df['Date'].min() else '向后增强'}")
        return True

    def enhance_data(self, enhancement_method='composite', target_multiplier=8):
        """Run the selected augmentation strategy and merge the result with the original data."""
        print("\n" + "=" * 50)
        print("开始数据增强(向前增强模式)")
        print("=" * 50)
        if enhancement_method == 'gaussian':
            self.enhanced_data = self.gaussian_noise_augmentation(self.original_data, num_copies=target_multiplier - 1)
        elif enhancement_method == 'seasonal':
            self.enhanced_data = self.seasonal_augmentation(self.original_data, seasonal_cycles=target_multiplier - 1)
        elif enhancement_method == 'correlation':
            self.enhanced_data = self.correlation_preserving_augmentation(self.original_data, num_copies=target_multiplier - 1)
        elif enhancement_method == 'composite':
            self.enhanced_data = self.composite_augmentation(self.original_data, target_multiplier=target_multiplier)
        else:
            raise ValueError(f"不支持的增强方法: {enhancement_method}")
        # Merge with the original data, sort by date and keep only the original columns
        combined_data = pd.concat([self.enhanced_data, self.original_data], ignore_index=True)
        combined_data = combined_data.sort_values('Date').reset_index(drop=True)
        self.enhanced_data = combined_data[self.original_data.columns]
        print("\n增强完成!")
        print(f"原始数据量: {len(self.original_data)}")
        print(f"增强后数据量: {len(self.enhanced_data)}")
        print(f"实际扩充倍数: {len(self.enhanced_data) / len(self.original_data):.2f}x")
        self.validate_enhanced_data(self.original_data, self.enhanced_data)
        return self.enhanced_data

    def save_enhanced_data(self, output_path=None):
        """Save the augmented data plus a small JSON file describing the run."""
        if self.enhanced_data is None:
            raise ValueError("请先执行数据增强")
        if output_path is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_path = f"空气质量数据_向前增强_{timestamp}.xlsx"
        self.enhanced_data = self.enhanced_data.dropna()  # final cleanup: drop any remaining NaN rows
        self.enhanced_data.to_excel(output_path, index=False)
        print(f"\n增强数据已保存到: {output_path}")
        info = {
            'original_file': self.data_path,
            'enhancement_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'original_size': len(self.original_data),
            'enhanced_size': len(self.enhanced_data),
            'enhancement_ratio': len(self.enhanced_data) / len(self.original_data),
            'enhancement_direction': 'forward',  # dates are extended into the past
            'output_file': output_path,
        }
        info_path = output_path.replace('.xlsx', '_info.json')
        with open(info_path, 'w', encoding='utf-8') as f:
            json.dump(info, f, indent=4, ensure_ascii=False)
        print(f"处理信息已保存到: {info_path}")
        return output_path


def main():
    """Entry point: configure, augment, and save."""
    DATA_FILE = "空气质量数据全new.xlsx"
    ENHANCEMENT_METHOD = "composite"   # options: gaussian, seasonal, correlation, composite
    TARGET_MULTIPLIER = 8              # target expansion factor
    print("空气质量数据增强工具(向前增强模式)")
    print("=" * 50)
    try:
        augmenter = AirQualityDataAugmenter(DATA_FILE)
        enhanced_data = augmenter.enhance_data(enhancement_method=ENHANCEMENT_METHOD,
                                               target_multiplier=TARGET_MULTIPLIER)
        output_file = augmenter.save_enhanced_data()
        print("\n" + "=" * 50)
        print("数据增强完成!")
        print(f"原始数据: {len(augmenter.original_data)} 条记录")
        print(f"增强数据: {len(enhanced_data)} 条记录")
        print("时间方向: 向前增强(扩展到过去的时间)")
        print(f"输出文件: {output_file}")
        print("=" * 50)
    except Exception as e:
        print(f"程序执行出错: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
```

Please organise the code above by its logic and explain each part.
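The logic of the class is: load and clean the Excel file, generate augmented copies with one of four strategies (Gaussian noise, seasonal patterns, correlation-preserving perturbation in principal-component space, stylised weather patterns) or with a composite loop that alternates between them, merge the copies with the original rows while shifting their dates into the past, validate the result, and save it together with a JSON run summary. A minimal usage sketch, assuming the class definition above and the file name used in `main()`, shows how the class can be driven directly:

```python
# Build the augmenter; loading and cleaning happen in the constructor
augmenter = AirQualityDataAugmenter("空气质量数据全new.xlsx")

# Run a single strategy instead of the composite pipeline, e.g. Gaussian noise with 3 copies
noisy = augmenter.gaussian_noise_augmentation(augmenter.original_data, num_copies=3, noise_level=0.05)
print(len(augmenter.original_data), "original rows ->", len(noisy), "augmented rows")

# Or run the full composite pipeline and save the merged result
augmenter.enhance_data(enhancement_method='composite', target_multiplier=8)
augmenter.save_enhanced_data("augmented_air_quality.xlsx")
```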
```python
import warnings
from math import pi

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import rcParams
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import silhouette_samples, silhouette_score

warnings.filterwarnings('ignore')

# Chinese label support
rcParams['font.sans-serif'] = ['SimHei']
rcParams['axes.unicode_minus'] = False

# Load data
file_path = 'C:/Users/yuanx/Desktop/数据/附件1.xlsx'
df = pd.read_excel(file_path)

# Columns used for clustering
numeric_columns = ['abs赔付差额', 'abs赔付差额/实际赔付金额', 'log(实际赔付差额)']
categorical_columns = ['异常原因', '商品类型']

# Handle missing values
imputer_cat = SimpleImputer(strategy='constant', fill_value='未知')
df['异常原因'] = imputer_cat.fit_transform(df[['异常原因']]).ravel()
df = df.dropna(subset=numeric_columns).copy()

# Preprocessing: standardize numeric columns, one-hot encode categorical columns
scaler = StandardScaler()
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop=None)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numeric_columns),
        ('cat', Pipeline(steps=[('imputer', imputer_cat), ('encoder', encoder)]), categorical_columns),
    ],
    remainder='drop'
)

# Clustering pipeline
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('kmeans', kmeans)])

# Fit, get labels and the transformed feature matrix
labels = pipeline.fit_predict(df)
X_transformed = pipeline[:-1].transform(df)  # use transform (not fit_transform) after fitting

# ========================
# 1. Silhouette plot
# ========================
fig, ax1 = plt.subplots(1, 1, figsize=(8, 6))
silhouette_avg = silhouette_score(X_transformed, labels)
sample_silhouette_values = silhouette_samples(X_transformed, labels)
y_lower = 10
for i in range(n_clusters):
    ith_cluster_silhouette_values = sample_silhouette_values[labels == i]
    ith_cluster_silhouette_values.sort()
    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i
    color = cm.nipy_spectral(float(i) / n_clusters)
    ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values,
                      facecolor=color, edgecolor=color, alpha=0.7)
    ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
    y_lower = y_upper + 10
ax1.set_title("轮廓图(Silhouette Plot)")
ax1.set_xlabel("轮廓系数值")
ax1.set_ylabel("簇标签")
ax1.axvline(x=silhouette_avg, color="red", linestyle="--", label=f'平均轮廓系数: {silhouette_avg:.2f}')
ax1.legend()
plt.tight_layout()
plt.show()

# ========================
# 2. Map cluster centers back to the original feature space for interpretability
# ========================
cluster_centers_encoded = pipeline.named_steps['kmeans'].cluster_centers_

def inverse_transform_cluster_centers(preprocessor, cluster_centers, numeric_cols, cat_cols):
    """Convert centers from the scaled/encoded space back to interpretable values."""
    # Numeric block: invert the StandardScaler
    scaler = preprocessor.transformers_[0][1]
    num_encoded_features = scaler.n_features_in_
    centers_numeric_scaled = cluster_centers[:, :num_encoded_features]
    centers_numeric_raw = scaler.inverse_transform(centers_numeric_scaled)

    # Categorical block: pick the category with the highest one-hot weight per cluster (soft assignment)
    encoder = preprocessor.transformers_[1][1]['encoder']
    cat_encoded_array = cluster_centers[:, num_encoded_features:]
    cat_data_start_idx = 0
    centers_categorical_raw = []
    for i, cat_col in enumerate(cat_cols):
        n_classes = encoder.categories_[i].shape[0]
        cat_end_idx = cat_data_start_idx + n_classes
        cat_vectors = cat_encoded_array[:, cat_data_start_idx:cat_end_idx]
        predicted_classes = [encoder.categories_[i][np.argmax(vec)] for vec in cat_vectors]
        centers_categorical_raw.append(predicted_classes)
        cat_data_start_idx = cat_end_idx

    numeric_df = pd.DataFrame(centers_numeric_raw, columns=numeric_cols)
    categorical_df = pd.DataFrame(centers_categorical_raw, index=cat_cols).T
    return pd.concat([numeric_df, categorical_df], axis=1)

cluster_centers_original = inverse_transform_cluster_centers(
    preprocessor, cluster_centers_encoded, numeric_columns, categorical_columns)
print("各簇聚类中心(原始尺度):")
print(cluster_centers_original.round(4))

# ========================
# 3. Radar chart of the cluster centers (good for showing many features at once)
# ========================
def plot_radar_charts(data_df, title="聚类中心雷达图"):
    categories = data_df.columns.tolist()
    N = len(categories)
    angles = [n / float(N) * 2 * pi for n in range(N)]
    angles += angles[:1]  # close the polygon

    fig, axes = plt.subplots(1, n_clusters, subplot_kw=dict(polar=True), figsize=(15, 6))
    if n_clusters == 1:
        axes = [axes]

    # Normalise features so no single feature dominates the chart
    normalized_df = data_df.copy()
    for col in numeric_columns:
        min_val, max_val = normalized_df[col].min(), normalized_df[col].max()
        normalized_df[col] = (normalized_df[col] - min_val) / (max_val - min_val) if max_val != min_val else 0.5
    for col in categorical_columns:
        # Encode categories as ordinal codes, then rescale to [0, 1]
        le = pd.Categorical(normalized_df[col]).codes
        normalized_df[col] = (le - le.min()) / (le.max() - le.min() + 1e-8)

    for i, ax in enumerate(axes):
        values = normalized_df.iloc[i].values.flatten().tolist()
        values += values[:1]
        ax.plot(angles, values, linewidth=2, linestyle='solid', label=f'簇 {i}')
        ax.fill(angles, values, alpha=0.3)
        ax.set_thetagrids([a * 180 / pi for a in angles[:-1]], categories)
        ax.set_title(f"簇 {i}", size=14, pad=20)
        ax.grid(True)
    fig.suptitle(title, fontsize=16, y=1.05)
    plt.tight_layout()
    plt.show()

plot_radar_charts(cluster_centers_original[numeric_columns + categorical_columns])

# ========================
# 4. 2-D scatter of the first two encoded dimensions (note: these are not principal components)
# ========================
plt.figure(figsize=(8, 6))
plt.scatter(X_transformed[:, 0], X_transformed[:, 1], c=labels, cmap='viridis', alpha=0.6)
center_proj = pipeline.named_steps['kmeans'].cluster_centers_  # coordinates in the encoded space
plt.scatter(center_proj[:, 0], center_proj[:, 1], marker='X', s=200, c='red', label='聚类中心(编码空间)')
plt.title('聚类结果投影(前两个维度)')
plt.xlabel('abs赔付差额(标准化)')
plt.ylabel('abs赔付差额/实际赔付金额(标准化)')
plt.legend()
plt.grid(True)
plt.show()
```

Before clustering, use a random forest to compute feature importances and feed those importances into the clustering as feature weights.
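One way to honour this request is sketched below: fit a random forest to score the encoded features, then scale each column of the standardized/encoded matrix by its importance before re-running KMeans. Because clustering has no ground-truth labels, the sketch uses the first-pass `labels` from the script above as a pseudo-target (an existing label column such as `异常原因` could be used instead); the importance-as-weight scheme itself is an assumption, one of several reasonable weighting choices, not the only correct one. It assumes `X_transformed`, `labels`, and `n_clusters` from the script above.

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans

# 1. Fit a random forest on the preliminary cluster labels to score each encoded feature.
#    (Any existing supervised label column could be used instead of `labels`.)
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_transformed, labels)
importances = rf.feature_importances_  # non-negative, sums to 1 across encoded features

# 2. Use the importances as per-feature weights on the standardized/encoded matrix.
#    np.sqrt(importances) is a softer alternative, since KMeans works with squared distances.
X_weighted = X_transformed * importances  # broadcast over columns

# 3. Re-run KMeans on the weighted matrix.
kmeans_weighted = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
labels_weighted = kmeans_weighted.fit_predict(X_weighted)

print("标签分布(加权前 / 加权后):")
print(np.bincount(labels), np.bincount(labels_weighted))
```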