Please analyze the following code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import xgboost as xgb
from matplotlib.gridspec import GridSpec
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
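# This script trains an XGBoost regressor and uses SHAP (TreeExplainer) to draw
# partial-dependence-style plots of three predictors (Csf, Csq, Cst) against the outcome ND.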
class SHAPPartialDependenceAnalyzer:
    """
    XGBoost + SHAP partial dependence analyzer.
    Analyzes the effect of three independent variables on the dependent variable ND.
    """
    def __init__(self, fitting_level='medium', smoothing_level='high'):
        """
        Initialize the analyzer.
        Parameters:
        fitting_level: str, model fit level
            - 'low': low fit (guards against overfitting; suited to small samples)
            - 'medium': moderate fit (balanced; recommended)
            - 'high': high fit (fine-grained fit; suited to large samples)
            - 'custom': custom parameters
        smoothing_level: str, trend-line smoothing level
            - 'low': keeps more detail, follows the data's fluctuations
            - 'medium': moderate smoothing
            - 'high': heavy smoothing, shows mainly the broad trend (recommended)
        """
        # Configure matplotlib so Chinese characters display correctly
        plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial']
        plt.rcParams['axes.unicode_minus'] = False
        # Fit-level configurations
self.fitting_level = fitting_level
self.smoothing_level = smoothing_level
self.model_configs = {
'low': {
'n_estimators': 150,
'learning_rate': 0.1,
'max_depth': 4,
'subsample': 0.7,
'colsample_bytree': 0.7,
'gamma': 0.3,
'min_child_weight': 5,
'reg_alpha': 0.3,
'reg_lambda': 2
},
'medium': {
'n_estimators': 300,
'learning_rate': 0.05,
'max_depth': 6,
'subsample': 0.8,
'colsample_bytree': 0.8,
'gamma': 0.1,
'min_child_weight': 3,
'reg_alpha': 0.1,
'reg_lambda': 1
},
'high': {
'n_estimators': 500,
'learning_rate': 0.02,
'max_depth': 8,
'subsample': 0.9,
'colsample_bytree': 0.9,
'gamma': 0.05,
'min_child_weight': 1,
'reg_alpha': 0.05,
'reg_lambda': 0.5
}
}
        # Color scheme: one distinct color per predictor
        self.colors = {
            'Csf': '#FF6B6B',  # coral red
            'Csq': '#4ECDC4',  # teal
            'Cst': '#45B7D1'   # sky blue
        }
        # Independent and dependent variables
        self.independent_vars = ['Csf', 'Csq', 'Cst']
        self.dependent_var = 'ND'
        # Control variables (every column except id, the predictors, and the outcome group)
        self.control_vars = None  # identified automatically when the data are loaded
    def load_and_prepare_data(self, file_path):
        """
        Load and preprocess the data.
        """
        print("Loading data...")
        # Read the CSV file
        try:
            df = pd.read_csv(file_path)
            print(f"Data loaded: {len(df)} rows, {len(df.columns)} columns")
        except Exception as e:
            print(f"Failed to load data: {e}")
            return None
        # Check that the required columns are present
        required_cols = self.independent_vars + [self.dependent_var]
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            print(f"Missing required columns: {missing_cols}")
            return None
        # Automatically identify the control variables
        exclude_cols = ['id'] + self.independent_vars + [self.dependent_var]
        # Also exclude the other outcome variables (ND appears here too, but it is already excluded above)
        other_dependent_vars = ['NI', 'ND', 'PcaF1', 'PcaF2', 'ProF1', 'ProF2']
        exclude_cols.extend(other_dependent_vars)
        self.control_vars = [col for col in df.columns if col not in exclude_cols]
        print(f"Identified control variables: {self.control_vars}")
        # Data cleaning
        print("Starting data preprocessing...")
        # Drop the id column if present
        if 'id' in df.columns:
            df = df.drop('id', axis=1)
        # Keep only the columns needed for the analysis
        required_columns = self.independent_vars + [self.dependent_var] + self.control_vars
        df = df[required_columns]
        # Handle missing values
        initial_rows = len(df)
        df = df.dropna()
        final_rows = len(df)
        print(f"After dropping rows with missing values: {initial_rows} rows reduced to {final_rows}")
        # Check data types and convert object columns to numeric where possible
        for col in df.columns:
            if df[col].dtype == 'object':
                try:
                    df[col] = pd.to_numeric(df[col])
                except (ValueError, TypeError):
                    print(f"Warning: column {col} could not be converted to a numeric type")
        print("Data preprocessing complete!")
        print(f"Final data shape: {df.shape}")
        # Show basic descriptive statistics
        print("\nDescriptive statistics for the independent variables:")
        print(df[self.independent_vars].describe())
        print(f"\nDescriptive statistics for the dependent variable {self.dependent_var}:")
        print(df[self.dependent_var].describe())
        return df
    def create_optimized_model(self, X, y):
        """
        Create and train an optimized XGBoost regression model.
        """
        print("Creating and training the XGBoost model...")
        # Split into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        # Pick the XGBoost hyperparameters for the chosen fit level
        if self.fitting_level in self.model_configs:
            config = self.model_configs[self.fitting_level]
            print(f"Using the '{self.fitting_level}' fit configuration")
        else:
            # Unknown/custom level: fall back to the default 'medium' configuration
            config = self.model_configs['medium']
            print("Using custom configuration (defaulting to 'medium')")
model = xgb.XGBRegressor(
n_estimators=config['n_estimators'],
learning_rate=config['learning_rate'],
max_depth=config['max_depth'],
subsample=config['subsample'],
colsample_bytree=config['colsample_bytree'],
gamma=config['gamma'],
min_child_weight=config['min_child_weight'],
reg_alpha=config['reg_alpha'],
reg_lambda=config['reg_lambda'],
random_state=42,
n_jobs=-1
)
        # Train the model
        model.fit(X_train, y_train)
        # Evaluate the model
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)
        train_r2 = r2_score(y_train, y_pred_train)
        test_r2 = r2_score(y_test, y_pred_test)
        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
        test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
        print(f"\nModel performance (fit level: {self.fitting_level}):")
        print(f"Train R²: {train_r2:.4f}, RMSE: {train_rmse:.4f}")
        print(f"Test R²: {test_r2:.4f}, RMSE: {test_rmse:.4f}")
        print(f"Overfitting gap (train R² - test R²): {train_r2 - test_r2:.4f} (smaller is better)")
        # Feature importance
        feature_importance = pd.DataFrame({
            'Feature': X.columns,
            'Importance': model.feature_importances_
        }).sort_values('Importance', ascending=False)
        print("\nTop 10 most important features:")
        print(feature_importance.head(10))
        return model, feature_importance
    def create_shap_partial_dependence_plots(self, df, save_path=None):
        """
        Create SHAP partial dependence plots: one combined figure for the three independent variables.
        """
        print("\nCreating the SHAP partial dependence plots...")
        # Prepare the data
        X = df[self.independent_vars + self.control_vars]
        y = df[self.dependent_var]
        print(f"Feature matrix shape: {X.shape}")
        print(f"Target variable shape: {y.shape}")
        # Create and train the model
        model, feature_importance = self.create_optimized_model(X, y)
        # Build the SHAP explainer
        print("Building the SHAP explainer...")
        explainer = shap.TreeExplainer(model)
        # Use all samples for the SHAP analysis
        X_sample = X
        print(f"Using all {len(X_sample)} samples for the SHAP analysis")
        # Compute SHAP values
        print("Computing SHAP values...")
        shap_values = explainer.shap_values(X_sample)
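        # shap_values has shape (n_samples, n_features); column j holds each sample's SHAP contribution for feature j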
        # Lay out a 2x2 grid of subplots, using only three of the positions
        fig = plt.figure(figsize=(16, 12))
        gs = GridSpec(2, 2, figure=fig, wspace=0.25, hspace=0.35)
        # Create a partial dependence plot for each independent variable
        positions = [(0, 0), (0, 1), (1, 0)]  # the three subplot positions
        for i, var in enumerate(self.independent_vars):
            print(f"Plotting the partial dependence for {var}...")
            row, col = positions[i]
            ax = fig.add_subplot(gs[row, col])
            # Index of the current variable in X
            var_idx = X.columns.get_loc(var)
            # Feature values and the corresponding SHAP values
            var_values = X_sample[var].values
            var_shap_values = shap_values[:, var_idx]
            # Scatter plot of feature value vs. SHAP value
            ax.scatter(
                var_values,
                var_shap_values,
                c=self.colors[var],
                alpha=0.6,
                s=30,
                edgecolors='white',
                linewidth=0.5
            )
            # 🎯 Build the trend line according to the chosen smoothing level
            if len(var_values) > 10:
                # Sort the points by feature value
                sort_idx = np.argsort(var_values)
                sorted_x = var_values[sort_idx]
                sorted_y = var_shap_values[sort_idx]
                if self.smoothing_level == 'high':
                    # Heavy smoothing: large-window moving average plus a second pass
                    window_size = max(80, len(sorted_x) // 6)
                    smoothed_y = pd.Series(sorted_y).rolling(
                        window=window_size, center=True, min_periods=1
                    ).mean()
                    # Second smoothing pass
                    window_size2 = max(40, len(sorted_x) // 12)
                    smoothed_y = smoothed_y.rolling(
                        window=window_size2, center=True, min_periods=1
                    ).mean()
                elif self.smoothing_level == 'medium':
                    # Moderate smoothing: mid-sized moving-average window
                    window_size = max(40, len(sorted_x) // 10)
                    smoothed_y = pd.Series(sorted_y).rolling(
                        window=window_size, center=True, min_periods=1
                    ).mean()
                else:  # low smoothing
                    # Light smoothing: small moving-average window
                    window_size = max(15, len(sorted_x) // 20)
                    smoothed_y = pd.Series(sorted_y).rolling(
                        window=window_size, center=True, min_periods=1
                    ).mean()
                # Draw the smoothed trend line
                ax.plot(sorted_x, smoothed_y,
                        color='darkred', linewidth=4, alpha=0.9,
                        label=f'{self.smoothing_level}-smoothed trend', zorder=5)
                # Optional: add a piecewise-linear trend (high smoothing mode only)
                if self.smoothing_level == 'high' and len(var_values) > 100:
                    # Three-segment linear fit to show the broad trend
                    n_segments = 3
                    segment_size = len(sorted_x) // n_segments
                    for seg in range(n_segments):
                        start_idx = seg * segment_size
                        end_idx = (seg + 1) * segment_size if seg < n_segments - 1 else len(sorted_x)
                        if end_idx - start_idx > 10:
                            seg_x = sorted_x[start_idx:end_idx]
                            seg_y = sorted_y[start_idx:end_idx]
                            # Linear fit for this segment
                            z = np.polyfit(seg_x, seg_y, 1)
                            p = np.poly1d(z)
                            ax.plot(seg_x, p(seg_x),
                                    color='blue', linewidth=2, alpha=0.6,
                                    linestyle='--', zorder=3)
            # Reference line at SHAP value = 0
            ax.axhline(y=0, color='black', linestyle='--', alpha=0.7, linewidth=1)
            # Titles and axis labels
            ax.set_title(f'SHAP partial dependence effect of {var} on {self.dependent_var}',
                         fontsize=14, fontweight='bold', pad=20)
            ax.set_xlabel(f'{var} value', fontsize=12)
            ax.set_ylabel(f'SHAP value (contribution to {self.dependent_var})', fontsize=12)
            # Styling
            ax.grid(True, alpha=0.3, linestyle='-', color='lightgray')
            ax.tick_params(axis='both', which='major', labelsize=10)
            # Annotate with the Pearson correlation between the feature and its SHAP values
            correlation = np.corrcoef(var_values, var_shap_values)[0, 1]
            ax.text(0.05, 0.95, f'Correlation: {correlation:.3f}',
                    transform=ax.transAxes, fontsize=10,
                    bbox=dict(boxstyle='round,pad=0.3', facecolor='lightblue', alpha=0.7))
            # Limit the axes to the 5th-95th percentile range (plus a margin) to highlight the bulk of the data
x_range = np.percentile(var_values, [5, 95])
y_range = np.percentile(var_shap_values, [5, 95])
margin_x = (x_range[1] - x_range[0]) * 0.1
margin_y = (y_range[1] - y_range[0]) * 0.1
ax.set_xlim(x_range[0] - margin_x, x_range[1] + margin_x)
ax.set_ylim(y_range[0] - margin_y, y_range[1] + margin_y)
        # Overall figure title
        fig.suptitle(f'XGBoost + SHAP partial dependence analysis: effect of the three independent variables on {self.dependent_var}\n' +
                     f'Samples: {len(df)}, model R²: {r2_score(y, model.predict(X)):.3f}',
                     fontsize=16, fontweight='bold', y=0.98)
        # Explanatory caption
        fig.text(0.5, 0.02,
                 'A SHAP value is the contribution of that variable to the prediction: positive values raise the predicted value, negative values lower it\n' +
                 'The trend line shows the (possibly nonlinear) relationship between the variable and its SHAP values',
                 ha='center', fontsize=11,
                 bbox=dict(boxstyle='round,pad=0.5', facecolor='lightyellow', alpha=0.8))
        # Save the figure
        plt.tight_layout(rect=[0, 0.06, 1, 0.96])
        if save_path:
            try:
                plt.savefig(save_path, dpi=300, bbox_inches='tight',
                            facecolor='white', edgecolor='none')
                print(f"\nFigure saved to: {save_path}")
            except Exception as e:
                print(f"Failed to save the figure: {e}")
        plt.show()
        return fig, model, feature_importance
    def generate_summary_report(self, df, model, feature_importance):
        """
        Generate a summary report of the analysis.
        """
        print("\n" + "=" * 60)
        print("SHAP Partial Dependence Analysis - Summary Report")
        print("=" * 60)
        X = df[self.independent_vars + self.control_vars]
        y = df[self.dependent_var]
        # Overall model performance (evaluated on the full dataset)
        y_pred = model.predict(X)
        r2 = r2_score(y, y_pred)
        rmse = np.sqrt(mean_squared_error(y, y_pred))
        mae = mean_absolute_error(y, y_pred)
        print("\n1. Overall model performance:")
        print(f"   R² (coefficient of determination): {r2:.4f}")
        print(f"   RMSE (root mean squared error): {rmse:.4f}")
        print(f"   MAE (mean absolute error): {mae:.4f}")
        # Importance ranking of the independent variables
        print("\n2. Importance ranking of the independent variables:")
        independent_importance = feature_importance[
            feature_importance['Feature'].isin(self.independent_vars)
        ].copy()
        for i, (_, row) in enumerate(independent_importance.iterrows(), 1):
            print(f"   {i}. {row['Feature']}: {row['Importance']:.4f}")
        # Correlation analysis
        print(f"\n3. Linear correlation of each independent variable with {self.dependent_var}:")
        correlations = df[self.independent_vars + [self.dependent_var]].corr()[self.dependent_var]
        correlations = correlations.drop(self.dependent_var).sort_values(key=abs, ascending=False)
        for var in correlations.index:
            print(f"   {var}: {correlations[var]:+.4f}")
        # Basic data information
        print("\n4. Data overview:")
        print(f"   Total samples: {len(df)}")
        print(f"   Number of independent variables: {len(self.independent_vars)}")
        print(f"   Number of control variables: {len(self.control_vars)}")
        print(f"   Dependent variable range: [{df[self.dependent_var].min():.3f}, {df[self.dependent_var].max():.3f}]")
        print("\n" + "=" * 60)
def main():
    """
    Main entry point: runs the full analysis pipeline.
    """
    # 🎯 Adjust the fit and smoothing levels here!
    FITTING_LEVEL = 'medium'    # 👈 fit level: 'low', 'medium', 'high'
    SMOOTHING_LEVEL = 'medium'  # 👈 smoothing level: 'low', 'medium', 'high' ('high' recommended)
    # Initialize the analyzer
    analyzer = SHAPPartialDependenceAnalyzer(
        fitting_level=FITTING_LEVEL,
        smoothing_level=SMOOTHING_LEVEL
    )
    print("🔧 Current settings:")
    print(f"   Fit level: {FITTING_LEVEL}")
    print(f"   Smoothing level: {SMOOTHING_LEVEL}")
    print()
    print("📝 Smoothing levels:")
    print("   - low: keeps more local fluctuation (wigglier line)")
    print("   - medium: moderate smoothing (balanced)")
    print("   - high: heavy smoothing (straightest line, broad trend only) 👈 recommended if the trend line looks too wiggly")
    print()
    # Data file path
    file_path = r"F:\Data\表C.csv"
    # Load the data
    df = analyzer.load_and_prepare_data(file_path)
    if df is not None:
        # Create the SHAP partial dependence plots
        save_path = fr"F:\Data\SHAP偏依赖分析_3变量对ND_{FITTING_LEVEL}拟合_{SMOOTHING_LEVEL}平滑.png"
        fig, model, feature_importance = analyzer.create_shap_partial_dependence_plots(
            df, save_path=save_path
        )
        # Generate the summary report
        analyzer.generate_summary_report(df, model, feature_importance)
        print("\nAnalysis complete!")
        print(f"The partial dependence figure has been saved to: {save_path}")
    else:
        print("Failed to load the data - please check the file path and data format")
if __name__ == "__main__":
main()
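For reference, a minimal, self-contained sketch of the core technique the script relies on: fit an XGBoost regressor, compute SHAP values with shap.TreeExplainer, and scatter one feature's values against its SHAP values (the same quantity each subplot above displays). It uses synthetic data and illustrative column names, not the original dataset:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shap
import xgboost as xgb

# Synthetic data: three predictors with a known nonlinear relationship to y
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(500, 3)), columns=['Csf', 'Csq', 'Cst'])
y = 2 * X['Csf'] - X['Csq'] ** 2 + rng.normal(scale=0.1, size=500)

model = xgb.XGBRegressor(n_estimators=200, max_depth=4, learning_rate=0.05, random_state=0)
model.fit(X, y)

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)  # shape: (n_samples, n_features)

# SHAP dependence scatter for 'Csq': feature value vs. its SHAP contribution
j = X.columns.get_loc('Csq')
plt.scatter(X['Csq'], shap_values[:, j], s=10)
plt.axhline(0, linestyle='--', color='black')
plt.xlabel('Csq value')
plt.ylabel('SHAP value (contribution to the prediction)')
plt.show()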