import tensorflow as tf
tf.compat.v1.disable_v2_behavior() # Disable TensorFlow 2.x behavior so the code runs in TensorFlow 1.x mode
# NOTE(review): this presumably also turns eager execution off, so code paths
# that call Tensor/Variable.numpy() will raise NotImplementedError - confirm.
import matplotlib
# Select the Tk backend; this has to happen before pyplot is imported.
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
from sklearn.base import BaseEstimator
import shap
import itertools
# Configure fonts that can render CJK characters
matplotlib.rcParams['font.sans-serif'] = ['Microsoft YaHei', 'SimHei']
# Draw minus signs with the ASCII hyphen (presumably because the CJK fonts
# above may lack the Unicode minus glyph - confirm).
matplotlib.rcParams['axes.unicode_minus'] = False
class KerasEstimatorWrapper(BaseEstimator):
    """Enhanced Keras model wrapper that exposes a scikit-learn style interface.

    The wrapped model is expected to be trained already: ``fit`` is a no-op,
    ``predict`` returns flattened predictions and ``score`` returns R^2.
    Attributes that the wrapper does not define are forwarded to the model.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y):
        """Do nothing and return ``self``; a pre-trained model needs no training."""
        return self

    def predict(self, X):
        """Return the model predictions for ``X`` as a flat 1-D array."""
        return self.model.predict(X, verbose=0).flatten()

    def score(self, X, y):
        """Return the R^2 score of the predictions for ``X`` against ``y``.

        ``y`` may be any array-like (ndarray, list, pandas Series/DataFrame);
        it is flattened with ``np.ravel`` because only ndarrays provide
        ``.flatten()``.
        """
        y_pred = self.predict(X)
        return r2_score(np.ravel(y), y_pred)

    def _is_fitted(self):
        """Return True when the wrapped model has at least one trainable weight."""
        return len(self.model.trainable_weights) > 0

    def __getattr__(self, name):
        """Forward unknown, non-dunder attribute lookups to the wrapped model.

        Raises:
            AttributeError: for dunder names, or when ``model`` has not been
                set on the instance yet.
        """
        # Dunder probes (e.g. ``__deepcopy__`` looked up by ``copy``) must not
        # be answered by the wrapped model on behalf of the wrapper.
        if name.startswith('__') and name.endswith('__'):
            raise AttributeError(name)
        # Read ``model`` straight from the instance dict: going through
        # ``self.model`` would re-enter ``__getattr__`` and recurse forever
        # when the attribute is missing (e.g. while unpickling or copying).
        try:
            wrapped = self.__dict__['model']
        except KeyError:
            raise AttributeError(name) from None
        return getattr(wrapped, name)

    def __sklearn_is_fitted__(self):
        """Report the fitted state and print the result of the check."""
        print("Checking if model is fitted...")
        is_fitted = self._is_fitted()
        print(f"Model is fitted: {is_fitted}")
        return is_fitted
# Data loading and preprocessing
def load_and_preprocess_data(file_path):
    """Read the Excel sheet at ``file_path`` and z-score features and target.

    Returns:
        tuple: ``(X_scaled, y_scaled, scaler_X, scaler_y)`` - the standardized
        feature matrix, the standardized ``RV`` target as a single column,
        and the two fitted ``StandardScaler`` objects.
    """
    feature_columns = ['t1', 't2', 'G1', 'G2', 'DAl2O3', 'CI', 'CA', 'ZETA', 'JD']
    frame = pd.read_excel(file_path)
    features = frame[feature_columns]
    target = frame['RV']

    # Standardize (z-score) inputs and output with separate scalers.
    scaler_X, scaler_y = StandardScaler(), StandardScaler()
    X_scaled = scaler_X.fit_transform(features)
    # The target is reshaped to one column before it is passed to the scaler.
    target_column = target.values.reshape(-1, 1)
    y_scaled = scaler_y.fit_transform(target_column)

    return X_scaled, y_scaled, scaler_X, scaler_y
# SHAP analysis
def _first_output_shap_matrix(shap_values):
    """Return the 2-D (samples x features) SHAP matrix of the first model output."""
    # A list holds one array per model output.
    if isinstance(shap_values, list):
        return np.asarray(shap_values[0])
    matrix = np.asarray(shap_values)
    # NOTE(review): newer SHAP releases reportedly return a single ndarray with
    # a trailing output axis instead of a list - confirm for the installed version.
    if matrix.ndim == 3:
        return matrix[..., 0]
    return matrix


def shap_analysis(model, X_all, feature_names, original_X, max_samples=200):
    """Explain ``model`` with SHAP DeepExplainer and draw the standard plots.

    Args:
        model: Trained Keras model to explain.
        X_all: Standardized feature matrix. All rows serve as background data;
            the first ``max_samples`` rows are the ones being explained.
        feature_names: Feature names in column order of ``X_all``.
        original_X: Unscaled feature values (DataFrame or array) in the same
            row order as ``X_all``; used for the dependence-plot axes.
        max_samples: Number of leading rows to explain (default 200).

    Returns:
        pandas.DataFrame: columns ``Feature`` and ``Mean SHAP Value`` (mean
        absolute SHAP value per feature), unsorted.

    Side effects:
        Shows the importance, beeswarm, heatmap and waterfall plots and writes
        one ``<main>_vs_<interaction>.svg`` file per feature pair into the
        current working directory.
    """
    # Make sure the feature names are plain strings.
    feature_names = [str(name) for name in feature_names]
    X_sample = X_all[:max_samples]

    # The complete standardized data set is the background used to approximate
    # the model's baseline behavior (not just the explained subset).
    background = X_all
    # DeepExplainer is used instead of KernelExplainer.
    explainer = shap.DeepExplainer(model, background)
    shap_values = explainer.shap_values(X_sample)
    shap_matrix = _first_output_shap_matrix(shap_values)

    # Mean absolute SHAP value per feature = feature importance.
    # shap_df / importance_df can be written out with ``to_csv`` by the caller
    # to draw custom importance charts.
    shap_df = pd.DataFrame(shap_matrix, columns=feature_names)
    importance_df = pd.DataFrame({
        'Feature': shap_df.columns,
        'Mean SHAP Value': shap_df.abs().mean(),
    })

    # The Explanation object is assembled by hand, so its baseline has to come
    # from the explainer (``explainer.expected_value``).
    shap_values_exp = shap.Explanation(
        values=shap_matrix,
        data=X_sample,
        feature_names=feature_names,
        base_values=explainer.expected_value,
    )
    print(f"shap_values_exp.base_values: {shap_values_exp.base_values}")
    print(f"shap_values_exp.values: {shap_values_exp.values}")
    print(f"shap_values_exp类型: {type(shap_values_exp)}")
    print(f"shap_values_exp长度: {len(shap_values_exp)}")

    # Plot 1: feature-importance ranking (bar chart). SHAP values and feature
    # rows now describe the same samples.
    shap.summary_plot(shap_matrix, X_sample, feature_names=feature_names, plot_type="bar")

    # Plot 2: beeswarm (global explanation of the overall model structure).
    plt.figure()
    plt.title("SHAP Summary Plot")
    shap.plots.beeswarm(shap_values_exp, show=False)
    plt.gcf().set_size_inches(12, 10)  # size of the whole canvas
    plt.show()

    # Plot 3: heatmap.
    shap.plots.heatmap(shap_values_exp)

    # Plot 4: waterfall for a single sample (needs a one-row Explanation).
    plt.figure()
    plt.title("SHAP Waterfall Plot")
    shap.plots.waterfall(shap_values_exp[0], show=False)
    plt.gcf().set_size_inches(12, 10)  # size of the whole canvas
    plt.show()

    # Plot 5: dependence plots for every feature pair.
    # dependence_plot needs as many feature rows as SHAP rows, so cut the
    # unscaled data down to the explained samples.
    n_explained = len(shap_matrix)
    if hasattr(original_X, "iloc"):
        explained_X = original_X.iloc[:n_explained]
    else:
        explained_X = original_X[:n_explained]

    # Map feature names to column indices; indices are passed to SHAP.
    feature_to_idx = {name: idx for idx, name in enumerate(feature_names)}
    for main_feature, interaction_feature in itertools.combinations(feature_names, 2):
        shap.dependence_plot(
            feature_to_idx[main_feature],
            shap_matrix,
            explained_X,
            interaction_index=feature_to_idx[interaction_feature],
            show=False,
        )
        # Without an ``ax`` argument dependence_plot draws on a figure of its
        # own; grab that figure so it is the one that gets closed.
        fig = plt.gcf()
        plt.title(f"{main_feature} vs {interaction_feature}")
        plt.savefig(
            f"{main_feature}_vs_{interaction_feature}.svg",
            format="svg",
            bbox_inches="tight",
            dpi=1200,
        )
        plt.close(fig)

    return importance_df
# 7. Main entry point
def main():
    """Load the data and the saved final model, then run the SHAP analysis."""
    # Input workbook
    file_path = "dataAl-paixu.xlsx"
    feature_names = ['t1', 't2', 'G1', 'G2', 'DAl2O3', 'CI', 'CA', 'ZETA', 'JD']

    # Standardized data (X_all is the scaled feature matrix); the scaled target
    # and the fitted scalers are not needed for the SHAP analysis.
    X_all, _y_all, _scaler_X, _scaler_y = load_and_preprocess_data(file_path)
    # Unscaled feature values, used for the dependence-plot axes.
    original_X = pd.read_excel(file_path)[feature_names]

    # Load the saved final model for inference only. compile=False skips
    # restoring the optimizer/training configuration, which is not needed here.
    # NOTE(review): restoring that state is presumably what raises
    # "numpy() is only available when eager execution is enabled" once
    # tf.compat.v1.disable_v2_behavior() has switched eager execution off -
    # confirm against the installed TensorFlow/Keras version.
    model = load_model('final-bestmodel.h5', compile=False)
    print("Model loaded successfully.")
    print("Model layers:", model.layers)
    print("\n[模型结构摘要]")
    model.summary()

    # Run the analysis. (Garson sensitivity and partial-dependence analysis are
    # weaker alternatives and are intentionally not run here.)
    shap_analysis(model, X_all, feature_names, original_X)


if __name__ == "__main__":
    main()
这段代码报错:Traceback (most recent call last):
File "C:\Users\Administrator\Desktop\machine learning\Python BP ANN\explanation\BP ANN-shap-Al.py", line 185, in <module>
main()
File "C:\Users\Administrator\Desktop\machine learning\Python BP ANN\explanation\BP ANN-shap-Al.py", line 173, in main
model = load_model('final-bestmodel.h5')
File "D:\python3\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
raise e.with_traceback(filtered_tb) from None
File "C:\Users\Administrator\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\ops\resource_variable_ops.py", line 641, in numpy
"numpy() is only available when eager execution is enabled.")
NotImplementedError: numpy() is only available when eager execution is enabled. 如何解决?