
Code Reproduction
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Chapter 10: Interpretability Toolbox - runnable example code
- Contents: differentiable mask optimization, gradient attribution, surrogate linear model,
  counterfactual generation, and evaluation metrics (sufficiency / necessity / stability /
  significance / adversarial change)
- Comments and printed messages are kept descriptive for experimentation and teaching
- Dependencies: numpy, scipy, sklearn, torch, matplotlib

Example run:
    python Chapter10_Interpretability_Tools.py

Note: this file generates and uses synthetic data and a simple neural network model
to demonstrate the workflow of each method.
"""
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from copy import deepcopy
import time

# -----------------------------
# Global settings and utility functions
# -----------------------------
SEED = 12345
np.random.seed(SEED)
torch.manual_seed(SEED)


def set_seed(s=SEED):
    """Reset the numpy and torch random seeds."""
    np.random.seed(s)
    torch.manual_seed(s)


def sigmoid(x):
    """Logistic function used when generating synthetic labels."""
    return 1 / (1 + np.exp(-x))


def to_np(t):
    """Convert a torch tensor to a numpy array."""
    return t.detach().cpu().numpy()
# -----------------------------
# Data and model: synthetic classification task
# -----------------------------
def generate_synthetic_classification(n_samples=1000, n_features=20, informative=5):
    """Generate binary classification data; the first `informative` features are discriminative."""
    rng = np.random.default_rng(SEED)
    X = rng.normal(0, 1, (n_samples, n_features))
    # Linear separability plus a non-linear term
    w = np.zeros(n_features)
    w[:informative] = rng.normal(2.0, 0.5, size=informative)
    logits = X @ w + 0.5 * np.sin(X[:, 0])
    probs = sigmoid(logits)
    y = (rng.random(n_samples) < probs).astype(int)
    return X.astype(np.float32), y.astype(np.int64)
class SimpleMLP(nn.Module):
    """A small two-hidden-layer MLP classifier."""

    def __init__(self, n_features, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_features, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden // 2),
            nn.ReLU(),
            nn.Linear(hidden // 2, 2),
        )

    def forward(self, x):
        return self.net(x)
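

# A minimal sanity-check sketch (an illustrative addition, not part of the original chapter code):
# it only assumes the two definitions above and verifies that the synthetic data and an untrained
# SimpleMLP have compatible shapes. The helper name `sanity_check_shapes` is hypothetical.
def sanity_check_shapes(n_samples=16, n_features=20):
    """Hypothetical helper: forward a small synthetic batch through an untrained SimpleMLP."""
    X, y = generate_synthetic_classification(n_samples=n_samples, n_features=n_features)
    model = SimpleMLP(n_features=n_features)
    with torch.no_grad():
        logits = model(torch.tensor(X))
    # Expect (n_samples, 2) logits and one label per sample
    assert logits.shape == (n_samples, 2) and y.shape == (n_samples,)
    return logits.shape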
# -----------------------------
# 1. Differentiable mask optimization (learning a soft mask)
# -----------------------------
def optimize_soft_mask(model, x, baseline=None, lr=0.1, steps=200, l1=1e-2):
    """
    Optimize a differentiable mask m in [0, 1]^d for a single input x.
    Objective: keep the model output close to the original prediction while
    encouraging the mask to be sparse.
    Returns: the soft mask and a binarized mask.
    Note: similarity is enforced at the level of the predicted probability.
    """
    model.eval()
    x_t = torch.tensor(x, dtype=torch.float32).unsqueeze(0)
    if baseline is None:
        baseline = torch.zeros_like(x_t)
    # Mask parameters live in logit space
    d = x.shape[0]
    m_param = torch.zeros(d, requires_grad=True)
    optimizer = optim.Adam([m_param], lr=lr)
    with torch.no_grad():
        y_orig_logits = model(x_t)
        y_orig_prob = torch.softmax(y_orig_logits, dim=-1)[0, 1].item()
    for step in range(steps):
        optimizer.zero_grad()
        m = torch.sigmoid(m_param)
        xm = x_t * m + baseline * (1 - m)
        logits = model(xm)
        prob = torch.softmax(logits, dim=-1)[0, 1]
        # Keep the masked prediction close to the original probability
        loss_task = (prob - y_orig_prob) ** 2
        # Sparsity penalty on the mask
        loss_reg = l1 * torch.sum(m)
        loss = loss_task + loss_reg
        loss.backward()
        optimizer.step()
        if step % 50 == 0:
            print(f"[mask optimization] step={step}, loss_task={loss_task.item():.6e}, mask_sum={m.sum().item():.4f}")
    m_final = torch.sigmoid(m_param).detach().cpu().numpy()
    m_bin = (m_final >= 0.5).astype(int)
    return m_final, m_bin
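

# A small hedged helper (an illustrative addition, not from the original code): instead of the
# fixed 0.5 threshold used above, keep exactly the k largest soft-mask entries. The name
# `topk_binarize` and the default k are assumptions.
def topk_binarize(m_soft, k=5):
    """Hypothetical helper: binary mask keeping the k features with the largest soft-mask values."""
    idx = np.argsort(-m_soft)[:k]
    m_bin = np.zeros_like(m_soft, dtype=int)
    m_bin[idx] = 1
    return m_bin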
# -----------------------------
# 2. Gradient / sensitivity attribution (gradient-based saliency)
# -----------------------------
def gradient_saliency(model, x, target_class=1):
    """Compute gradient * input as the attribution score."""
    model.eval()
    x_t = torch.tensor(x, dtype=torch.float32).unsqueeze(0)
    x_t.requires_grad_()
    logits = model(x_t)
    prob = torch.softmax(logits, dim=-1)[0, target_class]
    prob.backward()
    grad = x_t.grad.detach().cpu().numpy()[0]
    saliency = grad * x
    return saliency
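

# A hedged comparison helper (an illustrative addition, not from the original chapter):
# explanations produced by different methods, e.g. the soft mask and gradient * input above,
# are often compared by the overlap of their top-k features. The name `topk_overlap` and the
# default k are assumptions.
def topk_overlap(attr_a, attr_b, k=5):
    """Hypothetical helper: fraction of shared indices among the top-k features of two attribution vectors."""
    top_a = set(np.argsort(-np.abs(attr_a))[:k])
    top_b = set(np.argsort(-np.abs(attr_b))[:k])
    return len(top_a & top_b) / k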
# -----------------------------
# 3. Surrogate linear model (surrogate explanation)
# -----------------------------
def surrogate_linear_model(f_model, X, n_samples=500, alpha=1.0):
    """Fit a linear surrogate (ridge regression) on a random subsample of the inputs; return coefficients, intercept, and the fitted model."""
    rng = np.random.default_rng(SEED)
    idx = rng.choice(len(X), size=min(n_samples, len(X)), replace=False)
    X_sub = X[idx]
    with torch.no_grad():
        X_tensor = torch.tensor(X_sub, dtype=torch.float32)
        logits = f_model(X_tensor)
        probs = torch.softmax(logits, dim=-1)[:, 1].cpu().numpy()
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_sub, probs)
    return ridge.coef_, ridge.intercept_, ridge
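

# A hedged fidelity check (an illustrative addition; `surrogate_fidelity` is not in the original
# code): report the surrogate's R^2 against the network's own class-1 probabilities on fresh
# points, which quantifies how faithfully the linear model mimics the network.
def surrogate_fidelity(ridge, f_model, X_eval):
    """Hypothetical helper: R^2 of the fitted ridge surrogate against the model's class-1 probabilities."""
    with torch.no_grad():
        probs = torch.softmax(f_model(torch.tensor(X_eval, dtype=torch.float32)), dim=-1)[:, 1].cpu().numpy()
    return ridge.score(X_eval, probs)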
# -----------------------------
# 4. Counterfactual generation (counterfactual via optimization)
# -----------------------------
def generate_counterfactual(model, x, target_label=1, lr=0.05, steps=200, lambda_norm=0.01):
    """Minimize classification loss plus a weighted L2 distance to obtain a counterfactual x'."""
    model.eval()
    x_orig = torch.tensor(x, dtype=torch.float32)
    x_var = x_orig.clone().unsqueeze(0).requires_grad_()
    opt = optim.Adam([x_var], lr=lr)
    target = torch.tensor([target_label])
    for step in range(steps):
        opt.zero_grad()
        logits = model(x_var)
        loss_cls = nn.CrossEntropyLoss()(logits, target)
        loss_dist = lambda_norm * torch.sum((x_var - x_orig) ** 2)
        loss = loss_dist + loss_cls
        loss.backward()
        opt.step()
        if step % 50 == 0:
            with torch.no_grad():
                pred = torch.argmax(model(x_var), dim=-1).item()
            print(f"[counterfactual] step={step}, loss_cls={loss_cls.item():.6e}, pred_label={pred}")
    return x_var.detach().cpu().numpy()[0]
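

# A hedged inspection helper (an illustrative addition, not from the original code): listing the
# features with the largest absolute change between x and its counterfactual is a common way to
# read the counterfactual as an explanation. The name `counterfactual_deltas` is an assumption.
def counterfactual_deltas(x, x_cf, top=5):
    """Hypothetical helper: indices and signed changes of the `top` most-changed features."""
    delta = x_cf - x
    idx = np.argsort(-np.abs(delta))[:top]
    return [(int(i), float(delta[i])) for i in idx]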
# -----------------------------
# 5. Evaluation metrics
# -----------------------------
def sufficiency_metric(model, x, mask, baseline=None, k=None):
    """Fidelity / sufficiency: prediction shift when only the features indicated by the mask are kept."""
    model.eval()
    x_t = torch.tensor(x, dtype=torch.float32)
    if baseline is None:
        baseline = torch.zeros_like(x_t)
    if k is not None:
        # Keep only the indices of the top-k features
        idx = np.argsort(-mask)[:k]
        m = np.zeros_like(mask)
        m[idx] = 1
    else:
        m = mask
    m_t = torch.tensor(m, dtype=torch.float32)
    xm = x_t * m_t + baseline * (1 - m_t)
    with torch.no_grad():
        p_orig = torch.softmax(model(x_t.unsqueeze(0)), dim=-1)[0, 1].item()
        p_mask = torch.softmax(model(xm.unsqueeze(0)), dim=-1)[0, 1].item()
    return abs(p_orig - p_mask)


def necessity_metric(model, x, mask, baseline=None, k=None):
    """Necessity: prediction shift when the features deemed important are removed."""
    model.eval()
    x_t = torch.tensor(x, dtype=torch.float32)
    if baseline is None:
        baseline = torch.zeros_like(x_t)
    if k is not None:
        idx = np.argsort(-mask)[:k]
        m = np.ones_like(mask)
        m[idx] = 0
    else:
        m = 1 - mask
    m_t = torch.tensor(m, dtype=torch.float32)
    xm = x_t * m_t + baseline * (1 - m_t)
    with torch.no_grad():
        p_orig = torch.softmax(model(x_t.unsqueeze(0)), dim=-1)[0, 1].item()
        p_mask = torch.softmax(model(xm.unsqueeze(0)), dim=-1)[0, 1].item()
    return abs(p_orig - p_mask)
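

# A hedged sweep sketch (an illustrative addition): sufficiency and necessity are usually
# reported as curves over k rather than at a single threshold. `metric_curve` and the default
# k grid are assumptions; it simply reuses the two metrics above via their k argument.
def metric_curve(model, x, mask, ks=(1, 2, 3, 5, 8, 10)):
    """Hypothetical helper: (k, sufficiency, necessity) triples for a range of top-k sizes."""
    return [(k,
             sufficiency_metric(model, x, mask, k=k),
             necessity_metric(model, x, mask, k=k)) for k in ks]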
def stability_metric(explainer_func, model, x, n_pert=20, noise_scale=1e-2):
    """Stability / robustness: perturb the input slightly and measure how much the explanation changes."""
    set_seed()
    s0 = explainer_func(model, x)
    S = []
    for _ in range(n_pert):
        x_pert = x + np.random.normal(0, noise_scale, size=x.shape)
        S.append(explainer_func(model, x_pert))
    S = np.stack(S)
    # Average relative L1 difference, normalized by the norm of the unperturbed explanation
    s0n = np.linalg.norm(s0, ord=1) + 1e-12
    diffs = np.linalg.norm(S - s0[None, :], ord=1, axis=1) / s0n
    return diffs.mean()
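

# A hedged comparison sketch (an illustrative addition): the same stability metric can be used
# to rank explainers; smaller values mean the explanation reacts less to tiny input
# perturbations. `compare_stability` and the explainer dict are assumptions, and each explainer
# must have the same (model, x) -> vector signature as gradient_saliency above.
def compare_stability(model, x, explainers, n_pert=20, noise_scale=1e-3):
    """Hypothetical helper: map explainer name -> stability score for a dict of explainer functions."""
    return {name: stability_metric(fn, model, x, n_pert=n_pert, noise_scale=noise_scale)
            for name, fn in explainers.items()}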
def permutation_significance(explainer_func, model, X, x_idx=0, n_perm=100):
    """Permutation test: resample each feature's value from the dataset and estimate a per-feature p-value."""
    set_seed()
    x = X[x_idx].copy()
    s_obs = explainer_func(model, x)
    pvals = np.ones_like(s_obs)
    for j in range(len(s_obs)):
        greater = 0
        for _ in range(n_perm):
            x_perm = x.copy()
            # Replace feature j with a value drawn at random from the dataset column
            x_perm[j] = np.random.permutation(X[:, j])[0]
            s_perm = explainer_func(model, x_perm)
            if abs(s_perm[j]) >= abs(s_obs[j]):
                greater += 1
        pvals[j] = (greater + 1) / (n_perm + 1)
    return pvals
def adversarial_explain_change(explainer_func, model, x, budget=0.1):
    """Search for a small perturbation (within the budget) that maximally changes the explanation.

    Approximation: the perturbation follows the sign of the gradient of the class-1 probability
    with respect to the input; both signs of that direction are tried and the one causing the
    larger L1 change in the explanation is returned.
    """
    base_exp = explainer_func(model, x)
    best_delta = None
    best_change = 0.0
    for sign in (+1, -1):
        # Perturbation direction from the gradient of the model output w.r.t. the input
        x_var = torch.tensor(x, dtype=torch.float32).unsqueeze(0).requires_grad_()
        logits = model(x_var)
        score = torch.softmax(logits, dim=-1)[0, 1]
        score.backward()
        grad = x_var.grad.detach().cpu().numpy()[0]
        delta = sign * budget * np.sign(grad)
        new_exp = explainer_func(model, x + delta)
        change = np.linalg.norm(new_exp - base_exp, ord=1)
        if change > best_change:
            best_change = change
            best_delta = delta
    return best_delta, best_change
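

# A hedged follow-up check (an illustrative addition): after finding `best_delta`, one can
# verify whether the perturbation actually reshuffles the top-k features of the explanation,
# which is the property of interest for adversarial explanation attacks. The name
# `topk_set_changed` and the default k are assumptions.
def topk_set_changed(explainer_func, model, x, delta, k=4):
    """Hypothetical helper: True if the top-k feature set of the explanation differs after adding delta."""
    before = set(np.argsort(-np.abs(explainer_func(model, x)))[:k])
    after = set(np.argsort(-np.abs(explainer_func(model, x + delta)))[:k])
    return before != after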
# -----------------------------
# 6. Demo workflow: train the model, generate explanations, evaluate
# -----------------------------
def demo_all():
    print('Starting the interpretability toolbox demo')
    # Data
    X, y = generate_synthetic_classification(n_samples=1000, n_features=20, informative=5)
    X_train, X_test = X[:800], X[800:]
    y_train, y_test = y[:800], y[800:]
    # Train the model
    model = SimpleMLP(n_features=X.shape[1])
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()
    ds = TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
    loader = DataLoader(ds, batch_size=64, shuffle=True)
    print('Training a simple neural network model...')
    for epoch in range(10):
        model.train()
        running_loss = 0.0
        for xb, yb in loader:
            optimizer.zero_grad()
            logits = model(xb.float())
            loss = criterion(logits, yb)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        if (epoch + 1) % 2 == 0:
            print(f'  epoch {epoch + 1}, average loss={running_loss / len(loader):.6f}')
    model.eval()
    with torch.no_grad():
        preds = torch.argmax(model(torch.tensor(X_test).float()), dim=-1).numpy()
    print('Test accuracy =', accuracy_score(y_test, preds))
    # Pick one test sample
    x0 = X_test[0]
    print('\nFirst 8 feature values of sample x0:', x0[:8])
    # 1) Differentiable mask optimization
    print('\nRunning differentiable mask optimization...')
    m_soft, m_bin = optimize_soft_mask(model, x0, lr=0.2, steps=300, l1=1e-2)
    print('First 8 soft mask values =', np.round(m_soft[:8], 3))
    print('First 8 binary mask values =', m_bin[:8])
    # 2) Gradient attribution
    print('\nComputing gradient attribution (gradient * input)...')
    grad_sal = gradient_saliency(model, x0)
    print('First 8 gradient attributions =', np.round(grad_sal[:8], 4))
    # 3) Surrogate linear model
    print('\nFitting a linear surrogate model (Ridge)...')
    coef, intercept, ridge = surrogate_linear_model(model, X_train, n_samples=400)
    print('First 8 surrogate coefficients =', np.round(coef[:8], 4))
    # 4) Counterfactual generation
    print('\nGenerating a counterfactual example (flip the target class)...')
    orig_label = int(torch.argmax(model(torch.tensor(x0).float().unsqueeze(0))).item())
    target_label = 1 - orig_label
    x_cf = generate_counterfactual(model, x0, target_label=target_label, lr=0.05, steps=300, lambda_norm=0.001)
    with torch.no_grad():
        pred_orig = int(torch.argmax(model(torch.tensor(x0).float().unsqueeze(0))).item())
        pred_cf = int(torch.argmax(model(torch.tensor(x_cf).float().unsqueeze(0))).item())
    print(f'Original prediction={pred_orig}, counterfactual prediction={pred_cf}')
    # 5) Evaluation metrics
    print('\nComputing sufficiency and necessity metrics...')
    suff = sufficiency_metric(model, x0, m_soft)
    nec = necessity_metric(model, x0, m_soft)
    print(f'Sufficiency gap={suff:.6f}, necessity gap={nec:.6f}')
    print('\nComputing the stability metric (based on gradient attribution)...')
    stab = stability_metric(gradient_saliency, model, x0, n_pert=30, noise_scale=1e-3)
    print(f'Mean relative attribution difference={stab:.6f}')
    print('\nEstimating p-values with a permutation test (first 10 features)...')
    pvals = permutation_significance(gradient_saliency, model, X_test, x_idx=0, n_perm=200)
    print('First 10 p-values =', np.round(pvals[:10], 4))
    print('\nAdversarial explanation-change test (approximate)...')
    delta, change = adversarial_explain_change(gradient_saliency, model, x0, budget=0.1)
    print(f'Best perturbation found: change={change:.6f}, first 8 delta values={np.round(delta[:8], 4)}')
    # Visualization: original input, counterfactual, and mask coverage
    plt.figure(figsize=(10, 6))
    plt.subplot(3, 1, 1)
    plt.plot(x0, label='original input')
    plt.title('Original input features')
    plt.legend()
    plt.subplot(3, 1, 2)
    plt.plot(x_cf, label='counterfactual input')
    plt.title('Counterfactual input features')
    plt.legend()
    plt.subplot(3, 1, 3)
    plt.bar(np.arange(len(m_soft)), m_soft)
    plt.title('Differentiable mask values (soft mask)')
    plt.tight_layout()
    plt.show()
    print('\nDemo finished.')


if __name__ == '__main__':
    demo_all()
