A Comprehensive Testing Plan for AI Models
Contents
- Introduction
- Test Environment Setup
- Dataset Preparation and Preprocessing
- Basic Model Tests
- Model Performance Tests
- Model Robustness Tests
- Model Fairness and Bias Tests
- Model Interpretability Tests
- Model Security Tests
- End-to-End System Tests
- Test Automation Framework
- Scoring Criteria and Evaluation System
- Test Reports and Visualization
- Continuous Testing and Monitoring
- Conclusion and Outlook
- Appendix
1. Introduction
1.1 Why AI Model Testing Matters
With the rapid development of artificial intelligence, AI models are now deployed across a wide range of domains. Before a model goes into production, however, it needs thorough testing to ensure its reliability, safety, and fairness. Insufficient testing can lead to poor behavior in production and, in the worst case, serious ethical and social harm.
1.2 Overview of the Plan
This plan provides a complete testing framework for AI models, covering everything from basic functionality to advanced properties. It is implemented in Python, combines a number of testing tools and libraries, and applies to most machine learning models.
1.3 Scope
The plan covers the following testing dimensions (a minimal example of the test conventions used throughout follows this list):
- Basic functional testing
- Performance testing
- Robustness testing
- Fairness and bias testing
- Interpretability testing
- Security testing
- End-to-end system testing
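Every test in this document follows standard pytest conventions: a fixture builds a model on synthetic data, and a test function makes plain assertions against it. The sketch below is a hypothetical smoke test illustrating that shape; the module path and model choice are examples only, not part of the plan itself.
# tests/unit/test_smoke.py -- hypothetical smoke test showing the conventions
import pytest
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

@pytest.fixture
def tiny_model():
    # Build a small model on synthetic data, as the later fixtures do
    X, y = make_classification(n_samples=200, n_features=5, random_state=42)
    model = RandomForestClassifier(n_estimators=10, random_state=42)
    model.fit(X, y)
    return model, X, y

def test_model_beats_chance(tiny_model):
    model, X, y = tiny_model
    assert model.score(X, y) > 0.5  # sanity check only

# Run with: pytest tests/unit/test_smoke.py -v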
2. Test Environment Setup
2.1 Hardware Requirements
# Environment check script
import torch
import tensorflow as tf
import psutil

def check_environment():
    # Check GPU availability
    print(f"PyTorch GPU available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"PyTorch GPU device count: {torch.cuda.device_count()}")
        print(f"Current PyTorch GPU: {torch.cuda.current_device()}")
        print(f"PyTorch GPU name: {torch.cuda.get_device_name(0)}")
    # TensorFlow GPU check
    print(f"TensorFlow GPU available: {len(tf.config.list_physical_devices('GPU')) > 0}")
    if len(tf.config.list_physical_devices('GPU')) > 0:
        print(f"TensorFlow GPU devices: {tf.config.list_physical_devices('GPU')}")
    # Memory information
    mem = psutil.virtual_memory()
    print(f"Total memory: {mem.total / (1024**3):.2f} GB")
    print(f"Available memory: {mem.available / (1024**3):.2f} GB")

check_environment()
2.2 Software Dependencies
# requirements.txt
numpy>=1.21.0
pandas>=1.3.0
scikit-learn>=1.0.0
tensorflow>=2.6.0
torch>=1.10.0
pytest>=6.2.5
pytest-benchmark>=3.4.1
pytest-mock>=3.6.0
alibi>=0.6.0
shap>=0.40.0
lime>=0.2.0.1
adversarial-robustness-toolbox>=1.10.0
fairlearn>=0.7.0
mlflow>=1.20.2
great-expectations>=0.13.0
albumentations>=1.0.0
psutil>=5.8.0
dalex>=1.4.0
scipy>=1.7.0
requests>=2.26.0
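Assuming the file above sits at the project root, the dependencies install in one step (a GPU environment may need CUDA-matched builds of torch and tensorflow instead of the defaults):
pip install -r requirements.txt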
2.3 测试目录结构
ai_model_testing/
│── data/
│ ├── raw/ # 原始数据
│ ├── processed/ # 处理后的数据
│ └── test_sets/ # 测试数据集
├── models/ # 模型文件
├── tests/ # 测试代码
│ ├── unit/ # 单元测试
│ ├── integration/ # 集成测试
│ ├── performance/ # 性能测试
│ └── system/ # 系统测试
├── docs/ # 文档
├── reports/ # 测试报告
└── config/ # 配置文件
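Shared fixtures naturally live in a conftest.py at the root of tests/. The sketch below is an assumption about how such a file might look for this layout; the fixture names and the model file name are hypothetical, not prescribed by the plan.
# tests/conftest.py -- hypothetical shared fixtures for the layout above
import pathlib
import pytest
import joblib

PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]

@pytest.fixture(scope="session")
def test_data_dir():
    # Points every test at data/test_sets/ without hard-coded paths
    return PROJECT_ROOT / "data" / "test_sets"

@pytest.fixture(scope="session")
def production_model():
    # Loads a serialized model from models/; the file name is an example only
    return joblib.load(PROJECT_ROOT / "models" / "model.joblib")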
3. Dataset Preparation and Preprocessing
3.1 Data Quality Tests
import json
import great_expectations as ge
import pandas as pd

def test_data_quality(data_path: str, expectations_path: str):
    """
    Validate data quality with Great Expectations.
    :param data_path: path to the data file
    :param expectations_path: path to the expectation suite (JSON)
    """
    # Load the data
    df = pd.read_csv(data_path)
    ge_df = ge.from_pandas(df)
    # Load the expectation suite (serialized as JSON)
    with open(expectations_path) as f:
        expectation_suite = json.load(f)
    # Run validation
    validation_result = ge_df.validate(
        expectation_suite=expectation_suite,
        run_id=f"validation_{pd.Timestamp.now().isoformat()}"
    )
    # Report results
    if validation_result["success"]:
        print("Data quality validation passed")
    else:
        print("Data quality validation failed:")
        for result in validation_result["results"]:
            if not result["success"]:
                kwargs = result["expectation_config"]["kwargs"]
                # Table-level expectations have no "column" kwarg
                print(f"- {kwargs.get('column', '<table-level>')}: "
                      f"{result['expectation_config']['expectation_type']} "
                      f"failed, details: {result['result']}")
    return validation_result
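A minimal call might look as follows; the file paths are hypothetical and follow the directory layout from section 2.3:
result = test_data_quality(
    data_path="data/processed/train.csv",          # hypothetical path
    expectations_path="config/expectations.json",  # hypothetical path
)
print(f"Validation passed: {result['success']}")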
3.2 Data Splitting Strategy
from sklearn.model_selection import train_test_split
import numpy as np

def split_data_with_stratification(
    features: np.ndarray,
    labels: np.ndarray,
    test_size: float = 0.2,
    val_size: float = 0.1,
    random_state: int = 42,
    stratify: bool = True
):
    """
    Stratified split into training, validation, and test sets.
    :param features: feature matrix
    :param labels: label array
    :param test_size: fraction of the whole dataset used for the test set
    :param val_size: fraction of the whole dataset used for validation
                     (rescaled internally relative to the remaining data)
    :param random_state: random seed
    :param stratify: whether to use stratified sampling
    :return: X_train, X_val, X_test, y_train, y_val, y_test
    """
    stratify_arg = labels if stratify else None
    # Split off the test set first
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify_arg
    )
    # Rescale the validation fraction relative to the remaining data
    val_size_adjusted = val_size / (1 - test_size)
    # Split the remainder into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train,
        test_size=val_size_adjusted,
        random_state=random_state,
        stratify=y_train if stratify else None
    )
    return X_train, X_val, X_test, y_train, y_val, y_test
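With the defaults (test_size=0.2, val_size=0.1) the function produces a 70/10/20 train/validation/test split. A quick usage sketch on synthetic data:
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_val, X_test, y_train, y_val, y_test = split_data_with_stratification(X, y)
print(len(X_train), len(X_val), len(X_test))  # 700 100 200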
3.3 Data Augmentation Tests
import random
import numpy as np
import pytest

class TestDataAugmentation:
    """Tests for the data augmentation pipeline"""
    @pytest.fixture
    def sample_image(self):
        # Generate a test image
        return np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8)

    def test_augmentation_consistency(self, sample_image):
        """The augmented image should keep the basic properties of the input"""
        from albumentations import (
            Compose, HorizontalFlip, RandomBrightnessContrast
        )
        # p=1.0 so both transforms always fire and the output is guaranteed to differ
        augmenter = Compose([
            HorizontalFlip(p=1.0),
            RandomBrightnessContrast(p=1.0)
        ])
        augmented = augmenter(image=sample_image)["image"]
        # Basic assertions
        assert augmented.shape == sample_image.shape
        assert augmented.dtype == sample_image.dtype
        assert not np.array_equal(augmented, sample_image)
        # Pixel range check
        assert augmented.min() >= 0
        assert augmented.max() <= 255

    def test_augmentation_reproducibility(self, sample_image):
        """Augmentation should be reproducible under a fixed seed"""
        from albumentations import Compose, Rotate
        augmenter = Compose([Rotate(limit=45, p=1.0)])
        # Albumentations draws from Python's and NumPy's global RNGs, so
        # seeding them before each call makes the result reproducible
        random.seed(42); np.random.seed(42)
        augmented_1 = augmenter(image=sample_image)["image"]
        random.seed(42); np.random.seed(42)
        augmented_2 = augmenter(image=sample_image)["image"]
        assert np.array_equal(augmented_1, augmented_2)
        # A different seed should produce a different result
        random.seed(24); np.random.seed(24)
        augmented_3 = augmenter(image=sample_image)["image"]
        assert not np.array_equal(augmented_1, augmented_3)

    def test_augmentation_pipeline(self, mocker):
        """The augmentation pipeline should be called with the right arguments"""
        mock_augmenter = mocker.MagicMock()
        mock_augmenter.side_effect = lambda **kwargs: {
            "image": kwargs["image"] + 10,
            "mask": kwargs["mask"] * 2
        }
        image = np.zeros((10, 10), dtype=np.uint8)
        mask = np.ones((10, 10), dtype=np.uint8)
        result = mock_augmenter(image=image, mask=mask)
        # Verify call arguments
        mock_augmenter.assert_called_once_with(image=image, mask=mask)
        # Verify returned values
        assert np.array_equal(result["image"], image + 10)
        assert np.array_equal(result["mask"], mask * 2)
4. Basic Model Tests
4.1 Model Training Tests
import pytest
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

class TestModelTraining:
    """Tests for model training"""
    @pytest.fixture
    def sample_data(self):
        # Generate test data
        X, y = make_classification(
            n_samples=1000,
            n_features=20,
            n_classes=2,
            random_state=42
        )
        return X, y

    def test_training_convergence(self, sample_data):
        """The model should reach a reasonable score"""
        X, y = sample_data
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        # Evaluate with cross-validation
        scores = cross_val_score(model, X, y, cv=5)
        # Check accuracy
        assert scores.mean() > 0.8
        assert all(score > 0.75 for score in scores)

    def test_training_reproducibility(self, sample_data):
        """Training should be reproducible under a fixed seed"""
        X, y = sample_data
        # Train the first model
        model1 = RandomForestClassifier(n_estimators=10, random_state=42)
        model1.fit(X, y)
        # Train a second model with the same seed
        model2 = RandomForestClassifier(n_estimators=10, random_state=42)
        model2.fit(X, y)
        # Predictions should match
        pred1 = model1.predict(X[:10])
        pred2 = model2.predict(X[:10])
        assert np.array_equal(pred1, pred2)
        # A different seed should change the ensemble; compare probabilities,
        # since hard labels on training samples are usually identical anyway
        model3 = RandomForestClassifier(n_estimators=10, random_state=24)
        model3.fit(X, y)
        proba1 = model1.predict_proba(X[:10])
        proba3 = model3.predict_proba(X[:10])
        assert not np.allclose(proba1, proba3)

    def test_feature_importance(self, sample_data):
        """Feature importances should be well-formed"""
        X, y = sample_data
        model = RandomForestClassifier(n_estimators=50, random_state=42)
        model.fit(X, y)
        importances = model.feature_importances_
        # Basic assertions
        assert len(importances) == X.shape[1]
        assert all(imp >= 0 for imp in importances)
        assert sum(importances) == pytest.approx(1.0, abs=1e-6)
        # At least one feature should matter
        assert max(importances) > 0
4.2 Model Inference Tests
import pytest
import numpy as np
from unittest.mock import MagicMock

class TestModelInference:
    """Tests for model inference"""
    @pytest.fixture
    def trained_model(self):
        # Mock a trained model; side_effect makes the output size follow the
        # input size, which a fixed return_value would not
        model = MagicMock()
        model.predict.side_effect = lambda X: np.arange(len(X)) % 2
        model.predict_proba.side_effect = lambda X: np.column_stack([
            np.full(len(X), 0.7),
            np.full(len(X), 0.3)
        ])
        return model

    def test_predict_output_shape(self, trained_model):
        """Prediction output shapes"""
        X_test = np.random.rand(5, 10)  # 5 samples, 10 features
        # predict
        y_pred = trained_model.predict(X_test)
        assert y_pred.shape == (5,)
        # predict_proba
        y_proba = trained_model.predict_proba(X_test)
        assert y_proba.shape == (5, 2)  # 5 samples, 2 classes

    def test_predict_output_range(self, trained_model):
        """Prediction output ranges"""
        X_test = np.random.rand(5, 10)
        # predict output
        y_pred = trained_model.predict(X_test)
        assert all(pred in {0, 1} for pred in y_pred)
        # predict_proba output
        y_proba = trained_model.predict_proba(X_test)
        assert all(0 <= proba <= 1 for proba in y_proba.ravel())
        assert np.allclose(y_proba.sum(axis=1), np.ones(5))

    def test_batch_inference(self, trained_model):
        """Batch inference"""
        # Small batch
        X_small = np.random.rand(5, 10)
        y_small = trained_model.predict(X_small)
        assert len(y_small) == 5
        # Large batch
        X_large = np.random.rand(1000, 10)
        y_large = trained_model.predict(X_large)
        assert len(y_large) == 1000
        # Edge case: empty input
        X_empty = np.random.rand(0, 10)
        y_empty = trained_model.predict(X_empty)
        assert len(y_empty) == 0

    def test_inference_time(self, trained_model, benchmark):
        """Inference time"""
        X_test = np.random.rand(100, 10)
        # Measure inference time with pytest-benchmark
        benchmark(trained_model.predict, X_test)
        # pytest-benchmark exposes the statistics on benchmark.stats.stats
        stats = benchmark.stats.stats
        print(f"Mean inference time: {stats.mean:.4f} s")
        print(f"Std deviation: {stats.stddev:.4f} s")
4.3 Model Save/Load Tests
import os
import tempfile
import pickle
import joblib
import pytest
import numpy as np
import torch
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

class TestModelPersistence:
    """Tests for model persistence"""
    @pytest.fixture
    def sample_model(self):
        # Build a small model
        X, y = make_classification(n_samples=100, n_features=5, random_state=42)
        model = RandomForestClassifier(n_estimators=10, random_state=42)
        model.fit(X, y)
        return model

    def test_pickle_persistence(self, sample_model):
        """Save and load with pickle"""
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            # Save the model
            pickle.dump(sample_model, tmp)
            tmp_path = tmp.name
        try:
            # Load the model
            with open(tmp_path, 'rb') as f:
                loaded_model = pickle.load(f)
            # Verify behavior
            X_test = np.random.rand(5, 5)
            orig_pred = sample_model.predict(X_test)
            loaded_pred = loaded_model.predict(X_test)
            assert np.array_equal(orig_pred, loaded_pred)
        finally:
            # Clean up the temporary file
            os.unlink(tmp_path)

    def test_joblib_persistence(self, sample_model):
        """Save and load with joblib"""
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            # Save the model
            joblib.dump(sample_model, tmp)
            tmp_path = tmp.name
        try:
            # Load the model
            loaded_model = joblib.load(tmp_path)
            # Verify behavior
            X_test = np.random.rand(5, 5)
            orig_pred = sample_model.predict(X_test)
            loaded_pred = loaded_model.predict(X_test)
            assert np.array_equal(orig_pred, loaded_pred)
        finally:
            os.unlink(tmp_path)

    def test_pytorch_persistence(self):
        """Save and load a PyTorch model"""
        # Build a simple PyTorch model
        model = torch.nn.Sequential(
            torch.nn.Linear(10, 5),
            torch.nn.ReLU(),
            torch.nn.Linear(5, 2)
        )
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            # Save the state dict
            torch.save(model.state_dict(), tmp)
            tmp_path = tmp.name
        try:
            # Build a fresh instance with the same architecture
            loaded_model = torch.nn.Sequential(
                torch.nn.Linear(10, 5),
                torch.nn.ReLU(),
                torch.nn.Linear(5, 2)
            )
            # Load the state dict
            loaded_model.load_state_dict(torch.load(tmp_path))
            loaded_model.eval()
            # Compare parameters
            for orig_param, loaded_param in zip(
                model.parameters(), loaded_model.parameters()
            ):
                assert torch.allclose(orig_param, loaded_param)
        finally:
            os.unlink(tmp_path)

    def test_tensorflow_persistence(self):
        """Save and load a TensorFlow model"""
        # Build a simple Keras model
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(5, input_shape=(10,), activation='relu'),
            tf.keras.layers.Dense(2, activation='softmax')
        ])
        with tempfile.NamedTemporaryFile(suffix='.h5', delete=False) as tmp:
            tmp_path = tmp.name
        try:
            # Save and reload the model
            model.save(tmp_path)
            loaded_model = tf.keras.models.load_model(tmp_path)
            # Compare architectures
            assert len(model.layers) == len(loaded_model.layers)
            for orig_layer, loaded_layer in zip(
                model.layers, loaded_model.layers
            ):
                assert orig_layer.input_shape == loaded_layer.input_shape
                assert orig_layer.output_shape == loaded_layer.output_shape
            # Compare weights
            for orig_weight, loaded_weight in zip(
                model.get_weights(), loaded_model.get_weights()
            ):
                assert np.allclose(orig_weight, loaded_weight)
        finally:
            os.unlink(tmp_path)
5. Model Performance Tests
5.1 Baseline Performance Tests
import pytest
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

class TestModelPerformance:
    """Tests for model performance metrics"""
    @pytest.fixture
    def sample_data_and_model(self):
        # Generate data and a model
        X, y = make_classification(
            n_samples=1000,
            n_features=20,
            n_classes=2,
            random_state=42
        )
        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        # Train the model
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        return model, X_test, y_test

    def test_accuracy(self, sample_data_and_model):
        """Test accuracy"""
        model, X_test, y_test = sample_data_and_model
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        print(f"Model accuracy: {acc:.4f}")
        assert acc > 0.85  # adjust the threshold to the problem at hand

    def test_precision_recall(self, sample_data_and_model):
        """Test precision and recall"""
        model, X_test, y_test = sample_data_and_model
        y_pred = model.predict(X_test)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        print(f"Precision: {precision:.4f}, recall: {recall:.4f}")
        assert precision > 0.85
        assert recall > 0.85

    def test_f1_score(self, sample_data_and_model):
        """Test the F1 score"""
        model, X_test, y_test = sample_data_and_model
        y_pred = model.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        print(f"F1 score: {f1:.4f}")
        assert f1 > 0.85

    def test_confusion_matrix(self, sample_data_and_model):
        """Test the confusion matrix"""
        from sklearn.metrics import confusion_matrix
        model, X_test, y_test = sample_data_and_model
        y_pred = model.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        print("Confusion matrix:")
        print(cm)
        # Shape check
        assert cm.shape == (2, 2)
        # Correct classifications (the diagonal) should dominate
        assert cm[0, 0] + cm[1, 1] > cm[0, 1] + cm[1, 0]

    def test_roc_auc(self, sample_data_and_model):
        """Test ROC AUC"""
        from sklearn.metrics import roc_auc_score
        model, X_test, y_test = sample_data_and_model
        y_proba = model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_proba)
        print(f"ROC AUC: {roc_auc:.4f}")
        assert roc_auc > 0.9

    def test_classification_report(self, sample_data_and_model):
        """Test the classification report"""
        from sklearn.metrics import classification_report
        model, X_test, y_test = sample_data_and_model
        y_pred = model.predict(X_test)
        report = classification_report(y_test, y_pred)
        print("Classification report:")
        print(report)
        # The report should contain the key metrics
        assert "precision" in report
        assert "recall" in report
        assert "f1-score" in report
        assert "accuracy" in report
5.2 Inference Speed Tests
import time
import timeit
import pytest
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

class TestInferenceSpeed:
    """Tests for inference speed"""
    @pytest.fixture
    def sample_model_and_data(self):
        # Build a model and test data
        X, y = make_classification(
            n_samples=1000,
            n_features=20,
            random_state=42
        )
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X, y)
        # Test inputs of different sizes
        test_data = {
            "small": np.random.rand(1, 20),
            "medium": np.random.rand(100, 20),
            "large": np.random.rand(10000, 20)
        }
        return model, test_data

    def test_single_inference_latency(self, sample_model_and_data):
        """Latency of a single prediction"""
        model, test_data = sample_model_and_data
        X = test_data["small"]
        # Warm up
        model.predict(X)
        # Measure latency
        latency = timeit.timeit(
            lambda: model.predict(X),
            number=100
        ) / 100
        print(f"Mean single-prediction latency: {latency * 1000:.2f} ms")
        # Set the threshold according to the application's needs
        assert latency < 0.1  # 100 ms

    def test_batch_inference_throughput(self, sample_model_and_data):
        """Batch inference throughput"""
        model, test_data = sample_model_and_data
        X = test_data["large"]
        # Warm up
        model.predict(X[:100])
        # Measure throughput
        start_time = time.time()
        n_samples = X.shape[0]
        model.predict(X)
        duration = time.time() - start_time
        throughput = n_samples / duration
        print(f"Batch inference throughput: {throughput:.2f} samples/s")
        # Set the threshold according to the application's needs
        assert throughput > 1000  # 1000 samples/s

    def test_inference_scalability(self, sample_model_and_data):
        """How inference time scales with batch size"""
        model, test_data = sample_model_and_data
        # pytest-benchmark allows only one benchmark() call per test, so the
        # sweep over batch sizes uses timeit instead
        batch_sizes = [1, 10, 100, 1000]
        results = {}
        for size in batch_sizes:
            X = np.random.rand(size, 20)
            model.predict(X)  # warm up
            mean_time = timeit.timeit(lambda: model.predict(X), number=10) / 10
            results[size] = {
                "mean_time": mean_time,
                "time_per_sample": mean_time / size
            }
        # Print the results
        print("\nInference scalability results:")
        print("batch size | total time (s) | time per sample (s)")
        for size, res in results.items():
            print(f"{size:10} | {res['mean_time']:.6f} | {res['time_per_sample']:.6f}")
        # Per-sample time should drop or stay flat as the batch grows
        time_per_sample = [res["time_per_sample"] for res in results.values()]
        assert np.all(np.diff(time_per_sample) <= 1e-4)  # allow small jitter
5.3 Resource Utilization Tests
import pytest
import psutil
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

class TestResourceUtilization:
    """Tests for resource utilization"""
    @pytest.fixture
    def sample_model_and_data(self):
        # Build a model and test data
        X, y = make_classification(
            n_samples=1000,
            n_features=20,
            random_state=42
        )
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X, y)
        # Generate test inputs
        test_data = np.random.rand(1000, 20)
        return model, test_data

    def test_cpu_utilization(self, sample_model_and_data):
        """CPU utilization during inference"""
        model, X = sample_model_and_data
        # Prime the counter; the first cpu_percent() call always returns 0.0
        psutil.cpu_percent(interval=None)
        # Run enough inference work to register on the counter
        for _ in range(50):
            model.predict(X)
        cpu_during = psutil.cpu_percent(interval=None)
        print(f"CPU utilization during inference: {cpu_during}%")
        # Inference should produce a measurable CPU load
        assert cpu_during > 0

    def test_memory_usage(self, sample_model_and_data):
        """Memory usage during inference"""
        model, X = sample_model_and_data
        # Memory before inference
        mem_before = psutil.Process().memory_info().rss / (1024 ** 2)  # MB
        # Run inference
        model.predict(X)
        # Memory after inference
        mem_after = psutil.Process().memory_info().rss / (1024 ** 2)
        print(f"Memory before inference: {mem_before:.2f} MB")
        print(f"Memory after inference: {mem_after:.2f} MB")
        print(f"Memory delta: {mem_after - mem_before:.2f} MB")
        # The increase should stay within a reasonable bound
        assert mem_after - mem_before < 100  # less than 100 MB

    def test_gpu_utilization(self, sample_model_and_data):
        """GPU utilization (if available). Note: a scikit-learn model runs on
        the CPU, so this check is only meaningful for GPU-backed models."""
        try:
            import torch
            import pynvml
            if not torch.cuda.is_available():
                pytest.skip("No GPU available")
            model, X = sample_model_and_data
            # Initialize NVML
            pynvml.nvmlInit()
            handle = pynvml.nvmlDeviceGetHandleByIndex(0)
            # GPU utilization before inference
            util_before = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
            # Run inference
            model.predict(X)
            # GPU utilization during/after inference
            util_during = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
            print(f"GPU utilization before inference: {util_before}%")
            print(f"GPU utilization during inference: {util_during}%")
            # For a CPU-bound model we only sanity-check the reading; tighten
            # this to util_during > util_before for genuinely GPU-backed models
            assert util_during >= 0
            # Shut down NVML
            pynvml.nvmlShutdown()
        except ImportError:
            pytest.skip("pynvml is not available")
6. Model Robustness Tests
6.1 Adversarial Example Tests
import pytest
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from art.estimators.classification import SklearnClassifier
from art.attacks.evasion import FastGradientMethod

class TestAdversarialRobustness:
    """Tests for robustness against adversarial examples"""
    @pytest.fixture
    def sample_model_and_data(self):
        # Create model and test data
        X, y = make_classification(
            n_samples=1000,
            n_features=20,
            random_state=42
        )
        # Split out the test set first so the model never sees it
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        # FGSM needs loss gradients; ART provides them for logistic
        # regression, so we use it here instead of a tree ensemble
        model = LogisticRegression(max_iter=1000)
        model.fit(X_train, y_train)
        # Wrap in an ART classifier
        art_classifier = SklearnClassifier(model=model)
        return art_classifier, X_test, y_test

    def test_clean_accuracy(self, sample_model_and_data):
        """Accuracy on clean samples"""
        classifier, X_test, y_test = sample_model_and_data
        # Predictions on clean samples
        y_pred = classifier.predict(X_test)
        y_pred = np.argmax(y_pred, axis=1)
        acc = np.mean(y_pred == y_test)
        print(f"Clean accuracy: {acc:.4f}")
        assert acc > 0.85

    def test_fgsm_attack(self, sample_model_and_data):
        """Test the FGSM adversarial attack"""
        classifier, X_test, y_test = sample_model_and_data
        # Build the FGSM attack
        attack = FastGradientMethod(
            estimator=classifier,
            eps=0.1  # perturbation budget
        )
        # Generate adversarial examples
        X_test_adv = attack.generate(x=X_test)
        # Predict on the adversarial examples
        y_pred_adv = classifier.predict(X_test_adv)
        y_pred_adv = np.argmax(y_pred_adv, axis=1)
        # Adversarial accuracy
        acc_adv = np.mean(y_pred_adv == y_test)
        print(f"Adversarial accuracy: {acc_adv:.4f}")
        # Relative accuracy drop
        y_pred_clean = np.argmax(classifier.predict(X_test), axis=1)
        acc_clean = np.mean(y_pred_clean == y_test)
        drop = (acc_clean - acc_adv) / acc_clean
        print(f"Relative accuracy drop: {drop:.2%}")
        # Difference between adversarial and original samples
        perturbation = np.mean(np.abs(X_test_adv - X_test))
        print(f"Mean perturbation: {perturbation:.6f}")
        # Set the threshold according to the application's needs
        assert drop < 0.5  # the accuracy drop should stay below 50%

    def test_robustness_curve(self, sample_model_and_data):
        """Robustness across perturbation budgets"""
        classifier, X_test, y_test = sample_model_and_data
        epsilons = [0.01, 0.05, 0.1, 0.2, 0.3]
        clean_acc = np.mean(
            np.argmax(classifier.predict(X_test), axis=1) == y_test
        )
        results = []
        for eps in epsilons:
            attack = FastGradientMethod(
                estimator=classifier,
                eps=eps
            )
            X_adv = attack.generate(x=X_test)
            y_pred = np.argmax(classifier.predict(X_adv), axis=1)
            acc = np.mean(y_pred == y_test)
            results.append((eps, acc))
            print(f"Perturbation budget: {eps:.2f}, adversarial accuracy: {acc:.4f}")
        # Print the robustness curve data
        print("\nRobustness curve data:")
        print("budget | adversarial accuracy | accuracy drop")
        for eps, acc in results:
            print(f"{eps:.2f} | {acc:.4f} | {clean_acc - acc:.4f}")
        # Accuracy should fall monotonically as the budget grows
        accuracies = [acc for _, acc in results]
        assert np.all(np.diff(accuracies) <= 0)
6.2 Noise Robustness Tests
import pytest
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

class TestNoiseRobustness:
    """Tests for noise robustness"""
    @pytest.fixture
    def sample_model_and_data(self):
        # Create model and test data
        X, y = make_classification(
            n_samples=1000,
            n_features=20,
            random_state=42
        )
        # Split out the test set first so the model never sees it
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        return model, X_test, y_test

    def test_gaussian_noise(self, sample_model_and_data):
        """Test robustness to Gaussian noise"""
        model, X_test, y_test = sample_model_and_data
        noise_levels = [0.01, 0.05, 0.1, 0.2]
        clean_acc = model.score(X_test, y_test)
        results = []
        for std in noise_levels:
            # Add Gaussian noise
            noise = np.random.normal(0, std, size=X_test.shape)
            X_noisy = X_test + noise
            # Accuracy on the noisy data
            acc = model.score(X_noisy, y_test)
            results.append((std, acc))
            print(f"Noise std: {std:.2f}, accuracy: {acc:.4f}")
        # Report the impact of the noise
        print("\nGaussian noise results:")
        print("noise std | accuracy | accuracy drop")
        for std, acc in results:
            print(f"{std:.2f} | {acc:.4f} | {clean_acc - acc:.4f}")
        # Accuracy should trend downward as noise grows
        accuracies = [acc for _, acc in results]
        assert np.all(np.diff(accuracies) <= 0.05)  # allow slight fluctuation

    def test_missing_data(self, sample_model_and_data):
        """Test robustness to missing data"""
        model, X_test, y_test = sample_model_and_data
        missing_ratios = [0.1, 0.3, 0.5, 0.7]
        clean_acc = model.score(X_test, y_test)
        results = []
        for ratio in missing_ratios:
            # Create missing entries
            X_missing = X_test.copy()
            mask = np.random.rand(*X_test.shape) < ratio
            X_missing[mask] = np.nan
            # Simple mean imputation (real applications should use a more
            # sophisticated strategy)
            X_filled = np.where(
                np.isnan(X_missing),
                np.nanmean(X_test, axis=0),
                X_missing
            )
            # Accuracy on the imputed data
            acc = model.score(X_filled, y_test)
            results.append((ratio, acc))
            print(f"Missing ratio: {ratio:.1f}, accuracy after imputation: {acc:.4f}")
        # Report the impact of missing data
        print("\nMissing data results:")
        print("missing ratio | accuracy | accuracy drop")
        for ratio, acc in results:
            print(f"{ratio:.1f} | {acc:.4f} | {clean_acc - acc:.4f}")
        # Accuracy should drop as more values go missing
        accuracies = [acc for _, acc in results]
        assert np.all(np.diff(accuracies) <= 0.1)  # allow larger fluctuation

    def test_outlier_robustness(self, sample_model_and_data):
        """Test robustness to outliers"""
        model, X_test, y_test = sample_model_and_data
        outlier_levels = [1.0, 2.0, 5.0, 10.0]
        clean_acc = model.score(X_test, y_test)
        results = []
        for level in outlier_levels:
            # Inject outliers
            X_outlier = X_test.copy()
            n_outliers = int(0.1 * X_test.size)  # 10% of the entries become outliers
            indices = np.random.choice(
                X_test.size, n_outliers, replace=False
            )
            rows, cols = np.unravel_index(indices, X_test.shape)
            X_outlier[rows, cols] = level * np.random.randn(n_outliers)
            # Accuracy on the corrupted data
            acc = model.score(X_outlier, y_test)
            results.append((level, acc))
            print(f"Outlier level: {level:.1f}, accuracy: {acc:.4f}")
        # Report the impact of the outliers
        print("\nOutlier results:")
        print("outlier level | accuracy | accuracy drop")
        for level, acc in results:
            print(f"{level:.1f} | {acc:.4f} | {clean_acc - acc:.4f}")
        # Accuracy should drop as the outlier level grows
        accuracies = [acc for _, acc in results]
        assert np.all(np.diff(accuracies) <= 0.1)  # allow larger fluctuation
6.3 Input Perturbation Tests
import pytest
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

class TestInputPerturbation:
    """Tests for robustness to input perturbations"""
    @pytest.fixture
    def sample_model_and_data(self):
        # Create model and test data
        X, y = make_classification(
            n_samples=1000,
            n_features=20,
            random_state=42
        )
        # Split out the test set first so the model never sees it
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        return model, X_test, y_test

    def test_row_permutation(self, sample_model_and_data):
        """Predictions must not depend on the order of the input rows.
        (Note: models are NOT invariant to permuting feature columns, since
        each column has a fixed meaning learned at training time.)"""
        model, X_test, y_test = sample_model_and_data
        # Accuracy on the original order
        orig_acc = model.score(X_test, y_test)
        # Shuffle the rows, keeping labels aligned
        perm = np.random.permutation(X_test.shape[0])
        permuted_acc = model.score(X_test[perm], y_test[perm])
        print(f"Original accuracy: {orig_acc:.4f}")
        print(f"Accuracy after row permutation: {permuted_acc:.4f}")
        # The two accuracies must match exactly (up to float noise)
        assert np.isclose(orig_acc, permuted_acc, atol=1e-8)

    def test_feature_drop(self, sample_model_and_data):
        """Test robustness to dropping features"""
        model, X_test, y_test = sample_model_and_data
        # Baseline accuracy
        orig_acc = model.score(X_test, y_test)
        # Keep a random 80% of the features
        keep_indices = np.random.choice(
            X_test.shape[1],
            size=int(0.8 * X_test.shape[1]),
            replace=False
        )
        X_reduced = X_test[:, keep_indices]
        # Retrain on the reduced feature set (the feature count changed),
        # holding out part of the data for evaluation to avoid leakage
        X_tr, X_hold, y_tr, y_hold = train_test_split(
            X_reduced, y_test, test_size=0.2, random_state=42
        )
        model_dropped = RandomForestClassifier(
            n_estimators=100, random_state=42
        )
        model_dropped.fit(X_tr, y_tr)
        dropped_acc = model_dropped.score(X_hold, y_hold)
        print(f"Original accuracy: {orig_acc:.4f}")
        print(f"Accuracy after dropping 20% of features: {dropped_acc:.4f}")
        # The accuracy loss should stay within a reasonable bound
        assert (orig_acc - dropped_acc) < 0.15

    def test_feature_scale(self, sample_model_and_data):
        """Test robustness to feature rescaling"""
        model, X_test, y_test = sample_model_and_data
        # Baseline accuracy
        orig_acc = model.score(X_test, y_test)
        # Apply different scalings
        scalers = {
            "Standard": lambda x: (x - np.mean(x, axis=0)) / np.std(x, axis=0),
            "MinMax": lambda x: (x - np.min(x, axis=0)) / (
                np.max(x, axis=0) - np.min(x, axis=0)
            ),
            "Robust": lambda x: (x - np.median(x, axis=0)) / (
                np.percentile(x, 75, axis=0) - np.percentile(x, 25, axis=0)
            )
        }
        results = []
        for name, scaler in scalers.items():
            try:
                X_scaled = scaler(X_test)
                acc = model.score(X_scaled, y_test)
                results.append((name, acc))
                print(f"Accuracy after {name} scaling: {acc:.4f}")
            except Exception as e:
                print(f"{name} scaling failed: {str(e)}")
                results.append((name, np.nan))
        # Report the impact of rescaling
        print("\nFeature scaling results:")
        print("scaler   | accuracy | accuracy change")
        for name, acc in results:
            if not np.isnan(acc):
                print(f"{name:8} | {acc:.4f} | {acc - orig_acc:+.4f}")
        # The change in accuracy should stay within a reasonable bound
        valid_accs = [acc for _, acc in results if not np.isnan(acc)]
        assert all(abs(acc - orig_acc) < 0.1 for acc in valid_accs)
7. Model Fairness and Bias Tests
7.1 Group Fairness Tests
import pytest
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from fairlearn.metrics import (
    demographic_parity_difference,
    equalized_odds_difference
)

class TestGroupFairness:
    """Tests for group fairness"""
    @pytest.fixture
    def sample_model_and_data(self):
        # Create data with a sensitive feature
        X, y = make_classification(
            n_samples=2000,
            n_features=10,
            n_classes=2,
            random_state=42
        )
        # Add a sensitive feature (assumed gender: 0 = female, 1 = male)
        sensitive_feature = np.random.randint(0, 2, size=2000)
        X = np.column_stack((X, sensitive_feature))
        # Deliberately inject bias: men are more likely to be labeled positive
        y = np.where(
            (sensitive_feature == 1) & (y == 0) & (np.random.rand(2000) > 0.7),
            1, y
        )
        # Convert to a DataFrame
        feature_names = [f"feature_{i}" for i in range(10)] + ["gender"]
        df = pd.DataFrame(X, columns=feature_names)
        df["label"] = y
        # Split the dataset
        X = df.drop("label", axis=1)
        y = df["label"]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        # Train the model (without the sensitive feature)
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_train.drop("gender", axis=1), y_train)
        return {
            "model": model,
            "X_test": X_test,
            "y_test": y_test,
            "sensitive_feature": X_test["gender"]
        }

    def test_demographic_parity(self, sample_model_and_data):
        """Test demographic parity"""
        data = sample_model_and_data
        model = data["model"]
        X_test = data["X_test"].drop("gender", axis=1)
        y_test = data["y_test"]
        sensitive = data["sensitive_feature"]
        # Model predictions
        y_pred = model.predict(X_test)
        # Demographic parity difference
        dp_diff = demographic_parity_difference(
            y_true=y_test,
            y_pred=y_pred,
            sensitive_features=sensitive
        )
        print(f"Demographic parity difference: {dp_diff:.4f}")
        # The difference should stay within an acceptable range
        assert abs(dp_diff) < 0.2

    def test_equalized_odds(self, sample_model_and_data):
        """Test equalized odds"""
        data = sample_model_and_data
        model = data["model"]
        X_test = data["X_test"].drop("gender", axis=1)
        y_test = data["y_test"]
        sensitive = data["sensitive_feature"]
        # Model predictions
        y_pred = model.predict(X_test)
        # Equalized odds difference
        eo_diff = equalized_odds_difference(
            y_true=y_test,
            y_pred=y_pred,
            sensitive_features=sensitive
        )
        print(f"Equalized odds difference: {eo_diff:.4f}")
        # The difference should stay within an acceptable range
        assert abs(eo_diff) < 0.25

    def test_disparate_impact(self, sample_model_and_data):
        """Test disparate impact"""
        data = sample_model_and_data
        model = data["model"]
        X_test = data["X_test"].drop("gender", axis=1)
        sensitive = data["sensitive_feature"]
        # Model predictions
        y_pred = model.predict(X_test)
        # Positive rate per group
        group_0_rate = np.mean(y_pred[sensitive == 0])
        group_1_rate = np.mean(y_pred[sensitive == 1])
        # Disparate impact ratio
        di_ratio = group_0_rate / group_1_rate
        print(f"Disparate impact ratio: {di_ratio:.4f}")
        # The ratio should fall in the conventional fair range (0.8-1.25)
        assert 0.8 <= di_ratio <= 1.25

    def test_fairness_metrics_by_subgroup(self, sample_model_and_data):
        """Performance metrics per subgroup"""
        data = sample_model_and_data
        model = data["model"]
        X_test = data["X_test"].drop("gender", axis=1)
        y_test = data["y_test"]
        sensitive = data["sensitive_feature"]
        # Model predictions
        y_pred = model.predict(X_test)
        # Metrics per subgroup
        from sklearn.metrics import accuracy_score, precision_score, recall_score
        results = []
        for group in [0, 1]:
            mask = (sensitive == group)
            acc = accuracy_score(y_test[mask], y_pred[mask])
            prec = precision_score(y_test[mask], y_pred[mask])
            rec = recall_score(y_test[mask], y_pred[mask])
            results.append((group, acc, prec, rec))
            print(f"\nGroup {group} performance:")
            print(f"Accuracy: {acc:.4f}")
            print(f"Precision: {prec:.4f}")
            print(f"Recall: {rec:.4f}")
        # Differences between groups
        acc_diff = abs(results[0][1] - results[1][1])
        prec_diff = abs(results[0][2] - results[1][2])
        rec_diff = abs(results[0][3] - results[1][3])
        print(f"\nBetween-group differences:")
        print(f"Accuracy difference: {acc_diff:.4f}")
        print(f"Precision difference: {prec_diff:.4f}")
        print(f"Recall difference: {rec_diff:.4f}")
        # Differences should stay within acceptable bounds
        assert acc_diff < 0.15
        assert prec_diff < 0.2
        assert rec_diff < 0.2
7.2 Individual Fairness Tests
import pytest
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import euclidean_distances

class TestIndividualFairness:
    """Tests for individual fairness"""
    @pytest.fixture
    def sample_model_and_data(self):
        # Create test data
        X, y = make_classification(
            n_samples=1000,
            n_features=10,
            n_classes=2,
            random_state=42
        )
        # Train the model
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X, y)
        # Pick evaluation samples
        X_test = X[:100]
        y_test = y[:100]
        return model, X_test, y_test

    def test_similar_input_similar_output(self, sample_model_and_data):
        """Similar inputs should yield similar outputs"""
        model, X_test, _ = sample_model_and_data
        # Pairwise distances
        distances = euclidean_distances(X_test)
        np.fill_diagonal(distances, np.inf)  # ignore self-distances
        # Predicted probabilities
        y_proba = model.predict_proba(X_test)[:, 1]
        # For each sample, find its 5 nearest neighbors
        similar_pairs = []
        for i in range(len(X_test)):
            # Indices of the most similar samples
            closest_indices = np.argsort(distances[i])[:5]
            # Record the prediction gap for each similar pair
            for j in closest_indices:
                proba_diff = abs(y_proba[i] - y_proba[j])
                dist = distances[i, j]
                similar_pairs.append((dist, proba_diff))
        # Mean prediction gap
        dists, diffs = zip(*similar_pairs)
        avg_diff = np.mean(diffs)
        print(f"Mean probability gap between similar samples: {avg_diff:.4f}")
        # Similar samples should receive similar predictions
        assert avg_diff < 0.2

    def test_consistency_score(self, sample_model_and_data):
        """Test the consistency score"""
        model, X_test, _ = sample_model_and_data
        # Pairwise distances
        distances = euclidean_distances(X_test)
        # Predictions
        y_pred = model.predict(X_test)
        # Consistency score
        n = len(X_test)
        consistency = 0.0
        for i in range(n):
            # 5 nearest neighbors (index 0 is the sample itself, so skip it)
            closest_indices = np.argsort(distances[i])[1:6]
            # Fraction of neighbors with the same predicted label
            same_label = np.sum(y_pred[closest_indices] == y_pred[i])
            consistency += same_label / 5
        consistency /= n
        print(f"Consistency score: {consistency:.4f}")
        # The score should be reasonably high
        assert consistency > 0.7

    def test_counterfactual_fairness(self, sample_model_and_data):
        """Test counterfactual fairness"""
        model, X_test, _ = sample_model_and_data
        # Pick a reference sample
        base_idx = 0
        base_sample = X_test[base_idx]
        base_pred = model.predict([base_sample])[0]
        # Build a counterfactual by flipping the assumed sensitive feature.
        # Note: make_classification produces continuous features, so treating
        # feature 0 as binary here is purely illustrative.
        sensitive_feature_idx = 0
        counterfactual_samples = base_sample.copy().reshape(1, -1)
        counterfactual_samples[:, sensitive_feature_idx] = 1 - counterfactual_samples[:, sensitive_feature_idx]
        # Counterfactual prediction
        cf_pred = model.predict(counterfactual_samples)[0]
        print(f"Original prediction: {base_pred}, counterfactual prediction: {cf_pred}")
        # Ideally the prediction should not change
        assert base_pred == cf_pred
7.3 Bias Mitigation Tests
import pytest
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from fairlearn.reductions import (
    ExponentiatedGradient,
    GridSearch,
    DemographicParity,
    EqualizedOdds
)

class TestBiasMitigation:
    """Tests for bias mitigation techniques"""
    @pytest.fixture
    def biased_data(self):
        # Create deliberately biased data
        X, y = make_classification(
            n_samples=2000,
            n_features=10,
            n_classes=2,
            random_state=42
        )
        # Add a sensitive feature (gender: 0 = female, 1 = male)
        sensitive_feature = np.random.randint(0, 2, size=2000)
        X = np.column_stack((X, sensitive_feature))
        # Inject bias: men are more likely to be labeled positive
        y = np.where(
            (sensitive_feature == 1) & (y == 0) & (np.random.rand(2000) > 0.7),
            1, y
        )
        # Convert to a DataFrame
        feature_names = [f"feature_{i}" for i in range(10)] + ["gender"]
        df = pd.DataFrame(X, columns=feature_names)
        df["label"] = y
        # Split the dataset
        X = df.drop("label", axis=1)
        y = df["label"]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42
        )
        return {
            "X_train": X_train,
            "X_test": X_test,
            "y_train": y_train,
            "y_test": y_test,
            "sensitive_feature": X_train["gender"],
            "sensitive_feature_test": X_test["gender"]
        }

    def test_exponentiated_gradient(self, biased_data):
        """Test the exponentiated gradient mitigation"""
        X_train = biased_data["X_train"].drop("gender", axis=1)
        y_train = biased_data["y_train"]
        sensitive = biased_data["sensitive_feature"]
        # Base model
        base_model = RandomForestClassifier(
            n_estimators=50, random_state=42
        )
        # Fairness constraint
        constraint = DemographicParity()
        # Mitigated model
        mitigator = ExponentiatedGradient(
            estimator=base_model,
            constraints=constraint,
            max_iter=10
        )
        # Train the mitigated model
        mitigator.fit(X_train, y_train, sensitive_features=sensitive)
        # Evaluate
        X_test = biased_data["X_test"].drop("gender", axis=1)
        y_test = biased_data["y_test"]
        sensitive_test = biased_data["sensitive_feature_test"]
        # Baseline predictions
        base_model.fit(X_train, y_train)
        y_pred_base = base_model.predict(X_test)
        # Mitigated predictions
        y_pred_mitigated = mitigator.predict(X_test)
        # Fairness metrics
        from fairlearn.metrics import demographic_parity_difference
        dp_base = demographic_parity_difference(
            y_test, y_pred_base, sensitive_features=sensitive_test
        )
        dp_mitigated = demographic_parity_difference(
            y_test, y_pred_mitigated, sensitive_features=sensitive_test
        )
        print(f"Demographic parity difference, baseline: {dp_base:.4f}")
        print(f"Demographic parity difference, mitigated: {dp_mitigated:.4f}")
        # Mitigation should reduce the disparity
        assert abs(dp_mitigated) < abs(dp_base)
        # Check the accuracy cost
        from sklearn.metrics import accuracy_score
        acc_base = accuracy_score(y_test, y_pred_base)
        acc_mitigated = accuracy_score(y_test, y_pred_mitigated)
        print(f"Baseline accuracy: {acc_base:.4f}")
        print(f"Mitigated accuracy: {acc_mitigated:.4f}")
        # The accuracy drop should stay within a reasonable bound
        assert (acc_base - acc_mitigated) < 0.15

    def test_grid_search_mitigation(self, biased_data):
        """Test the grid search mitigation"""
        X_train = biased_data["X_train"].drop("gender", axis=1)
        y_train = biased_data["y_train"]
        sensitive = biased_data["sensitive_feature"]
        # Base model
        base_model = RandomForestClassifier(
            n_estimators=50, random_state=42
        )
        # Fairness constraint
        constraint = EqualizedOdds()
        # Mitigated model
        mitigator = GridSearch(
            estimator=base_model,
            constraints=constraint,
            grid_size=10
        )
        # Train the mitigated model
        mitigator.fit(X_train, y_train, sensitive_features=sensitive)
        # Evaluate
        X_test = biased_data["X_test"].drop("gender", axis=1)
        y_test = biased_data["y_test"]
        sensitive_test = biased_data["sensitive_feature_test"]
        # Baseline predictions
        base_model.fit(X_train, y_train)
        y_pred_base = base_model.predict(X_test)
        # Mitigated predictions
        y_pred_mitigated = mitigator.predict(X_test)
        # Fairness metrics
        from fairlearn.metrics import equalized_odds_difference
        eo_base = equalized_odds_difference(
            y_test, y_pred_base, sensitive_features=sensitive_test
        )
        eo_mitigated = equalized_odds_difference(
            y_test, y_pred_mitigated, sensitive_features=sensitive_test
        )
        print(f"Equalized odds difference, baseline: {eo_base:.4f}")
        print(f"Equalized odds difference, mitigated: {eo_mitigated:.4f}")
        # Mitigation should reduce the disparity
        assert abs(eo_mitigated) < abs(eo_base)

    def test_preprocessing_mitigation(self, biased_data):
        """Test a preprocessing-based mitigation"""
        from fairlearn.preprocessing import CorrelationRemover
        X_train = biased_data["X_train"].copy()
        # Remove the correlation between the sensitive feature and the rest;
        # note that CorrelationRemover drops the sensitive column from its output
        cr = CorrelationRemover(sensitive_feature_ids=["gender"], alpha=1.0)
        X_train_mitigated = cr.fit_transform(X_train)
        non_sensitive_cols = [col for col in X_train.columns if col != "gender"]
        X_train_mitigated = pd.DataFrame(
            X_train_mitigated,
            columns=non_sensitive_cols
        )
        # Train the model
        model = RandomForestClassifier(n_estimators=50, random_state=42)
        model.fit(X_train_mitigated, biased_data["y_train"])
        # Apply the same transform to the test set
        X_test = biased_data["X_test"].copy()
        X_test_mitigated = pd.DataFrame(
            cr.transform(X_test),
            columns=non_sensitive_cols
        )
        # Evaluate
        y_pred = model.predict(X_test_mitigated)
        sensitive_test = biased_data["sensitive_feature_test"]
        # Fairness metric
        from fairlearn.metrics import demographic_parity_difference
        dp_diff = demographic_parity_difference(
            biased_data["y_test"], y_pred, sensitive_features=sensitive_test
        )
        print(f"Demographic parity difference after preprocessing: {dp_diff:.4f}")
        # Compare against the unmitigated baseline
        base_model = RandomForestClassifier(n_estimators=50, random_state=42)
        base_model.fit(
            biased_data["X_train"].drop("gender", axis=1),
            biased_data["y_train"]
        )
        y_pred_base = base_model.predict(biased_data["X_test"].drop("gender", axis=1))
        dp_base = demographic_parity_difference(
            biased_data["y_test"], y_pred_base, sensitive_features=sensitive_test
        )
        assert abs(dp_diff) < abs(dp_base)
8. Model Interpretability Tests
8.1 Feature Importance Tests
import pytest
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
import shap

class TestFeatureImportance:
    """Tests for feature importance"""
    @pytest.fixture
    def sample_model_and_data(self):
        # Create test data; shuffle=False keeps the informative features in
        # the first columns, which the assertions below rely on
        X, y = make_classification(
            n_samples=1000,
            n_features=10,
            n_informative=5,
            n_redundant=2,
            n_classes=2,
            shuffle=False,
            random_state=42
        )
        # Train the model
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X, y)
        return model, X, y

    def test_permutation_importance(self, sample_model_and_data):
        """Test permutation importance"""
        from sklearn.inspection import permutation_importance
        model, X, y = sample_model_and_data
        # Compute permutation importance
        result = permutation_importance(
            model, X, y, n_repeats=10, random_state=42
        )
        importances = result.importances_mean
        sorted_idx = np.argsort(importances)[::-1]
        # Print the importances
        print("\nPermutation feature importances:")
        for i in sorted_idx:
            print(f"Feature {i}: {importances[i]:.4f} ± {result.importances_std[i]:.4f}")
        # Informative features should rank high
        assert sorted_idx[0] in [0, 1, 2, 3, 4]  # columns 0-4 are informative
        assert sorted_idx[-1] in [5, 6, 7, 8, 9]  # columns 5-9 are redundant/noise

    def test_shap_values(self, sample_model_and_data):
        """Test SHAP values"""
        model, X, y = sample_model_and_data
        # Compute SHAP values
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X)
        # Older SHAP returns one array per class; newer SHAP returns a
        # (n_samples, n_features, n_classes) array
        if isinstance(shap_values, list):
            shap_values = shap_values[1]  # SHAP values for the positive class
        elif shap_values.ndim == 3:
            shap_values = shap_values[:, :, 1]
        # Feature importance as the mean absolute SHAP value
        importance = np.mean(np.abs(shap_values), axis=0)
        sorted_idx = np.argsort(importance)[::-1]
        # Print the importances
        print("\nSHAP feature importances:")
        for i in sorted_idx:
            print(f"Feature {i}: {importance[i]:.4f}")
        # Informative features should rank high
        assert sorted_idx[0] in [0, 1, 2, 3, 4]  # columns 0-4 are informative

    def test_feature_importance_consistency(self, sample_model_and_data):
        """Different importance methods should broadly agree"""
        model, X, y = sample_model_and_data
        # Method 1: built-in feature importance
        builtin_importance = model.feature_importances_
        # Method 2: permutation importance
        from sklearn.inspection import permutation_importance
        perm_importance = permutation_importance(
            model, X, y, n_repeats=10, random_state=42
        ).importances_mean
        # Method 3: SHAP values
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X)
        if isinstance(shap_values, list):
            shap_values = shap_values[1]
        elif shap_values.ndim == 3:
            shap_values = shap_values[:, :, 1]
        shap_importance = np.mean(np.abs(shap_values), axis=0)
        # Normalize the scores
        def normalize(x):
            return (x - np.min(x)) / (np.max(x) - np.min(x))
        builtin_norm = normalize(builtin_importance)
        perm_norm = normalize(perm_importance)
        shap_norm = normalize(shap_importance)
        # Correlations between methods
        corr_builtin_perm = np.corrcoef(builtin_norm, perm_norm)[0, 1]
        corr_builtin_shap = np.corrcoef(builtin_norm, shap_norm)[0, 1]
        corr_perm_shap = np.corrcoef(perm_norm, shap_norm)[0, 1]
        print(f"Built-in vs permutation importance correlation: {corr_builtin_perm:.4f}")
        print(f"Built-in vs SHAP importance correlation: {corr_builtin_shap:.4f}")
        print(f"Permutation vs SHAP importance correlation: {corr_perm_shap:.4f}")
        # The methods should correlate reasonably well
        assert corr_builtin_perm > 0.5
        assert corr_builtin_shap > 0.5
        assert corr_perm_shap > 0.5

    def test_feature_importance_stability(self, sample_model_and_data):
        """Feature importance should be stable across subsamples"""
        model, X, y = sample_model_and_data
        # Compute SHAP importances on several random subsets
        explainer = shap.TreeExplainer(model)
        all_shap_importances = []
        for _ in range(5):
            # Use a different random subset each time
            subset_idx = np.random.choice(
                len(X), size=min(500, len(X)), replace=False
            )
            X_subset = X[subset_idx]
            shap_values = explainer.shap_values(X_subset)
            if isinstance(shap_values, list):
                shap_values = shap_values[1]
            elif shap_values.ndim == 3:
                shap_values = shap_values[:, :, 1]
            importance = np.mean(np.abs(shap_values), axis=0)
            all_shap_importances.append(importance)
        # Rank stability via Kendall's tau on the importance vectors
        # (computing tau on argsort index sequences would be meaningless)
        from scipy.stats import kendalltau
        stability_scores = []
        for i in range(len(all_shap_importances) - 1):
            for j in range(i + 1, len(all_shap_importances)):
                tau, _ = kendalltau(all_shap_importances[i], all_shap_importances[j])
                stability_scores.append(tau)
        avg_stability = np.mean(stability_scores)
        print(f"Mean importance-ranking stability (Kendall's tau): {avg_stability:.4f}")
        # Check stability
        assert avg_stability > 0.7
8.2 Local Interpretability Tests
import pytest
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
import lime
import lime.lime_tabular

class TestLocalInterpretability:
    """Tests for local interpretability"""
    @pytest.fixture
    def sample_model_and_data(self):
        # Create test data; shuffle=False keeps the informative features in
        # the first columns, which the assertions below rely on
        X, y = make_classification(
            n_samples=1000,
            n_features=5,  # few features, easier to interpret
            n_informative=3,
            n_redundant=1,
            n_classes=2,
            shuffle=False,
            random_state=42
        )
        # Train the model
        model = RandomForestClassifier(n_estimators=50, random_state=42)
        model.fit(X, y)
        return model, X, y

    def test_lime_explanations(self, sample_model_and_data):
        """Test LIME explanations"""
        model, X, y = sample_model_and_data
        # Build the LIME explainer
        explainer = lime.lime_tabular.LimeTabularExplainer(
            training_data=X,
            feature_names=[f"feature_{i}" for i in range(X.shape[1])],
            class_names=["class_0", "class_1"],
            mode="classification",
            random_state=42
        )
        # Pick one test sample
        sample_idx = 42
        instance = X[sample_idx]
        # Generate the explanation
        exp = explainer.explain_instance(
            data_row=instance,
            predict_fn=model.predict_proba,
            num_features=3  # show the 3 most important features
        )
        # Extract the explanation
        lime_features = [x[0] for x in exp.as_list()]
        lime_weights = [x[1] for x in exp.as_list()]
        # Print it
        print("\nLIME explanation:")
        for feature, weight in zip(lime_features, lime_weights):
            print(f"{feature}: {weight:.4f}")
        # Sanity checks
        assert len(lime_features) == 3
        assert any("feature_0" in f or "feature_1" in f or "feature_2" in f
                   for f in lime_features)  # an informative feature should appear

    def test_lime_stability(self, sample_model_and_data):
        """LIME explanations should be reasonably stable"""
        model, X, y = sample_model_and_data
        # Build the LIME explainer
        explainer = lime.lime_tabular.LimeTabularExplainer(
            training_data=X,
            feature_names=[f"feature_{i}" for i in range(X.shape[1])],
            class_names=["class_0", "class_1"],
            mode="classification",
            random_state=42
        )
        # Pick one test sample
        instance = X[42]
        # Explain the same sample several times
        all_explanations = []
        for _ in range(5):
            exp = explainer.explain_instance(
                data_row=instance,
                predict_fn=model.predict_proba,
                num_features=3
            )
            explanation = sorted(exp.as_list(), key=lambda x: abs(x[1]), reverse=True)
            all_explanations.append(explanation)
        # The top feature should be consistent across runs
        top_features = [exp[0][0] for exp in all_explanations]
        print(f"Top feature across runs: {top_features}")
        # At least 3 of the 5 runs should agree on the top feature
        from collections import Counter
        most_common = Counter(top_features).most_common(1)[0]
        assert most_common[1] >= 3

    def test_shap_local_explanations(self, sample_model_and_data):
        """Test local SHAP explanations"""
        import shap
        model, X, y = sample_model_and_data
        # Build the SHAP explainer
        explainer = shap.TreeExplainer(model)
        # Pick one test sample (keep the array 2-D)
        instance = X[42:43]
        # Compute SHAP values
        shap_values = explainer.shap_values(instance)
        if isinstance(shap_values, list):  # older SHAP: one array per class
            shap_values = shap_values[1]
        elif shap_values.ndim == 3:  # newer SHAP: (n_samples, n_features, n_classes)
            shap_values = shap_values[:, :, 1]
        # Print per-feature contributions
        print("\nLocal SHAP explanation:")
        for i in range(instance.shape[1]):
            print(f"Feature {i}: {shap_values[0][i]:.4f}")
        # Sanity checks
        assert np.abs(shap_values).sum() > 0  # contributions should not all be zero
        assert any(np.abs(v) > 0.1 for v in shap_values[0])  # at least one notable contribution

    def test_contrastive_explanations(self, sample_model_and_data):
        """Test contrastive explanations"""
        model, X, y = sample_model_and_data
        # Pick two samples to contrast
        instance_a = X[42]
        instance_b = X[24]
        # Predicted probabilities
        pred_a = model.predict_proba([instance_a])[0][1]
        pred_b = model.predict_proba([instance_b])[0][1]
        # Feature-wise differences
        diff = instance_a - instance_b
        abs_diff = np.abs(diff)
        # Global feature importances
        importances = model.feature_importances_
        # Contrastive scores
        contrast_scores = abs_diff * importances
        # Rank the features
        sorted_indices = np.argsort(contrast_scores)[::-1]
        # Print the contrastive explanation
        print("\nContrastive explanation:")
        print(f"Predicted probability, sample A: {pred_a:.4f}")
        print(f"Predicted probability, sample B: {pred_b:.4f}")
        print("Most important differing features:")
        for i in sorted_indices[:3]:  # top 3 differing features
            print(f"Feature {i}: difference={diff[i]:.4f}, importance={importances[i]:.4f}")
        # Sanity checks
        assert len(sorted_indices) == X.shape[1]
        assert contrast_scores[sorted_indices[0]] > 0
8.3 Global Interpretability Tests
import pytest
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
import shap
import dalex as dx

class TestGlobalInterpretability:
    """Tests for global interpretability"""
    @pytest.fixture
    def sample_model_and_data(self):
        # Create test data; shuffle=False keeps the informative features in
        # the first columns, which the assertions below rely on
        X, y = make_classification(
            n_samples=1000,
            n_features=5,  # few features, easier to interpret
            n_informative=3,
            n_redundant=1,
            n_classes=2,
            shuffle=False,
            random_state=42
        )
        # Use a DataFrame for more readable explanations
        X_df = pd.DataFrame(
            X,
            columns=[f"feature_{i}" for i in range(X.shape[1])]
        )
        # Train the model
        model = RandomForestClassifier(n_estimators=50, random_state=42)
        model.fit(X_df, y)
        return model, X_df, y

    def test_shap_summary_plot(self, sample_model_and_data):
        """Test the SHAP summary"""
        model, X, y = sample_model_and_data
        # Build the SHAP explainer
        explainer = shap.TreeExplainer(model)
        # Compute SHAP values
        shap_values = explainer.shap_values(X)
        if isinstance(shap_values, list):  # older SHAP: one array per class
            shap_values = shap_values[1]
        elif shap_values.ndim == 3:  # newer SHAP: (n_samples, n_features, n_classes)
            shap_values = shap_values[:, :, 1]
        # Produce the summary plot (the test only checks the data)
        shap.summary_plot(shap_values, X, show=False)
        # Global feature importance
        global_importance = np.mean(np.abs(shap_values), axis=0)
        sorted_idx = np.argsort(global_importance)[::-1]
        # Print the global importances
        print("\nGlobal SHAP feature importances:")
        for i in sorted_idx:
            print(f"{X.columns[i]}: {global_importance[i]:.4f}")
        # Informative features should rank high
        assert "feature_0" in [X.columns[i] for i in sorted_idx[:3]]
        assert "feature_1" in [X.columns[i] for i in sorted_idx[:3]]

    def test_partial_dependence(self, sample_model_and_data):
        """Test partial dependence plots (PDP)"""
        from sklearn.inspection import PartialDependenceDisplay
        model, X, y = sample_model_and_data
        # Analyze one feature
        feature = "feature_0"
        # Compute partial dependence
        display = PartialDependenceDisplay.from_estimator(
            model, X, features=[feature], kind="both"
        )
        # Extract the PDP data; each entry of pd_results is a Bunch with
        # "average" of shape (n_outputs, n_grid) and "individual" of shape
        # (n_outputs, n_samples, n_grid)
        pd_results = display.pd_results[0]
        pdp_values = pd_results["average"]
        ice_values = pd_results["individual"]
        # Print PDP information
        print(f"\nPartial dependence for feature '{feature}':")
        print(f"Average PDP range: {np.min(pdp_values):.4f} to {np.max(pdp_values):.4f}")
        print(f"Number of ICE curves: {ice_values.shape[1]}")
        # Sanity checks
        assert pdp_values.size > 0
        assert ice_values.shape[1] == len(X)

    def test_accumulated_local_effects(self, sample_model_and_data):
        """Test accumulated local effects (ALE)"""
        model, X, y = sample_model_and_data
        # Compute ALE with dalex
        exp = dx.Explainer(model, X, y)
        ale = exp.model_profile(type="accumulated")
        # Extract the ALE data
        ale_results = ale.result
        feature = "feature_0"
        ale_feature = ale_results[ale_results["_vname_"] == feature]
        # Print ALE information
        print(f"\nAccumulated local effects for feature '{feature}':")
        print(ale_feature[["_x_", "_yhat_"]].head())
        # Sanity checks
        assert len(ale_feature) > 0
        assert "_yhat_" in ale_feature.columns

    def test_global_surrogate(self, sample_model_and_data):
        """Test a global surrogate model"""
        from sklearn.tree import DecisionTreeClassifier
        model, X, y = sample_model_and_data
        # Predictions of the original model
        y_pred = model.predict(X)
        # Train the surrogate (a shallow decision tree) on those predictions
        surrogate = DecisionTreeClassifier(max_depth=3, random_state=42)
        surrogate.fit(X, y_pred)
        # Evaluate the surrogate
        surrogate_score = surrogate.score(X, y_pred)
        print(f"\nSurrogate accuracy: {surrogate_score:.4f}")
        # The surrogate should mimic the model reasonably well
        assert surrogate_score > 0.8
        # Surrogate feature importances
        surrogate_importance = surrogate.feature_importances_
        sorted_idx = np.argsort(surrogate_importance)[::-1]
        print("Surrogate feature importances:")
        for i in sorted_idx:
            print(f"{X.columns[i]}: {surrogate_importance[i]:.4f}")
        # Compare with the original model's importances
        original_importance = model.feature_importances_
        corr = np.corrcoef(original_importance, surrogate_importance)[0, 1]
        print(f"Correlation with the original model's importances: {corr:.4f}")
        assert corr > 0.5
9. Model Security Tests
9.1 Adversarial Attack Tests
import pytest
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from art.estimators.classification import SklearnClassifier
from art.attacks.evasion import (
    FastGradientMethod,
    CarliniL2Method,
    DeepFool
)

class TestAdversarialAttacks:
    """Tests for adversarial attacks"""
    @pytest.fixture
    def sample_model_and_data(self):
        # Create test data
        X, y = make_classification(
            n_samples=1000,
            n_features=10,
            n_classes=2,
            random_state=42
        )
        # Split first so the model never sees the test set; the training
        # split is also returned for the transferability test
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        # FGSM, C&W and DeepFool are gradient-based; ART exposes gradients
        # for logistic regression, so we use it instead of a tree ensemble
        model = LogisticRegression(max_iter=1000)
        model.fit(X_train, y_train)
        # Wrap in an ART classifier
        art_classifier = SklearnClassifier(model=model)
        return art_classifier, X_train, y_train, X_test, y_test

    def test_fgsm_attack(self, sample_model_and_data):
        """Test the FGSM attack"""
        classifier, _, _, X_test, y_test = sample_model_and_data
        # Clean accuracy
        y_pred = classifier.predict(X_test)
        orig_acc = np.mean(np.argmax(y_pred, axis=1) == y_test)
        print(f"Clean accuracy: {orig_acc:.4f}")
        # Build the FGSM attack
        attack = FastGradientMethod(
            estimator=classifier,
            eps=0.1  # perturbation budget
        )
        # Generate adversarial examples
        X_test_adv = attack.generate(x=X_test)
        # Predict on the adversarial examples
        y_pred_adv = classifier.predict(X_test_adv)
        adv_acc = np.mean(np.argmax(y_pred_adv, axis=1) == y_test)
        print(f"Adversarial accuracy: {adv_acc:.4f}")
        # Attack success rate
        attack_success = 1 - adv_acc / orig_acc
        print(f"Attack success rate: {attack_success:.2%}")
        # The attack should have an effect
        assert attack_success > 0.1  # at least 10% success
        # Check the perturbation size
        perturbation = np.mean(np.abs(X_test_adv - X_test))
        print(f"Mean perturbation: {perturbation:.6f}")
        assert perturbation > 0.001
        assert perturbation < 0.5

    def test_carlini_wagner_attack(self, sample_model_and_data):
        """Test the Carlini & Wagner attack"""
        classifier, _, _, X_test, y_test = sample_model_and_data
        # Clean accuracy
        y_pred = classifier.predict(X_test)
        orig_acc = np.mean(np.argmax(y_pred, axis=1) == y_test)
        # Build the C&W attack
        attack = CarliniL2Method(
            classifier=classifier,
            max_iter=10  # few iterations to keep the test fast
        )
        # Generate adversarial examples (first 10 samples only; C&W is expensive)
        X_test_adv = attack.generate(x=X_test[:10])
        # Predict on the adversarial examples
        y_pred_adv = classifier.predict(X_test_adv)
        adv_acc = np.mean(np.argmax(y_pred_adv, axis=1) == y_test[:10])
        attack_success = 1 - adv_acc / orig_acc
        print(f"C&W attack success rate: {attack_success:.2%}")
        # C&W is usually stronger
        assert attack_success > 0.3
        # Check the perturbation size
        perturbation = np.mean(np.abs(X_test_adv - X_test[:10]))
        print(f"Mean perturbation: {perturbation:.6f}")
        assert perturbation > 0.001
        assert perturbation < 0.5

    def test_deepfool_attack(self, sample_model_and_data):
        """Test the DeepFool attack"""
        classifier, _, _, X_test, y_test = sample_model_and_data
        # Clean accuracy
        y_pred = classifier.predict(X_test)
        orig_acc = np.mean(np.argmax(y_pred, axis=1) == y_test)
        # Build the DeepFool attack
        attack = DeepFool(
            classifier=classifier,
            max_iter=10  # few iterations to keep the test fast
        )
        # Generate adversarial examples (first 10 samples only)
        X_test_adv = attack.generate(x=X_test[:10])
        # Predict on the adversarial examples
        y_pred_adv = classifier.predict(X_test_adv)
        adv_acc = np.mean(np.argmax(y_pred_adv, axis=1) == y_test[:10])
        attack_success = 1 - adv_acc / orig_acc
        print(f"DeepFool attack success rate: {attack_success:.2%}")
        # The attack should have an effect
        assert attack_success > 0.2
        # Check the perturbation size
        perturbation = np.mean(np.abs(X_test_adv - X_test[:10]))
        print(f"Mean perturbation: {perturbation:.6f}")
        assert perturbation > 0.001
        assert perturbation < 0.5

    def test_attack_transferability(self, sample_model_and_data):
        """Test whether adversarial examples transfer to another model"""
        classifier, X_train, y_train, X_test, y_test = sample_model_and_data
        # Train a second, different model on the same training data
        model2 = RandomForestClassifier(n_estimators=100, random_state=42)
        model2.fit(X_train, y_train)
        classifier2 = SklearnClassifier(model=model2)
        # Generate adversarial examples against the first model
        attack = FastGradientMethod(classifier, eps=0.1)
        X_test_adv = attack.generate(x=X_test[:100])  # 100 samples only
        # Success rate on the original model
        y_pred_orig = classifier.predict(X_test_adv)
        y_pred_orig = np.argmax(y_pred_orig, axis=1)
        success_orig = 1 - np.mean(y_pred_orig == y_test[:100])
        # Success rate on the second model
        y_pred_transfer = classifier2.predict(X_test_adv)
        y_pred_transfer = np.argmax(y_pred_transfer, axis=1)
        success_transfer = 1 - np.mean(y_pred_transfer == y_test[:100])
        print(f"Success rate on the original model: {success_orig:.2%}")
        print(f"Success rate transferred to the second model: {success_transfer:.2%}")
        # Check transferability
        assert success_transfer > 0.5 * success_orig
9.2 Model Inversion Tests
import pytest
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

class TestModelInversion:
    """Tests for model inversion and extraction"""
    @pytest.fixture
    def sample_model_and_data(self):
        # Create test data
        X, y = make_classification(
            n_samples=1000,
            n_features=10,
            n_classes=2,
            random_state=42
        )
        # Split first so the model never sees the test set; the training
        # split is also returned, since the membership inference test needs it
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        return model, X_train, X_test, y_test

    def test_model_extraction(self, sample_model_and_data):
        """Test a model extraction attack"""
        model, _, X_test, y_test = sample_model_and_data
        # Simulated attacker: train a surrogate from a limited number of queries
        n_queries = 100  # query budget of the attacker
        X_queries = X_test[:n_queries]
        y_queries = model.predict(X_queries)
        # Train the surrogate
        surrogate = RandomForestClassifier(n_estimators=50, random_state=42)
        surrogate.fit(X_queries, y_queries)
        # Measure agreement between the surrogate and the original model
        y_pred_orig = model.predict(X_test)
        y_pred_surr = surrogate.predict(X_test)
        agreement = np.mean(y_pred_orig == y_pred_surr)
        print(f"Surrogate/original prediction agreement: {agreement:.2%}")
        # The surrogate should reach some level of similarity
        assert agreement > 0.7

    def test_membership_inference(self, sample_model_and_data):
        """Test a membership inference attack"""
        model, X_train, X_test, y_test = sample_model_and_data
        # The attacker is assumed to hold some training samples and some
        # held-out samples drawn from the same distribution
        n_samples = min(100, len(X_train), len(X_test))
        # Build a mixed set: half training data, half held-out data
        X_mixed = np.vstack([
            X_train[:n_samples//2],
            X_test[:n_samples//2]
        ])
        membership_labels = np.array(
            [1] * (n_samples//2) + [0] * (n_samples//2)
        )
        # Use the model's confidence as the attack feature
        pred_confidences = model.predict_proba(X_mixed)[:, 1]
        # Train the membership inference classifier
        from sklearn.linear_model import LogisticRegression
        attack_model = LogisticRegression()
        attack_model.fit(pred_confidences.reshape(-1, 1), membership_labels)
        # Evaluate the attack
        attack_acc = attack_model.score(
            pred_confidences.reshape(-1, 1), membership_labels
        )
        print(f"Membership inference accuracy: {attack_acc:.2%}")
        # The attack should beat random guessing
        assert attack_acc > 0.6

    def test_attribute_inference(self, sample_model_and_data):
        """Test an attribute inference attack"""
        model, _, X_test, y_test = sample_model_and_data
        # The attacker wants to infer a sensitive feature (say, feature 0)
        sensitive_feature_idx = 0
        sensitive_feature = X_test[:, sensitive_feature_idx]
        # The remaining features are assumed known
        known_features = np.delete(X_test, sensitive_feature_idx, axis=1)
        # The model's predictions provide extra signal
        predictions = model.predict_proba(X_test)[:, 1]
        # Combine known features and predictions
        attack_features = np.column_stack([known_features, predictions])
        # Train the attribute inference model
        from sklearn.ensemble import GradientBoostingRegressor
        attack_model = GradientBoostingRegressor()
        attack_model.fit(attack_features, sensitive_feature)
        # Evaluate the attack
        pred_sensitive = attack_model.predict(attack_features)
        correlation = np.corrcoef(pred_sensitive, sensitive_feature)[0, 1]
        print(f"Attribute inference correlation: {correlation:.4f}")
        # Some correlation should exist
        assert abs(correlation) > 0.3
9.3 数据投毒测试
import pytest
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from art.attacks.poisoning import PoisoningAttackSVM
class TestDataPoisoning:
"""测试数据投毒攻击"""
@pytest.fixture
def sample_model_and_data(self):
# 创建测试数据
X, y = make_classification(
n_samples=1000,
n_features=10,
n_classes=2,
random_state=42
)
# 分割训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=42
)
return X_train, X_test, y_train, y_test
def test_label_flipping_attack(self, sample_model_and_data):
"""测试标签翻转攻击"""
X_train, X_test, y_train, y_test = sample_model_and_data
# 原始模型
model_clean = RandomForestClassifier(n_estimators=50, random_state=42)
model_clean.fit(X_train, y_train)
clean_acc = model_clean.score(X_test, y_test)
print(f"干净模型准确率: {clean_acc:.4f}")
# 实施标签翻转攻击(翻转20%的训练标签)
flip_indices = np.random.choice(
len(y_train), size=int(0.2 * len(y_train)), replace=False
)
y_train_poisoned = y_train.copy()
y_train_poisoned[flip_indices] = 1 - y_train_poisoned[flip_indices]
# 投毒后模型
model_poisoned = RandomForestClassifier(n_estimators=50, random_state=42)
model_poisoned.fit(X_train, y_train_poisoned)
poisoned_acc = model_poisoned.score(X_test, y_test)
print(f"投毒模型准确率: {poisoned_acc:.4f}")
# 验证攻击效果
assert (clean_acc - poisoned_acc) > 0.05 # 准确率应下降
def test_poisoning_svm_attack(self, sample_model_and_data):
"""测试SVM投毒攻击"""
X_train, X_test, y_train, y_test = sample_model_and_data
# PoisoningAttackSVM只支持SVM估计器,这里改用线性SVC(随机森林无法作为该攻击的目标模型)
from sklearn.svm import SVC
from art.estimators.classification.scikitlearn import ScikitlearnSVC
# ART的SVM投毒攻击使用one-hot标签
y_train_oh = np.eye(2)[y_train]
y_test_oh = np.eye(2)[y_test]
model = SVC(kernel="linear")
model.fit(X_train, y_train)
classifier = ScikitlearnSVC(model=model)
# 创建投毒攻击
attack = PoisoningAttackSVM(
classifier=classifier,
x_train=X_train,
y_train=y_train_oh,
x_val=X_test,
y_val=y_test_oh,
step=0.1,
eps=1.0,
max_iter=10
)
# 生成投毒样本(目标是将正类误分类为负类)
target_class = 0
poison_percentage = 0.1 # 10%投毒样本
n_poison = int(poison_percentage * len(X_train))
# 选择要投毒的样本(正类样本)
pos_indices = np.where(y_train == 1)[0]
poison_indices = np.random.choice(
pos_indices, size=min(n_poison, len(pos_indices)), replace=False
)
# 生成投毒样本(ART需要one-hot形式的目标标签)
x_poison, y_poison = attack.poison(
x=X_train[poison_indices],
y=np.eye(2)[np.full(len(poison_indices), target_class)]
)
# 创建投毒训练集(y_poison为one-hot,需转回类别标签)
X_train_poisoned = np.vstack([X_train, x_poison])
y_train_poisoned = np.concatenate([y_train, np.argmax(y_poison, axis=1)])
# 用投毒数据重新训练同类模型(SVM)
model_poisoned = SVC(kernel="linear")
model_poisoned.fit(X_train_poisoned, y_train_poisoned)
# 评估攻击效果
# 在原始正类样本上的误分类率
pos_test_indices = np.where(y_test == 1)[0]
y_pred = model_poisoned.predict(X_test[pos_test_indices])
misclassification_rate = np.mean(y_pred != y_test[pos_test_indices])
print(f"正类样本误分类率: {misclassification_rate:.2%}")
# 验证攻击效果
assert misclassification_rate > 0.3 # 误分类率应显著增加
def test_backdoor_attack(self, sample_model_and_data):
"""测试后门攻击"""
X_train, X_test, y_train, y_test = sample_model_and_data
# 创建后门样本(修改特定特征并改变标签)
backdoor_indices = np.random.choice(
len(X_train), size=int(0.1 * len(X_train)), replace=False
)
X_train_poisoned = X_train.copy()
y_train_poisoned = y_train.copy()
# 添加后门触发器(将前两个特征设置为极端值)
X_train_poisoned[backdoor_indices, 0] = 10.0
X_train_poisoned[backdoor_indices, 1] = -10.0
# 翻转这些样本的标签
y_train_poisoned[backdoor_indices] = 1 - y_train_poisoned[backdoor_indices]
# 训练投毒模型
model_poisoned = RandomForestClassifier(n_estimators=50, random_state=42)
model_poisoned.fit(X_train_poisoned, y_train_poisoned)
# 评估攻击效果
# 1. 在干净测试集上的准确率
clean_acc = model_poisoned.score(X_test, y_test)
print(f"干净测试集准确率: {clean_acc:.4f}")
# 2. 在后门测试集上的攻击成功率
X_test_backdoor = X_test.copy()
X_test_backdoor[:, 0] = 10.0 # 激活后门
X_test_backdoor[:, 1] = -10.0
y_pred_backdoor = model_poisoned.predict(X_test_backdoor)
backdoor_success = np.mean(
y_pred_backdoor != y_test # 标签已被翻转
)
print(f"后门攻击成功率: {backdoor_success:.2%}")
# 验证攻击效果
assert clean_acc > 0.7 # 保持干净样本的性能
assert backdoor_success > 0.5 # 后门攻击成功率应高
10. 端到端系统测试
10.1 模型部署测试
import pytest
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
import requests
import json
from unittest.mock import MagicMock
class TestModelDeployment:
"""测试模型部署"""
@pytest.fixture
def sample_model_and_data(self):
# 创建测试数据
X, y = make_classification(
n_samples=100,
n_features=5,
n_classes=2,
random_state=42
)
# 训练模型
model = RandomForestClassifier(n_estimators=10, random_state=42)
model.fit(X, y)
return model, X, y
def test_api_predictions(self, sample_model_and_data, mocker):
"""测试API预测端点"""
model, X, y = sample_model_and_data
# 模拟API响应
mock_response = MagicMock()
mock_response.status_code = 200
mock_response.json.return_value = {
"predictions": model.predict(X[:5]).tolist()
}
# 使用mocker替换requests.post
mocker.patch('requests.post', return_value=mock_response)
# 测试API调用
test_data = X[:5].tolist()
response = requests.post(
"http://example.com/predict",
json={"data": test_data}
)
# 验证响应
assert response.status_code == 200
predictions = response.json()["predictions"]
assert len(predictions) == 5
assert all(isinstance(p, (int, np.integer)) for p in predictions)
# 验证mock被调用
requests.post.assert_called_once()
call_args = requests.post.call_args[1]
assert "json" in call_args
assert call_args["json"]["data"] == test_data
def test_batch_processing(self, sample_model_and_data, mocker):
"""测试批量处理能力"""
model, X, y = sample_model_and_data
# 模拟批量API响应
mock_response = MagicMock()
mock_response.status_code = 200
mock_response.json.return_value = {
"predictions": model.predict(X).tolist()
}
mocker.patch('requests.post', return_value=mock_response)
# 大批量数据(大于通常的API限制)
large_data = np.vstack([X] * 10) # 1000个样本
# 测试API调用
response = requests.post(
"http://example.com/predict",
json={"data": large_data.tolist()}
)
# 验证响应
assert response.status_code == 200
predictions = response.json()["predictions"]
assert len(predictions) == len(large_data)
def test_error_handling(self, mocker):
"""测试错误处理"""
# 模拟错误响应
mock_response = MagicMock()
mock_response.status_code = 400
mock_response.json.return_value = {
"error": "Invalid input data"
}
mocker.patch('requests.post', return_value=mock_response)
# 发送无效数据
invalid_data = [{"invalid": "data"}]
response = requests.post(
"http://example.com/predict",
json={"data": invalid_data}
)
# 验证错误响应
assert response.status_code == 400
assert "error" in response.json()
def test_model_versions(self, sample_model_and_data, mocker):
"""测试模型版本控制"""
model, X, y = sample_model_and_data
# 模拟版本API响应
mock_response = MagicMock()
mock_response.status_code = 200
mock_response.json.return_value = {
"model_version": "1.0.0",
"predictions": model.predict(X[:1]).tolist()
}
mocker.patch('requests.post', return_value=mock_response)
# 测试带版本头的API调用
headers = {"X-Model-Version": "1.0.0"}
response = requests.post(
"http://example.com/predict",
json={"data": X[:1].tolist()},
headers=headers
)
# 验证响应
assert response.status_code == 200
assert response.json()["model_version"] == "1.0.0"
def test_performance_metrics(self, sample_model_and_data, mocker):
"""测试性能指标收集"""
model, X, y = sample_model_and_data
# 模拟带性能指标的API响应
mock_response = MagicMock()
mock_response.status_code = 200
mock_response.json.return_value = {
"predictions": model.predict(X[:1]).tolist(),
"metrics": {
"inference_time_ms": 12.34,
"model_load_time_s": 1.23
}
}
mocker.patch('requests.post', return_value=mock_response)
# 测试API调用
response = requests.post(
"http://example.com/predict",
json={"data": X[:1].tolist()}
)
# 验证性能指标
assert response.status_code == 200
assert "metrics" in response.json()
metrics = response.json()["metrics"]
assert "inference_time_ms" in metrics
assert "model_load_time_s" in metrics
10.2 模型监控测试
import pytest
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime, timedelta
class TestModelMonitoring:
"""测试模型监控"""
@pytest.fixture
def sample_prediction_data(self):
# 创建带时间戳的预测数据(60天,保证统计检验有足够样本;原来15+15天过少,漂移检验不稳定)
np.random.seed(42)
now = datetime.now()
dates = [now - timedelta(days=59 - i) for i in range(60)]  # 按时间升序
# 生成预测数据(后半段预测分布明显漂移)
preds = np.concatenate([
np.random.binomial(1, 0.9, 30),  # 前30天良好性能
np.random.binomial(1, 0.4, 30)  # 后30天性能下降
])
# 创建DataFrame(简化:真值取全1,准确率即预测为1的比例)
df = pd.DataFrame({
"date": dates,
"prediction": preds,
"ground_truth": np.ones(60, dtype=int)
})
return df
def test_concept_drift_detection(self, sample_prediction_data):
"""测试概念漂移检测"""
from alibi_detect.cd import KSDrift
df = sample_prediction_data
# 分割参考数据和测试数据
ref_data = df[df["date"] < df["date"].median()]["prediction"].values
test_data = df[df["date"] >= df["date"].median()]["prediction"].values
# 初始化漂移检测器
cd = KSDrift(
x_ref=ref_data.reshape(-1, 1),
p_val=0.05
)
# 检测漂移
preds = cd.predict(
x=test_data.reshape(-1, 1),
return_p_val=True,
return_distance=True
)
# 获取结果(按特征返回,p值与距离为长度1的数组)
is_drift = preds["data"]["is_drift"]
p_val = preds["data"]["p_val"][0]
distance = preds["data"]["distance"][0]
print(f"检测到漂移: {is_drift}")
print(f"p值: {p_val:.4f}")
print(f"距离: {distance:.4f}")
# 验证漂移检测
assert is_drift
def test_performance_degradation(self, sample_prediction_data):
"""测试性能下降检测"""
df = sample_prediction_data
# 计算滚动准确率
df["correct"] = (df["prediction"] == df["ground_truth"]).astype(int)
df["rolling_acc"] = df["correct"].rolling(window=7).mean()
# 检测性能下降
initial_acc = df["rolling_acc"].iloc[:7].mean()
final_acc = df["rolling_acc"].iloc[-7:].mean()
degradation = initial_acc - final_acc
print(f"初始准确率: {initial_acc:.4f}")
print(f"最终准确率: {final_acc:.4f}")
print(f"下降幅度: {degradation:.4f}")
# 验证性能下降
assert degradation > 0.1
def test_data_drift_dashboard(self):
"""测试数据漂移仪表板"""
from evidently.test_suite import TestSuite
from evidently.tests import TestFeatureValueDrift
from sklearn.datasets import make_classification
# 本类未定义sample_model_and_data夹具,这里直接生成特征数据
X, _ = make_classification(
n_samples=1000, n_features=10, n_classes=2, random_state=42
)
# 创建参考数据和当前数据
X_ref = pd.DataFrame(X[:500], columns=[f"f_{i}" for i in range(X.shape[1])])
X_curr = pd.DataFrame(X[500:], columns=[f"f_{i}" for i in range(X.shape[1])])
# 人为引入数据漂移(修改部分特征)
X_curr["f_0"] = X_curr["f_0"] * 1.5 + 0.1
# 创建测试套件
data_drift_suite = TestSuite(tests=[
TestFeatureValueDrift(column_name=f"f_{i}") for i in range(3)
])
# 运行测试
data_drift_suite.run(
reference_data=X_ref,
current_data=X_curr
)
# 获取测试结果
test_results = []
for test in data_drift_suite.as_dict()["tests"]:
test_results.append({
"feature": test["parameters"]["column_name"],
"drift_detected": test["status"] == "FAIL"
})
# 打印结果
print("\n数据漂移检测结果:")
for res in test_results:
print(f"{res['feature']}: 漂移检测到={'是' if res['drift_detected'] else '否'}")
# 验证漂移检测
assert any(res["drift_detected"] for res in test_results)
def test_prediction_distribution(self, sample_prediction_data):
"""测试预测分布监控"""
df = sample_prediction_data
# 计算预测分布变化
early_preds = df[df["date"] < df["date"].median()]["prediction"]
late_preds = df[df["date"] >= df["date"].median()]["prediction"]
early_mean = early_preds.mean()
late_mean = late_preds.mean()
change = abs(late_mean - early_mean)
print(f"早期预测均值: {early_mean:.4f}")
print(f"近期预测均值: {late_mean:.4f}")
print(f"变化幅度: {change:.4f}")
# 验证分布变化
assert change > 0.1
10.3 模型回归测试
import pytest
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
import mlflow
class TestModelRegression:
"""测试模型回归"""
@pytest.fixture
def sample_model_and_data(self):
# 创建测试数据
X, y = make_classification(
n_samples=1000,
n_features=10,
n_classes=2,
random_state=42
)
# 训练模型
model = RandomForestClassifier(n_estimators=50, random_state=42)
model.fit(X, y)
return model, X, y
def test_performance_regression(self, sample_model_and_data):
"""测试性能回归"""
model, X, y = sample_model_and_data
# 当前模型性能
current_acc = model.score(X, y)
# 模拟"之前"的性能(假设更好)
previous_acc = current_acc + 0.05
print(f"当前准确率: {current_acc:.4f}")
print(f"之前准确率: {previous_acc:.4f}")
print(f"性能下降: {previous_acc - current_acc:.4f}")
# 验证性能下降
assert (previous_acc - current_acc) > 0.03
def test_mlflow_model_tracking(self, sample_model_and_data, tmp_path):
"""测试MLflow模型跟踪"""
model, X, y = sample_model_and_data
# 设置MLflow跟踪
mlflow.set_tracking_uri(f"file:{tmp_path}/mlruns")
# 记录当前模型
with mlflow.start_run():
mlflow.sklearn.log_model(model, "model")
mlflow.log_metric("accuracy", model.score(X, y))
# 记录特征重要性
for i, imp in enumerate(model.feature_importances_):
mlflow.log_metric(f"feature_{i}_importance", imp)
run_id = mlflow.active_run().info.run_id
# 验证模型已记录
client = mlflow.tracking.MlflowClient()
run = client.get_run(run_id)
assert "model" in run.info.artifact_uri
assert "accuracy" in run.data.metrics
assert any("feature_" in key for key in run.data.metrics)
def test_model_comparison(self, sample_model_and_data):
"""测试模型比较"""
model, X, y = sample_model_and_data
# 训练两个不同版本的模型
model_v1 = RandomForestClassifier(
n_estimators=50, random_state=42
)
model_v1.fit(X, y)
model_v2 = RandomForestClassifier(
n_estimators=100, max_depth=5, random_state=42
)
model_v2.fit(X, y)
# 比较性能
acc_v1 = model_v1.score(X, y)
acc_v2 = model_v2.score(X, y)
print(f"模型v1准确率: {acc_v1:.4f}")
print(f"模型v2准确率: {acc_v2:.4f}")
# 比较特征重要性
imp_v1 = model_v1.feature_importances_
imp_v2 = model_v2.feature_importances_
imp_corr = np.corrcoef(imp_v1, imp_v2)[0, 1]
print(f"特征重要性相关性: {imp_corr:.4f}")
# 验证比较结果
assert abs(acc_v1 - acc_v2) < 0.1
assert imp_corr > 0.7
def test_threshold_optimization(self, sample_model_and_data):
"""测试阈值优化"""
model, X, y = sample_model_and_data
# 获取预测概率
y_proba = model.predict_proba(X)[:, 1]
# 测试不同阈值
thresholds = np.linspace(0.1, 0.9, 9)
results = []
for thresh in thresholds:
y_pred = (y_proba >= thresh).astype(int)
acc = np.mean(y_pred == y)
results.append((thresh, acc))
print(f"阈值={thresh:.2f}, 准确率={acc:.4f}")
# 找到最佳阈值
best_thresh, best_acc = max(results, key=lambda x: x[1])
print(f"最佳阈值: {best_thresh:.2f}, 最佳准确率: {best_acc:.4f}")
# 验证最佳阈值合理性
assert 0.3 <= best_thresh <= 0.7
assert best_acc > 0.8
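除网格搜索外,也可以直接从ROC曲线上选阈值,例如取Youden指数(TPR-FPR)最大的点。下面是一个基于sklearn的简单示意,youden_threshold为本文引入的假设函数名,y与y_proba含义同上:
# 用ROC曲线选择分类阈值的示意
import numpy as np
from sklearn.metrics import roc_curve

def youden_threshold(y_true, y_proba):
    """返回使TPR-FPR最大的阈值(Youden指数)"""
    fpr, tpr, thresholds = roc_curve(y_true, y_proba)
    return thresholds[np.argmax(tpr - fpr)]
两种做法通常给出接近的结果;ROC方式不依赖阈值网格的粒度,但只适用于能输出概率或分数的模型。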
11. 测试自动化框架
11.1 测试流水线设计
import pytest
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
import mlflow
import json
from datetime import datetime
class TestAutomationFramework:
"""测试自动化框架"""
@pytest.fixture
def sample_model_and_data(self):
# 创建测试数据
X, y = make_classification(
n_samples=1000,
n_features=10,
n_classes=2,
random_state=42
)
# 训练模型
model = RandomForestClassifier(n_estimators=50, random_state=42)
model.fit(X, y)
return model, X, y
def test_pipeline_execution(self, sample_model_and_data, tmp_path):
"""测试端到端测试流水线执行"""
model, X, y = sample_model_and_data
# 创建测试结果目录
results_dir = tmp_path / "results"
results_dir.mkdir()
# 模拟测试流水线
test_results = {
"timestamp": datetime.now().isoformat(),
"model_type": "RandomForestClassifier",
"tests": []
}
# 1. 基础功能测试
acc = model.score(X, y)
test_results["tests"].append({
"name": "basic_accuracy",
"status": "passed" if acc > 0.8 else "failed",
"metric": acc,
"threshold": 0.8
})
# 2. 性能测试
import time
start = time.time()
model.predict(X[:100])
inference_time = time.time() - start
test_results["tests"].append({
"name": "inference_speed",
"status": "passed" if inference_time < 1.0 else "failed",
"metric": inference_time,
"threshold": 1.0,
"unit": "seconds"
})
# 3. 公平性测试
from fairlearn.metrics import demographic_parity_difference
sensitive_feature = np.random.randint(0, 2, size=len(y))
y_pred = model.predict(X)
dp_diff = demographic_parity_difference(
y, y_pred, sensitive_features=sensitive_feature
)
test_results["tests"].append({
"name": "fairness_dp_diff",
"status": "passed" if abs(dp_diff) < 0.2 else "failed",
"metric": dp_diff,
"threshold": 0.2
})
# 保存测试结果
results_file = results_dir / "test_results.json"
with open(results_file, "w") as f:
json.dump(test_results, f, indent=2)
# 验证测试结果文件
assert results_file.exists()
loaded_results = json.loads(results_file.read_text())
assert len(loaded_results["tests"]) == 3
assert any(t["name"] == "basic_accuracy" for t in loaded_results["tests"])
def test_mlflow_integration(self, sample_model_and_data, tmp_path):
"""测试MLflow集成"""
mlflow.set_tracking_uri(f"file:{tmp_path}/mlruns")
model, X, y = sample_model_and_data
with mlflow.start_run():
# 记录模型
mlflow.sklearn.log_model(model, "model")
# 运行测试并记录结果
acc = model.score(X, y)
mlflow.log_metric("accuracy", acc)
# 记录测试状态
mlflow.log_metric("test_status", 1 if acc > 0.8 else 0)
# 添加测试标签
mlflow.set_tag("test_result", "passed" if acc > 0.8 else "failed")
# 验证MLflow记录
client = mlflow.tracking.MlflowClient()
runs = client.search_runs(experiment_ids=["0"])
assert len(runs) == 1
assert "accuracy" in runs[0].data.metrics
assert "test_result" in runs[0].data.tags
def test_ci_cd_integration(self, mocker):
"""测试CI/CD集成"""
# 模拟CI/CD环境变量
mocker.patch.dict("os.environ", {
"CI": "true",
"GIT_COMMIT": "abc123",
"BUILD_NUMBER": "42"
})
# 模拟测试结果
test_results = {
"commit": "abc123",
"build": "42",
"tests_passed": 5,
"tests_failed": 1
}
# 模拟CI/CD API调用
mock_post = mocker.patch("requests.post")
mock_response = mocker.MagicMock()
mock_response.status_code = 200
mock_post.return_value = mock_response
# 发送测试结果
import requests
response = requests.post(
"http://ci-cd-server/results",
json=test_results
)
# 验证调用
mock_post.assert_called_once_with(
"http://ci-cd-server/results",
json=test_results
)
assert response.status_code == 200
def test_alerting_system(self, mocker):
"""测试告警系统"""
# 模拟测试失败
test_results = {
"test_name": "fairness_test",
"status": "failed",
"metric": 0.25,
"threshold": 0.2
}
# 模拟SMTP客户端(需patch整个SMTP类,否则with语句会真实建立连接)
mock_smtp = mocker.patch("smtplib.SMTP")
mock_server = mock_smtp.return_value.__enter__.return_value
# 发送告警
from smtplib import SMTP
with SMTP("localhost") as smtp:
    message = f"""\
Subject: Test Failed: {test_results["test_name"]}

Test {test_results["test_name"]} failed.
Metric value: {test_results["metric"]}
Threshold: {test_results["threshold"]}
"""
    smtp.sendmail(
        "alerts@example.com",
        "team@example.com",
        message
    )
# 验证邮件发送
mock_server.sendmail.assert_called_once()
args, _ = mock_server.sendmail.call_args
assert "team@example.com" in args[1]
assert "Test Failed" in args[2]
11.2 测试报告生成
import pytest
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from jinja2 import Template
import matplotlib.pyplot as plt
class TestReporting:
"""测试报告生成"""
@pytest.fixture
def sample_test_results(self):
# 创建示例测试结果
return {
"model_info": {
"type": "RandomForestClassifier",
"timestamp": "2023-01-01T12:00:00",
"version": "1.0.0"
},
"metrics": {
"accuracy": 0.85,
"precision": 0.83,
"recall": 0.87,
"f1": 0.85,
"inference_time": 0.12
},
"fairness": {
"demographic_parity_diff": 0.15,
"equalized_odds_diff": 0.18
},
"robustness": {
"noise_accuracy": 0.82,
"adversarial_accuracy": 0.75
},
"passed_tests": 8,
"failed_tests": 2
}
def test_html_report_generation(self, sample_test_results, tmp_path):
"""测试HTML报告生成"""
# 创建Jinja2模板
template_str = """
<!DOCTYPE html>
<html>
<head>
<title>Model Test Report</title>
</head>
<body>
<h1>Model Test Report</h1>
<p>Model: {{ model_info.type }} v{{ model_info.version }}</p>
<p>Date: {{ model_info.timestamp }}</p>
<h2>Summary</h2>
<p>Passed tests: {{ passed_tests }}</p>
<p>Failed tests: {{ failed_tests }}</p>
<h2>Metrics</h2>
<ul>
<li>Accuracy: {{ metrics.accuracy|round(4) }}</li>
<li>Precision: {{ metrics.precision|round(4) }}</li>
<li>Recall: {{ metrics.recall|round(4) }}</li>
<li>F1: {{ metrics.f1|round(4) }}</li>
</ul>
</body>
</html>
"""
template = Template(template_str)
# 渲染报告
html_report = template.render(**sample_test_results)
# 保存报告
report_file = tmp_path / "report.html"
report_file.write_text(html_report)
# 验证报告
assert report_file.exists()
content = report_file.read_text()
assert "Model Test Report" in content
assert str(sample_test_results["metrics"]["accuracy"])[:4] in content
def test_metrics_visualization(self, sample_test_results, tmp_path):
"""测试指标可视化"""
# 创建指标图表
fig, ax = plt.subplots(figsize=(10, 5))
metrics = sample_test_results["metrics"]
bars = ax.bar(
list(metrics.keys()),
list(metrics.values()),
color=['blue', 'green', 'orange', 'red', 'purple']
)
ax.set_title("Model Performance Metrics")
ax.set_ylabel("Score")
# 保存图表
plot_file = tmp_path / "metrics_plot.png"
fig.savefig(plot_file)
plt.close(fig)
# 验证图表文件
assert plot_file.exists()
assert plot_file.stat().st_size > 0
def test_failure_analysis(self, sample_test_results):
"""测试失败分析"""
# 识别失败测试
failures = []
# 检查公平性指标
if sample_test_results["fairness"]["demographic_parity_diff"] > 0.2:
failures.append({
"test": "demographic_parity",
"value": sample_test_results["fairness"]["demographic_parity_diff"],
"threshold": 0.2
})
# 检查鲁棒性指标
if sample_test_results["robustness"]["adversarial_accuracy"] < 0.8:
failures.append({
"test": "adversarial_robustness",
"value": sample_test_results["robustness"]["adversarial_accuracy"],
"threshold": 0.8
})
# 验证失败分析
assert len(failures) == 1 # 根据样本数据,只有对抗鲁棒性会失败
assert failures[0]["test"] == "adversarial_robustness"
def test_comparison_report(self, sample_test_results):
"""测试比较报告"""
# 创建两个版本的测试结果(嵌套字典必须深拷贝,浅拷贝会共享内层metrics导致差异恒为0)
import copy
old_results = copy.deepcopy(sample_test_results)
new_results = copy.deepcopy(sample_test_results)
# 修改新结果(模拟改进)
new_results["metrics"]["accuracy"] = 0.88
new_results["metrics"]["f1"] = 0.87
new_results["failed_tests"] = 1
# 计算差异
comparison = {
"accuracy_diff": new_results["metrics"]["accuracy"] - old_results["metrics"]["accuracy"],
"f1_diff": new_results["metrics"]["f1"] - old_results["metrics"]["f1"],
"fixed_tests": old_results["failed_tests"] - new_results["failed_tests"]
}
# 验证比较结果
assert comparison["accuracy_diff"] == pytest.approx(0.03)
assert comparison["fixed_tests"] == 1
11.3 测试调度与触发
import pytest
import time
from unittest.mock import MagicMock, patch
from datetime import datetime, timedelta
class TestScheduling:
"""测试调度与触发"""
def test_cron_scheduling(self, mocker):
"""测试Cron调度"""
# 模拟调度器
mock_scheduler = mocker.MagicMock()
mocker.patch("apscheduler.schedulers.background.BackgroundScheduler",
return_value=mock_scheduler)
# 导入并创建调度器
from apscheduler.schedulers.background import BackgroundScheduler
scheduler = BackgroundScheduler()
# 添加Cron作业
def test_job():
print("Running scheduled tests...")
scheduler.add_job(
test_job,
'cron',
hour=3,
minute=30,
day_of_week='mon-fri'
)
# 验证调度器调用
mock_scheduler.add_job.assert_called_once()
args, kwargs = mock_scheduler.add_job.call_args
assert args[1] == 'cron'  # trigger以位置参数传入,不在kwargs中
assert kwargs['hour'] == 3
assert kwargs['minute'] == 30
def test_event_trigger(self, mocker):
"""测试事件触发"""
# 创建模拟事件总线
event_bus = mocker.MagicMock()
# 注册测试事件处理器
def handle_model_update(event):
print(f"Running tests for model {event['model_id']}")
return {"status": "success"}
event_bus.subscribe.return_value = handle_model_update
# 模拟事件发布
test_event = {
"type": "model_updated",
"model_id": "model_123",
"timestamp": datetime.now().isoformat()
}
# 触发事件
handler = event_bus.subscribe("model_updated")
result = handler(test_event)
# 验证事件处理
event_bus.subscribe.assert_called_once_with("model_updated")
assert result["status"] == "success"
def test_webhook_trigger(self):
"""测试Webhook触发"""
# 用真实Flask应用和测试客户端验证Webhook端点(原mock写法无法产生可断言的响应)
from flask import Flask, jsonify, request
app = Flask(__name__)
received = {}
@app.route("/webhook", methods=["POST"])
def webhook():
    payload = request.get_json()
    received.update(payload)
    return jsonify({"status": "ok"})
# 调用Webhook
client = app.test_client()
response = client.post("/webhook", json={
"event": "model_deployed",
"model_version": "1.2.3"
})
# 验证Webhook触发
assert response.status_code == 200
assert response.get_json()["status"] == "ok"
assert received["event"] == "model_deployed"
assert received["model_version"] == "1.2.3"
def test_conditional_triggering(self, mocker):
"""测试条件触发"""
# 模拟监控数据
monitor_data = {
"accuracy": 0.82,
"throughput": 150,
"last_updated": datetime.now() - timedelta(hours=1)
}
# 定义触发条件
def should_trigger_tests(data):
return (
data["accuracy"] < 0.85 or
data["throughput"] < 100 or
datetime.now() - data["last_updated"] > timedelta(days=1)
)
# 测试触发条件
trigger = should_trigger_tests(monitor_data)
# 验证触发
assert trigger is True # 因为accuracy < 0.85
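上面的调度测试对BackgroundScheduler做了mock;作为对照,下面是一段未经mock的APScheduler用法示意,cron配置与测试中一致,run_all_tests为假设的测试入口函数:
# 用APScheduler定时执行测试套件的示意(非mock)
from apscheduler.schedulers.blocking import BlockingScheduler

def run_all_tests():
    # 假设的入口:实际项目中可在这里调用pytest.main(["tests/"])等
    print("Running scheduled model tests...")

scheduler = BlockingScheduler()
# 工作日凌晨3:30运行
scheduler.add_job(run_all_tests, "cron", day_of_week="mon-fri", hour=3, minute=30)

if __name__ == "__main__":
    scheduler.start()  # 阻塞运行,Ctrl+C退出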
12. 评分标准与评估体系
12.1 性能评分标准
import pytest
import numpy as np
class TestPerformanceScoring:
"""测试性能评分标准"""
@pytest.fixture
def sample_metrics(self):
return {
"accuracy": 0.85,
"precision": 0.83,
"recall": 0.87,
"f1": 0.85,
"inference_time": 0.12,
"throughput": 150
}
def test_accuracy_scoring(self, sample_metrics):
"""测试准确率评分"""
accuracy = sample_metrics["accuracy"]
# 定义评分标准
if accuracy >= 0.9:
score = 5
elif accuracy >= 0.8:
score = 4
elif accuracy >= 0.7:
score = 3
elif accuracy >= 0.6:
score = 2
else:
score = 1
print(f"准确率: {accuracy:.2f}, 得分: {score}")
# 验证评分
assert score == 4
def test_latency_scoring(self, sample_metrics):
"""测试延迟评分"""
latency = sample_metrics["inference_time"] * 1000 # 转换为毫秒
# 定义评分标准
if latency <= 10:
score = 5
elif latency <= 50:
score = 4
elif latency <= 100:
score = 3
elif latency <= 500:
score = 2
else:
score = 1
print(f"延迟: {latency:.2f}ms, 得分: {score}")
# 验证评分
assert score == 4 # 45ms落在(10, 50]分档
def test_throughput_scoring(self, sample_metrics):
"""测试吞吐量评分"""
throughput = sample_metrics["throughput"] # 样本/秒
# 定义评分标准
if throughput >= 500:
score = 5
elif throughput >= 200:
score = 4
elif throughput >= 100:
score = 3
elif throughput >= 50:
score = 2
else:
score = 1
print(f"吞吐量: {throughput}样本/秒, 得分: {score}")
# 验证评分
assert score == 3
def test_composite_score(self, sample_metrics):
"""测试综合评分"""
# 计算各维度得分
accuracy_score = 4 # 来自test_accuracy_scoring
latency_score = 4 # 来自test_latency_scoring
throughput_score = 3 # 来自test_throughput_scoring
# 定义权重
weights = {
"accuracy": 0.5,
"latency": 0.3,
"throughput": 0.2
}
# 计算加权得分
composite_score = (
accuracy_score * weights["accuracy"] +
latency_score * weights["latency"] +
throughput_score * weights["throughput"]
)
print(f"综合得分: {composite_score:.2f}")
# 验证评分
expected_score = 4 * 0.5 + 4 * 0.3 + 3 * 0.2
assert composite_score == pytest.approx(expected_score)
def test_performance_benchmarking(self, sample_metrics):
"""测试性能基准比较"""
# 定义基准
benchmarks = {
"accuracy": 0.88,
"inference_time": 0.08,
"throughput": 180
}
# 计算相对性能
metrics = sample_metrics
relative_perf = {
"accuracy": metrics["accuracy"] / benchmarks["accuracy"],
"inference_time": benchmarks["inference_time"] / metrics["inference_time"],
"throughput": metrics["throughput"] / benchmarks["throughput"]
}
# 计算总体性能比率
overall_ratio = np.mean(list(relative_perf.values()))
print(f"相对性能比率: {overall_ratio:.2f}")
print(f"准确率比率: {relative_perf['accuracy']:.2f}")
print(f"延迟比率: {relative_perf['inference_time']:.2f}")
print(f"吞吐量比率: {relative_perf['throughput']:.2f}")
# 验证比率计算
assert 0 < overall_ratio < 2
assert relative_perf["accuracy"] == pytest.approx(0.85 / 0.88)
12.2 鲁棒性评分标准
import pytest
import numpy as np
class TestRobustnessScoring:
"""测试鲁棒性评分标准"""
@pytest.fixture
def sample_robustness_metrics(self):
return {
"clean_accuracy": 0.85,
"noisy_accuracy": 0.78,
"adversarial_accuracy": 0.65,
"missing_data_accuracy": 0.72,
"outlier_accuracy": 0.75
}
def test_noise_robustness_score(self, sample_robustness_metrics):
"""测试噪声鲁棒性评分"""
clean_acc = sample_robustness_metrics["clean_accuracy"]
noisy_acc = sample_robustness_metrics["noisy_accuracy"]
# 计算性能下降比例
drop = (clean_acc - noisy_acc) / clean_acc
# 定义评分标准
if drop <= 0.05:
score = 5
elif drop <= 0.1:
score = 4
elif drop <= 0.2:
score = 3
elif drop <= 0.3:
score = 2
else:
score = 1
print(f"噪声鲁棒性下降: {drop:.2%}, 得分: {score}")
# 验证评分
expected_drop = (0.85 - 0.72) / 0.85
assert drop == pytest.approx(expected_drop)
assert score == 3
def test_adversarial_robustness_score(self, sample_robustness_metrics):
"""测试对抗鲁棒性评分"""
clean_acc = sample_robustness_metrics["clean_accuracy"]
adv_acc = sample_robustness_metrics["adversarial_accuracy"]
# 计算性能下降比例
drop = (clean_acc - adv_acc) / clean_acc
# 定义评分标准
if drop <= 0.1:
score = 5
elif drop <= 0.2:
score = 4
elif drop <= 0.3:
score = 3
elif drop <= 0.5:
score = 2
else:
score = 1
print(f"对抗鲁棒性下降: {drop:.2%}, 得分: {score}")
# 验证评分
expected_drop = (0.85 - 0.65) / 0.85
assert drop == pytest.approx(expected_drop)
assert score == 3
def test_missing_data_score(self, sample_robustness_metrics):
"""测试缺失数据鲁棒性评分"""
clean_acc = sample_robustness_metrics["clean_accuracy"]
missing_acc = sample_robustness_metrics["missing_data_accuracy"]
# 计算性能下降比例
drop = (clean_acc - missing_acc) / clean_acc
# 定义评分标准
if drop <= 0.05:
score = 5
elif drop <= 0.1:
score = 4
elif drop <= 0.2:
score = 3
elif drop <= 0.3:
score = 2
else:
score = 1
print(f"缺失数据鲁棒性下降: {drop:.2%}, 得分: {score}")
# 验证评分
expected_drop = (0.85 - 0.72) / 0.85
assert drop == pytest.approx(expected_drop)
assert score == 3
def test_composite_robustness_score(self, sample_robustness_metrics):
"""测试综合鲁棒性评分"""
# 计算各维度得分
noise_score = 3 # 来自test_noise_robustness_score
adv_score = 3 # 来自test_adversarial_robustness_score
missing_score = 3 # 来自test_missing_data_score
# 定义权重
weights = {
"noise": 0.3,
"adversarial": 0.4,
"missing_data": 0.3
}
# 计算加权得分
composite_score = (
noise_score * weights["noise"] +
adv_score * weights["adversarial"] +
missing_score * weights["missing_data"]
)
print(f"综合鲁棒性得分: {composite_score:.2f}")
# 验证评分
expected_score = 3 * 0.3 + 3 * 0.4 + 3 * 0.3
assert composite_score == pytest.approx(expected_score)
def test_robustness_benchmarking(self, sample_robustness_metrics):
"""测试鲁棒性基准比较"""
# 定义基准
benchmarks = {
"noise_drop": 0.1,
"adv_drop": 0.15,
"missing_drop": 0.1
}
# 计算实际下降比例
metrics = sample_robustness_metrics
actual_drops = {
"noise": (metrics["clean_accuracy"] - metrics["noisy_accuracy"]) / metrics["clean_accuracy"],
"adv": (metrics["clean_accuracy"] - metrics["adversarial_accuracy"]) / metrics["clean_accuracy"],
"missing": (metrics["clean_accuracy"] - metrics["missing_data_accuracy"]) / metrics["clean_accuracy"]
}
# 计算相对鲁棒性比率
relative_robustness = {
"noise": benchmarks["noise_drop"] / actual_drops["noise"],
"adv": benchmarks["adv_drop"] / actual_drops["adv"],
"missing": benchmarks["missing_drop"] / actual_drops["missing"]
}
# 计算总体鲁棒性比率
overall_ratio = np.mean(list(relative_robustness.values()))
print(f"相对鲁棒性比率: {overall_ratio:.2f}")
print(f"噪声鲁棒性比率: {relative_robustness['noise']:.2f}")
print(f"对抗鲁棒性比率: {relative_robustness['adv']:.2f}")
print(f"缺失数据鲁棒性比率: {relative_robustness['missing']:.2f}")
# 验证比率计算
assert 0 < overall_ratio < 2
assert relative_robustness["noise"] == pytest.approx(0.1 / ((0.85-0.78)/0.85))
12.3 公平性评分标准
import pytest
import numpy as np
class TestFairnessScoring:
"""测试公平性评分标准"""
@pytest.fixture
def sample_fairness_metrics(self):
return {
"demographic_parity_diff": 0.15,
"equalized_odds_diff": 0.18,
"disparate_impact_ratio": 0.85,
"accuracy_gap": 0.12
}
def test_demographic_parity_score(self, sample_fairness_metrics):
"""测试人口统计均等性评分"""
dp_diff = sample_fairness_metrics["demographic_parity_diff"]
# 定义评分标准
if abs(dp_diff) <= 0.05:
score = 5
elif abs(dp_diff) <= 0.1:
score = 4
elif abs(dp_diff) <= 0.15:
score = 3
elif abs(dp_diff) <= 0.2:
score = 2
else:
score = 1
print(f"人口统计均等性差异: {dp_diff:.2f}, 得分: {score}")
# 验证评分
assert score == 3
def test_equalized_odds_score(self, sample_fairness_metrics):
"""测试均等几率评分"""
eo_diff = sample_fairness_metrics["equalized_odds_diff"]
# 定义评分标准
if abs(eo_diff) <= 0.05:
score = 5
elif abs(eo_diff) <= 0.1:
score = 4
elif abs(eo_diff) <= 0.15:
score = 3
elif abs(eo_diff) <= 0.2:
score = 2
else:
score = 1
print(f"均等几率差异: {eo_diff:.2f}, 得分: {score}")
# 验证评分
assert score == 3
def test_disparate_impact_score(self, sample_fairness_metrics):
"""测试不同影响评分"""
di_ratio = sample_fairness_metrics["disparate_impact_ratio"]
# 定义评分标准(接近1.0最好)
if 0.9 <= di_ratio <= 1.1:
score = 5
elif 0.8 <= di_ratio <= 1.2:
score = 4
elif 0.7 <= di_ratio <= 1.3:
score = 3
elif 0.6 <= di_ratio <= 1.4:
score = 2
else:
score = 1
print(f"不同影响比率: {di_ratio:.2f}, 得分: {score}")
# 验证评分
assert score == 4
def test_composite_fairness_score(self, sample_fairness_metrics):
"""测试综合公平性评分"""
# 计算各维度得分
dp_score = 3 # 来自test_demographic_parity_score
eo_score = 3 # 来自test_equalized_odds_score
di_score = 4 # 来自test_disparate_impact_score
# 定义权重
weights = {
"demographic_parity": 0.4,
"equalized_odds": 0.4,
"disparate_impact": 0.2
}
# 计算加权得分
composite_score = (
dp_score * weights["demographic_parity"] +
eo_score * weights["equalized_odds"] +
di_score * weights["disparate_impact"]
)
print(f"综合公平性得分: {composite_score:.2f}")
# 验证评分
expected_score = 3 * 0.4 + 3 * 0.4 + 4 * 0.2
assert composite_score == pytest.approx(expected_score)
def test_fairness_benchmarking(self, sample_fairness_metrics):
"""测试公平性基准比较"""
# 定义基准
benchmarks = {
"dp_diff": 0.1,
"eo_diff": 0.12,
"di_ratio": 0.9
}
# 计算实际指标
metrics = sample_fairness_metrics
actual_metrics = {
"dp_diff": metrics["demographic_parity_diff"],
"eo_diff": metrics["equalized_odds_diff"],
"di_ratio": metrics["disparate_impact_ratio"]
}
# 计算相对公平性比率
relative_fairness = {
"dp": benchmarks["dp_diff"] / actual_metrics["dp_diff"],
"eo": benchmarks["eo_diff"] / actual_metrics["eo_diff"],
"di": actual_metrics["di_ratio"] / benchmarks["di_ratio"]
}
# 计算总体公平性比率
overall_ratio = np.mean(list(relative_fairness.values()))
print(f"相对公平性比率: {overall_ratio:.2f}")
print(f"人口统计均等性比率: {relative_fairness['dp']:.2f}")
print(f"均等几率比率: {relative_fairness['eo']:.2f}")
print(f"不同影响比率: {relative_fairness['di']:.2f}")
# 验证比率计算
assert 0 < overall_ratio < 2
assert relative_fairness["dp"] == pytest.approx(0.1 / 0.15)
12.4 综合评分体系
import pytest
import numpy as np
class TestOverallScoring:
"""测试综合评分体系"""
@pytest.fixture
def sample_scores(self):
return {
"performance": {
"accuracy": 4,
"latency": 4,
"throughput": 3,
"composite": 3.8
},
"robustness": {
"noise": 3,
"adversarial": 3,
"missing_data": 3,
"composite": 3.0
},
"fairness": {
"demographic_parity": 3,
"equalized_odds": 3,
"disparate_impact": 4,
"composite": 3.2
}
}
def test_overall_quality_score(self, sample_scores):
"""测试总体质量评分"""
# 获取各维度综合分
perf_score = sample_scores["performance"]["composite"]
robust_score = sample_scores["robustness"]["composite"]
fair_score = sample_scores["fairness"]["composite"]
# 定义权重
weights = {
"performance": 0.5,
"robustness": 0.3,
"fairness": 0.2
}
# 计算总体质量分
overall_score = (
perf_score * weights["performance"] +
robust_score * weights["robustness"] +
fair_score * weights["fairness"]
)
print(f"总体质量得分: {overall_score:.2f}")
# 验证评分计算
expected_score = 3.8 * 0.5 + 3.0 * 0.3 + 3.2 * 0.2
assert overall_score == pytest.approx(expected_score)
def test_rating_classification(self, sample_scores):
"""测试评级分类"""
# 计算总体质量分(来自test_overall_quality_score)
overall_score = 3.8 * 0.5 + 3.0 * 0.3 + 3.2 * 0.2
# 定义评级标准
if overall_score >= 4.5:
rating = "A+"
elif overall_score >= 4.0:
rating = "A"
elif overall_score >= 3.5:
rating = "B+"
elif overall_score >= 3.0:
rating = "B"
elif overall_score >= 2.5:
rating = "C+"
elif overall_score >= 2.0:
rating = "C"
else:
rating = "D"
print(f"总体得分: {overall_score:.2f}, 评级: {rating}")
# 验证评级
assert rating == "B"
def test_score_visualization(self, sample_scores):
"""测试评分可视化"""
import matplotlib.pyplot as plt
# 准备数据
categories = ["性能", "鲁棒性", "公平性"]
scores = [
sample_scores["performance"]["composite"],
sample_scores["robustness"]["composite"],
sample_scores["fairness"]["composite"]
]
# 创建雷达图
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, polar=True)
# 计算角度
angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False)
angles = np.concatenate((angles, [angles[0]]))
# 绘制雷达图
scores = np.concatenate((scores, [scores[0]]))
ax.plot(angles, scores, 'o-', linewidth=2)
ax.fill(angles, scores, alpha=0.25)
# 设置标签
ax.set_thetagrids(angles[:-1] * 180 / np.pi, categories)
ax.set_ylim(0, 5)
ax.set_title("模型质量评估雷达图", size=20)
# 验证图形生成
assert len(ax.lines) > 0
assert len(ax.patches) > 0  # ax.fill生成的是Polygon补丁,存于patches而非collections
plt.close(fig)
def test_score_card(self, sample_scores):
"""测试评分卡生成"""
# 创建评分卡文本
score_card = f"""
=== 模型质量评分卡 ===
性能:
- 准确率: {sample_scores['performance']['accuracy']}/5
- 延迟: {sample_scores['performance']['latency']}/5
- 吞吐量: {sample_scores['performance']['throughput']}/5
- 综合: {sample_scores['performance']['composite']:.1f}/5
鲁棒性:
- 噪声鲁棒性: {sample_scores['robustness']['noise']}/5
- 对抗鲁棒性: {sample_scores['robustness']['adversarial']}/5
- 缺失数据鲁棒性: {sample_scores['robustness']['missing_data']}/5
- 综合: {sample_scores['robustness']['composite']:.1f}/5
公平性:
- 人口统计均等性: {sample_scores['fairness']['demographic_parity']}/5
- 均等几率: {sample_scores['fairness']['equalized_odds']}/5
- 不同影响: {sample_scores['fairness']['disparate_impact']}/5
- 综合: {sample_scores['fairness']['composite']:.1f}/5
"""
# 验证评分卡内容
assert "模型质量评分卡" in score_card
assert "性能:" in score_card
assert "鲁棒性:" in score_card
assert "公平性:" in score_card
assert str(sample_scores['performance']['accuracy']) in score_card
13. 测试报告与可视化
13.1 自动化报告生成
import pytest
import json
from datetime import datetime
from jinja2 import Template
import pandas as pd
class TestReportGeneration:
"""测试自动化报告生成"""
@pytest.fixture
def sample_test_results(self):
return {
"model_info": {
"name": "CustomerChurnPredictor",
"version": "1.2.0",
"type": "RandomForestClassifier",
"training_date": "2023-05-15"
},
"test_summary": {
"total_tests": 25,
"passed": 22,
"failed": 3,
"success_rate": 0.88
},
"performance_metrics": {
"accuracy": 0.872,
"precision": 0.854,
"recall": 0.891,
"f1": 0.872,
"inference_time_ms": 45.2
},
"robustness_metrics": {
"noise_accuracy": 0.832,
"adversarial_accuracy": 0.712,
"missing_data_accuracy": 0.815
},
"fairness_metrics": {
"demographic_parity_diff": 0.142,
"equalized_odds_diff": 0.168,
"disparate_impact_ratio": 0.87
},
"failed_tests": [
{
"name": "adversarial_robustness",
"metric": "accuracy",
"value": 0.712,
"threshold": 0.75
},
{
"name": "equalized_odds",
"metric": "difference",
"value": 0.168,
"threshold": 0.15
},
{
"name": "throughput_stress_test",
"metric": "requests_per_second",
"value": 850,
"threshold": 1000
}
],
"execution_details": {
"start_time": "2023-06-01T09:00:00",
"end_time": "2023-06-01T10:30:00",
"environment": "Linux, 8CPU, 32GB RAM, Tesla V100"
}
}
def test_html_report(self, sample_test_results, tmp_path):
"""测试HTML报告生成"""
# 创建HTML模板
template_str = """
<!DOCTYPE html>
<html>
<head>
<title>AI模型测试报告 - {{ model_info.name }} v{{ model_info.version }}</title>
<style>
body { font-family: Arial, sans-serif; margin: 20px; }
h1 { color: #2c3e50; }
h2 { color: #3498db; border-bottom: 1px solid #eee; padding-bottom: 5px; }
.metric-card {
background: #f9f9f9; border: 1px solid #ddd;
border-radius: 5px; padding: 15px; margin: 10px 0;
}
.passed { color: #27ae60; }
.failed { color: #e74c3c; }
table { width: 100%; border-collapse: collapse; }
th, td { padding: 8px; text-align: left; border-bottom: 1px solid #ddd; }
tr:hover { background-color: #f5f5f5; }
</style>
</head>
<body>
<h1>AI模型测试报告</h1>
<p><strong>模型名称:</strong> {{ model_info.name }} v{{ model_info.version }}</p>
<p><strong>模型类型:</strong> {{ model_info.type }}</p>
<p><strong>训练日期:</strong> {{ model_info.training_date }}</p>
<h2>测试概览</h2>
<div class="metric-card">
<p><strong>测试总数:</strong> {{ test_summary.total_tests }}</p>
<p><strong>通过:</strong> <span class="passed">{{ test_summary.passed }}</span></p>
<p><strong>失败:</strong> <span class="failed">{{ test_summary.failed }}</span></p>
<p><strong>成功率:</strong> {{ "%.2f"|format(test_summary.success_rate * 100) }}%</p>
</div>
<h2>性能指标</h2>
<table>
<tr>
<th>指标</th>
<th>值</th>
</tr>
{% for metric, value in performance_metrics.items() %}
<tr>
<td>{{ metric }}</td>
<td>{{ "%.4f"|format(value) if value is number else value }}</td>
</tr>
{% endfor %}
</table>
<h2>失败测试详情</h2>
{% if failed_tests %}
<table>
<tr>
<th>测试名称</th>
<th>指标</th>
<th>值</th>
<th>阈值</th>
</tr>
{% for test in failed_tests %}
<tr>
<td>{{ test.name }}</td>
<td>{{ test.metric }}</td>
<td>{{ "%.4f"|format(test.value) if test.value is number else test.value }}</td>
<td>{{ "%.4f"|format(test.threshold) if test.threshold is number else test.threshold }}</td>
</tr>
{% endfor %}
</table>
{% else %}
<p>所有测试均已通过</p>
{% endif %}
<h2>执行详情</h2>
<p><strong>开始时间:</strong> {{ execution_details.start_time }}</p>
<p><strong>结束时间:</strong> {{ execution_details.end_time }}</p>
<p><strong>测试环境:</strong> {{ execution_details.environment }}</p>
</body>
</html>
"""
# 渲染报告
template = Template(template_str)
html_report = template.render(**sample_test_results)
# 保存报告
report_file = tmp_path / "model_test_report.html"
report_file.write_text(html_report)
# 验证报告
assert report_file.exists()
content = report_file.read_text()
assert sample_test_results["model_info"]["name"] in content
assert str(sample_test_results["test_summary"]["passed"]) in content
assert "失败测试详情" in content
def test_json_report(self, sample_test_results, tmp_path):
"""测试JSON报告生成"""
# 保存JSON报告
report_file = tmp_path / "test_results.json"
with open(report_file, "w") as f:
json.dump(sample_test_results, f, indent=2)
# 验证报告
assert report_file.exists()
loaded_data = json.loads(report_file.read_text())
assert loaded_data["model_info"]["name"] == sample_test_results["model_info"]["name"]
assert loaded_data["test_summary"]["total_tests"] == sample_test_results["test_summary"]["total_tests"]
def test_executive_summary(self, sample_test_results):
"""测试执行摘要生成"""
# 创建执行摘要
summary = f"""
AI模型测试执行摘要
模型: {sample_test_results['model_info']['name']} v{sample_test_results['model_info']['version']}
类型: {sample_test_results['model_info']['type']}
训练日期: {sample_test_results['model_info']['training_date']}
测试结果:
- 总测试数: {sample_test_results['test_summary']['total_tests']}
- 通过: {sample_test_results['test_summary']['passed']}
- 失败: {sample_test_results['test_summary']['failed']}
- 成功率: {sample_test_results['test_summary']['success_rate'] * 100:.1f}%
关键指标:
- 准确率: {sample_test_results['performance_metrics']['accuracy']:.3f}
- 对抗鲁棒性: {sample_test_results['robustness_metrics']['adversarial_accuracy']:.3f}
- 人口统计均等性差异: {sample_test_results['fairness_metrics']['demographic_parity_diff']:.3f}
关键问题:
{len(sample_test_results['failed_tests'])}个测试未通过:
"""
for test in sample_test_results["failed_tests"]:
summary += f"- {test['name']}: {test['value']} (阈值: {test['threshold']})\n"
summary += f"\n测试时间: {sample_test_results['execution_details']['start_time']} 到 {sample_test_results['execution_details']['end_time']}"
# 验证摘要
assert sample_test_results["model_info"]["name"] in summary
assert str(sample_test_results["test_summary"]["total_tests"]) in summary
assert "关键问题" in summary
assert len(sample_test_results["failed_tests"]) > 0
def test_metric_trend_report(self):
"""测试指标趋势报告"""
# 创建模拟历史数据
history = [
{"version": "1.0.0", "date": "2023-01-01", "accuracy": 0.82, "robustness": 0.75, "fairness": 0.78},
{"version": "1.1.0", "date": "2023-02-01", "accuracy": 0.85, "robustness": 0.78, "fairness": 0.80},
{"version": "1.2.0", "date": "2023-03-01", "accuracy": 0.87, "robustness": 0.80, "fairness": 0.82},
{"version": "1.3.0", "date": "2023-04-01", "accuracy": 0.88, "robustness": 0.82, "fairness": 0.83}
]
# 转换为DataFrame
df = pd.DataFrame(history)
# 生成趋势报告
trend_report = f"""
模型指标趋势报告
版本变化:
{df[['version', 'date']].to_string(index=False)}
准确率趋势:
初始: {df['accuracy'].iloc[0]:.3f}
最新: {df['accuracy'].iloc[-1]:.3f}
变化: +{(df['accuracy'].iloc[-1] - df['accuracy'].iloc[0]):.3f}
鲁棒性趋势:
初始: {df['robustness'].iloc[0]:.3f}
最新: {df['robustness'].iloc[-1]:.3f}
变化: +{(df['robustness'].iloc[-1] - df['robustness'].iloc[0]):.3f}
"""
# 验证趋势报告
assert "模型指标趋势报告" in trend_report
assert "1.0.0" in trend_report
assert "1.3.0" in trend_report
assert "+0.06" in trend_report # 准确率变化
13.2 可视化仪表板
import pytest
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from unittest.mock import MagicMock
class TestVisualizationDashboard:
"""测试可视化仪表板"""
@pytest.fixture
def sample_test_data(self):
return {
"performance": {
"accuracy": 0.872,
"precision": 0.854,
"recall": 0.891,
"f1": 0.872,
"inference_time": 45.2
},
"fairness": {
"demographic_parity": 0.142,
"equalized_odds": 0.168,
"disparate_impact": 0.87
},
"robustness": {
"noise": 0.832,
"adversarial": 0.712,
"missing_data": 0.815
}
}
def test_metric_radar_chart(self, sample_test_data):
"""测试指标雷达图"""
# 准备数据
categories = [
'Accuracy', 'Precision', 'Recall', 'F1',
'Demographic Parity', 'Equalized Odds',
'Noise Robustness', 'Adversarial Robustness'
]
values = [
sample_test_data["performance"]["accuracy"],
sample_test_data["performance"]["precision"],
sample_test_data["performance"]["recall"],
sample_test_data["performance"]["f1"],
1 - sample_test_data["fairness"]["demographic_parity"],
1 - sample_test_data["fairness"]["equalized_odds"],
sample_test_data["robustness"]["noise"],
sample_test_data["robustness"]["adversarial"]
]
# 创建雷达图
fig = go.Figure()
fig.add_trace(go.Scatterpolar(
r=values,
theta=categories,
fill='toself',
name='Model Metrics'
))
fig.update_layout(
polar=dict(
radialaxis=dict(
visible=True,
range=[0, 1]
)),
showlegend=True,
title="Model Metrics Radar Chart"
)
# 验证图形
assert len(fig.data) == 1
assert fig.layout.title.text == "Model Metrics Radar Chart"
assert len(fig.data[0].r) == len(categories)
def test_performance_bar_chart(self, sample_test_data):
"""测试性能条形图"""
# 准备数据
metrics = sample_test_data["performance"]
df = pd.DataFrame({
"Metric": list(metrics.keys()),
"Value": list(metrics.values())
})
# 创建条形图
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(df["Metric"], df["Value"])
ax.set_title("Model Performance Metrics")
ax.set_ylim(0, 1)
# 添加数值标签
for bar in bars:
height = bar.get_height()
ax.text(bar.get_x() + bar.get_width()/2., height,
f"{height:.3f}",
ha='center', va='bottom')
# 验证图形
assert len(bars) == len(metrics)
assert ax.get_title() == "Model Performance Metrics"
plt.close(fig)
def test_fairness_comparison_chart(self, sample_test_data):
"""测试公平性比较图"""
# 准备数据
fairness = sample_test_data["fairness"]
thresholds = {
"demographic_parity": 0.15,
"equalized_odds": 0.15,
"disparate_impact": 0.8
}
# 创建比较图
fig = go.Figure()
fig.add_trace(go.Bar(
x=list(fairness.keys()),
y=list(fairness.values()),
name='Actual Value',
marker_color='rgb(55, 83, 109)'
))
fig.add_trace(go.Bar(
x=list(thresholds.keys()),
y=list(thresholds.values()),
name='Threshold',
marker_color='rgb(26, 118, 255)'
))
fig.update_layout(
title='Fairness Metrics Comparison',
barmode='group'
)
# 验证图形
assert len(fig.data) == 2
assert fig.layout.title.text == "Fairness Metrics Comparison"
def test_interactive_dashboard(self, mocker):
"""测试交互式仪表板"""
# 模拟Dash组件(dash>=2已将dcc/html并入dash包,patch旧的dash_core_components路径不会生效)
mock_dash = mocker.patch("dash.Dash")
mock_dash.return_value = MagicMock()
mock_dcc = mocker.patch("dash.dcc.Graph")
mock_html = mocker.patch("dash.html.Div")
# 导入Dash应用
from dash import Dash, dcc, html
# 创建模拟应用
app = Dash(__name__)
app.layout = html.Div([
html.H1("AI Model Testing Dashboard"),
dcc.Graph(id='performance-metrics'),
dcc.Graph(id='fairness-metrics'),
dcc.Graph(id='robustness-metrics')
])
# 验证Dash组件调用
mock_dash.assert_called_once()
assert mock_html.call_count >= 1
assert mock_dcc.call_count >= 3
def test_metric_over_time_plot(self):
"""测试指标随时间变化图"""
# 创建模拟历史数据
history = pd.DataFrame({
"date": pd.date_range(start="2023-01-01", periods=6, freq="M"),
"accuracy": [0.82, 0.83, 0.85, 0.86, 0.87, 0.88],
"robustness": [0.72, 0.75, 0.77, 0.79, 0.80, 0.82],
"fairness": [0.75, 0.77, 0.79, 0.80, 0.81, 0.83]
})
# 创建趋势图
fig = go.Figure()
fig.add_trace(go.Scatter(
x=history["date"],
y=history["accuracy"],
mode="lines+markers",
name="Accuracy"
))
fig.add_trace(go.Scatter(
x=history["date"],
y=history["robustness"],
mode="lines+markers",
name="Robustness"
))
fig.add_trace(go.Scatter(
x=history["date"],
y=history["fairness"],
mode="lines+markers",
name="Fairness"
))
fig.update_layout(
title="Model Metrics Over Time",
xaxis_title="Date",
yaxis_title="Score",
hovermode="x unified"
)
# 验证图形
assert len(fig.data) == 3
assert fig.layout.title.text == "Model Metrics Over Time"
13.3 问题诊断报告
import pytest
import pandas as pd
from unittest.mock import MagicMock
class TestDiagnosticReports:
"""测试问题诊断报告"""
@pytest.fixture
def sample_failures(self):
return [
{
"test_name": "adversarial_robustness",
"metric": "accuracy",
"value": 0.68,
"threshold": 0.75,
"description": "模型对对抗样本的鲁棒性不足",
"suggestions": [
"增加对抗训练",
"使用更复杂的模型架构",
"添加输入预处理"
]
},
{
"test_name": "equalized_odds",
"metric": "difference",
"value": 0.18,
"threshold": 0.15,
"description": "不同群体的真阳性率和假阳性率差异过大",
"suggestions": [
"应用公平性约束重新训练",
"使用后处理校准方法",
"检查训练数据中的偏见"
]
}
]
def test_failure_diagnosis_report(self, sample_failures):
"""测试失败诊断报告"""
report = "AI模型测试问题诊断报告\n\n"
report += "发现以下关键问题:\n\n"
for i, failure in enumerate(sample_failures, 1):
report += f"{i}. {failure['test_name']}测试失败\n"
report += f" - 指标: {failure['metric']} (值: {failure['value']}, 阈值: {failure['threshold']})\n"
report += f" - 问题描述: {failure['description']}\n"
report += f" - 改进建议:\n"
for suggestion in failure["suggestions"]:
report += f" * {suggestion}\n"
report += "\n"
# 验证报告
assert "问题诊断报告" in report
assert "adversarial_robustness" in report
assert "equalized_odds" in report
assert len(sample_failures[0]["suggestions"]) == 3
def test_feature_analysis_report(self):
"""测试特征分析报告"""
# 模拟特征重要性
features = ["age", "income", "balance", "transactions"]
importance = [0.35, 0.25, 0.20, 0.20]
# 创建报告
report = "特征重要性分析报告\n\n"
report += "模型预测中最有影响力的特征:\n\n"
for feat, imp in sorted(zip(features, importance), key=lambda x: -x[1]):
report += f"- {feat}: {imp:.2f}\n"
report += "\n建议:\n"
report += "- 检查高重要性特征的分布和潜在偏见\n"
report += "- 验证特征与目标变量的实际关系\n"
report += "- 考虑对敏感特征进行去相关处理\n"
# 验证报告
assert "特征重要性分析报告" in report
assert "age: 0.35" in report
assert "建议:" in report
def test_bias_analysis_report(self):
"""测试偏见分析报告"""
# 模拟偏见分析结果
bias_report = {
"sensitive_feature": "gender",
"metrics": {
"demographic_parity": 0.15,
"equalized_odds": 0.18,
"disparate_impact": 0.82
},
"affected_groups": {
"female": {
"precision": 0.78,
"recall": 0.82,
"f1": 0.80
},
"male": {
"precision": 0.85,
"recall": 0.88,
"f1": 0.86
}
},
"recommendations": [
"使用公平性感知的重新加权方法",
"应用对抗性去偏见技术",
"收集更多平衡的训练数据"
]
}
# 生成报告
report = f"""
偏见分析报告
敏感特征: {bias_report['sensitive_feature']}
公平性指标:
- 人口统计均等性差异: {bias_report['metrics']['demographic_parity']:.3f}
- 均等几率差异: {bias_report['metrics']['equalized_odds']:.3f}
- 不同影响比率: {bias_report['metrics']['disparate_impact']:.3f}
不同群体性能:
"""
for group, metrics in bias_report["affected_groups"].items():
report += f"\n{group.capitalize()}:\n"
report += f"- 精确率: {metrics['precision']:.3f}\n"
report += f"- 召回率: {metrics['recall']:.3f}\n"
report += f"- F1分数: {metrics['f1']:.3f}\n"
report += "\n改进建议:\n"
for rec in bias_report["recommendations"]:
report += f"- {rec}\n"
# 验证报告
assert "偏见分析报告" in report
assert "敏感特征: gender" in report
assert "精确率: 0.850" in report
assert len(bias_report["recommendations"]) == 3
def test_performance_bottleneck_analysis(self):
"""测试性能瓶颈分析"""
# 模拟性能分析数据
perf_data = {
"inference_time_breakdown": {
"data_loading": 0.05,
"preprocessing": 0.12,
"model_inference": 0.25,
"postprocessing": 0.03
},
"resource_utilization": {
"cpu": 85,
"memory": 65,
"gpu": 45
},
"bottlenecks": [
"模型推理时间占总时间的55%",
"CPU利用率接近饱和"
],
"recommendations": [
"优化模型架构减少计算量",
"考虑模型量化或剪枝",
"增加CPU资源或使用批处理"
]
}
# 生成报告
report = """
性能瓶颈分析报告
推理时间分解:
"""
for stage, time in perf_data["inference_time_breakdown"].items():
report += f"- {stage}: {time:.2f}s\n"
report += "\n资源利用率:\n"
for resource, util in perf_data["resource_utilization"].items():
report += f"- {resource}: {util}%\n"
report += "\n识别到的瓶颈:\n"
for bottleneck in perf_data["bottlenecks"]:
report += f"- {bottleneck}\n"
report += "\n优化建议:\n"
for rec in perf_data["recommendations"]:
report += f"- {rec}\n"
# 验证报告
assert "性能瓶颈分析报告" in report
assert "model_inference" in report
assert "CPU利用率" in report
assert len(perf_data["recommendations"]) == 3
14. 持续测试与监控
14.1 模型漂移检测
import pytest
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from alibi_detect.cd import KSDrift, ChiSquareDrift
class TestDriftDetection:
"""测试模型漂移检测"""
@pytest.fixture
def sample_production_data(self):
# 创建模拟生产数据(随时间变化)
dates = pd.date_range(start="2023-01-01", end="2023-03-01", freq="D")
n_samples = len(dates)
# 模拟特征漂移(逐渐变化)
np.random.seed(42)
base = np.random.normal(0, 1, n_samples)
drift = np.linspace(0, 2, n_samples)
# 创建DataFrame
df = pd.DataFrame({
"date": dates,
"feature1": base + drift,
"feature2": base - 0.5 * drift,
"feature3": np.random.normal(1, 0.5, n_samples)
})
return df
def test_feature_drift_detection(self, sample_production_data):
"""测试特征漂移检测"""
df = sample_production_data
# 定义参考窗口(前30天)
ref_window = df[df["date"] <= "2023-01-30"]
ref_data = ref_window[["feature1", "feature2", "feature3"]].values
# 初始化漂移检测器
cd = KSDrift(
x_ref=ref_data,
p_val=0.05,
preprocess_fn=None
)
# 检测后续窗口中的漂移
window_size = 7 # 每周检测一次
drift_results = []
for start in pd.date_range(start="2023-02-01", end="2023-03-01", freq=f"{window_size}D"):
end = start + timedelta(days=window_size-1)
window_data = df[(df["date"] >= start) & (df["date"] <= end)]
if len(window_data) == 0:
continue
test_data = window_data[["feature1", "feature2", "feature3"]].values
# 检测漂移
preds = cd.predict(
x=test_data,
return_p_val=True,
return_distance=True
)
drift_results.append({
"window_start": start,
"window_end": end,
"is_drift": preds["data"]["is_drift"],
"p_val": preds["data"]["p_val"][0],
"distance": preds["data"]["distance"][0]
})
# 转换为DataFrame
drift_df = pd.DataFrame(drift_results)
# 验证漂移检测结果
print("\n漂移检测结果:")
print(drift_df)
# 至少应检测到一次漂移(因为我们模拟了漂移)
assert any(drift_df["is_drift"])
# 验证p值整体随漂移增大而下降(逐点单调在随机噪声下过于严格)
pvals = drift_df["p_val"].values
assert pvals[-1] < pvals[0]
def test_chi_square_drift_detection(self, sample_production_data):
"""测试卡方特征漂移检测"""
df = sample_production_data
# 定义参考窗口(前30天)
ref_window = df[df["date"] <= "2023-01-30"]
ref_data = ref_window[["feature1", "feature2", "feature3"]].values
# 卡方检验面向分类特征,先按参考数据的四分位数把连续特征离散化
bins = np.quantile(ref_data, [0.25, 0.5, 0.75], axis=0)
def discretize(x):
    return np.stack(
        [np.digitize(x[:, i], bins[:, i]) for i in range(x.shape[1])],
        axis=1
    )
# 初始化卡方漂移检测器
cd = ChiSquareDrift(
x_ref=discretize(ref_data),
p_val=0.05
)
# 测试最后一个窗口(漂移应最大)
test_window = df[df["date"] >= "2023-02-20"]
test_data = test_window[["feature1", "feature2", "feature3"]].values
preds = cd.predict(
x=discretize(test_data),
return_p_val=True,
return_distance=True
)
print(f"\n卡方漂移检测结果:")
print(f"是否漂移: {preds['data']['is_drift']}")
print(f"p值: {preds['data']['p_val'][0]:.4f}")
# 验证检测到漂移
assert preds["data"]["is_drift"]
assert preds["data"]["p_val"][0] < 0.05
def test_concept_drift_detection(self):
"""测试概念漂移检测"""
# 模拟预测性能随时间变化(模拟概念漂移)
dates = pd.date_range(start="2023-01-01", end="2023-03-01", freq="D")
n_samples = len(dates)
# 模拟准确率下降
accuracy = np.concatenate([
np.random.normal(0.85, 0.02, 30), # 第一个月稳定
np.linspace(0.85, 0.75, 31), # 二月份开始下降
np.random.normal(0.75, 0.03, 28) # 三月份保持低位
])
# 创建DataFrame
df = pd.DataFrame({
"date": dates,
"accuracy": accuracy
})
# 检测概念漂移(使用滚动窗口统计)
window_size = 14 # 两周窗口
df["rolling_avg"] = df["accuracy"].rolling(window_size).mean()
df["rolling_std"] = df["accuracy"].rolling(window_size).std()
# 计算变化点(滚动平均比基线下降超过2个标准差;取首个完整窗口作基线,iloc[0]在窗口填满前是NaN)
baseline_avg = df["rolling_avg"].dropna().iloc[0]
baseline_std = df["rolling_std"].dropna().iloc[0]
df["change_point"] = (
df["rolling_avg"] < (baseline_avg - 2 * baseline_std)
)
# 验证概念漂移检测
change_points = df[df["change_point"]]
print(f"\n检测到的概念漂移点数量: {len(change_points)}")
assert not df["change_point"].iloc[:30].any() # 第一个月无变化
assert df["change_point"].iloc[40:].any() # 二月底应有变化
def test_multivariate_drift_detection(self, sample_production_data):
"""测试多变量漂移检测"""
from alibi_detect.cd import MMDDrift
df = sample_production_data
# 定义参考窗口(前30天)
ref_window = df[df["date"] <= "2023-01-30"]
ref_data = ref_window[["feature1", "feature2", "feature3"]].values
# 初始化MMD漂移检测器
cd = MMDDrift(
x_ref=ref_data,
p_val=0.05,
preprocess_fn=None
)
# 测试最后一个窗口(漂移应最大)
test_window = df[df["date"] >= "2023-02-20"]
test_data = test_window[["feature1", "feature2", "feature3"]].values
preds = cd.predict(
x=test_data,
return_p_val=True,
return_distance=True
)
print(f"\nMMD漂移检测结果:")
print(f"是否漂移: {preds['data']['is_drift']}")
print(f"p值: {preds['data']['p_val']:.4f}")
# 验证检测到漂移
assert preds["data"]["is_drift"]
assert preds["data"]["p_val"] < 0.05
14.2 自动化重训练
import pytest
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from unittest.mock import MagicMock, patch
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
class TestAutomatedRetraining:
"""测试自动化重训练"""
@pytest.fixture
def sample_model_and_data(self):
# 创建测试数据
X, y = make_classification(
n_samples=1000,
n_features=10,
n_classes=2,
random_state=42
)
# 训练初始模型
model = RandomForestClassifier(n_estimators=50, random_state=42)
model.fit(X, y)
return model, X, y
def test_performance_based_retraining(self, sample_model_and_data):
"""测试基于性能的重训练触发"""
model, X, y = sample_model_and_data
# 模拟性能监控数据
perf_history = {
"date": pd.date_range(start="2023-01-01", periods=30, freq="D"),
"accuracy": np.concatenate([
np.random.normal(0.85, 0.02, 20),
np.random.normal(0.75, 0.03, 10)
])
}
df = pd.DataFrame(perf_history)
# 定义重训练条件
df["retrain_needed"] = (
df["accuracy"] < 0.8 # 准确率低于80%
)
# 验证重训练触发
retrain_days = df[df["retrain_needed"]]["date"]
print(f"\n需要重训练的日期: {len(retrain_days)}天")
assert not df["retrain_needed"].iloc[:15].any() # 前15天不应触发
assert df["retrain_needed"].iloc[-5:].all() # 最后5天应触发
def test_drift_based_retraining(self):
"""测试基于漂移检测的重训练触发"""
# 模拟漂移检测结果
drift_results = pd.DataFrame({
"date": pd.date_range(start="2023-01-01", periods=30, freq="D"),
"drift_detected": [False] * 20 + [True] * 10,
"drift_confidence": [0.0] * 20 + np.linspace(0.9, 0.95, 10).tolist()
})
# 定义重训练条件
drift_results["retrain_needed"] = (
drift_results["drift_detected"] &
(drift_results["drift_confidence"] > 0.9)
)
# 验证重训练触发
retrain_days = drift_results[drift_results["retrain_needed"]]["date"]
print(f"\n需要重训练的日期(漂移): {len(retrain_days)}天")
assert not drift_results["retrain_needed"].iloc[:15].any()
assert drift_results["retrain_needed"].iloc[-5:].all()
@patch('sklearn.ensemble.RandomForestClassifier.fit')
def test_retraining_pipeline(self, mock_fit, sample_model_and_data):
"""测试重训练流水线"""
model, X, y = sample_model_and_data
# 模拟新数据
X_new, y_new = make_classification(
n_samples=500,
n_features=10,
n_classes=2,
random_state=24
)
# 模拟数据加载器
def data_loader():
return X_new, y_new
# 触发重训练
print("\n触发模型重训练...")
new_model = RandomForestClassifier(n_estimators=50, random_state=42)
new_model.fit(*data_loader())
# 验证重训练调用
mock_fit.assert_called_once()
called_args = mock_fit.call_args[0]
assert np.array_equal(called_args[0], X_new)
assert np.array_equal(called_args[1], y_new)
def test_model_replacement_strategy(self, sample_model_and_data):
"""测试模型替换策略"""
model, X, y = sample_model_and_data
# 模拟新旧模型评估
old_perf = {
"accuracy": 0.78,
"precision": 0.75,
"recall": 0.82
}
new_perf = {
"accuracy": 0.82,
"precision": 0.80,
"recall": 0.85
}
# 定义替换条件
replace_model = (
(new_perf["accuracy"] > old_perf["accuracy"] + 0.02) and
(new_perf["precision"] >= old_perf["precision"]) and
(new_perf["recall"] >= old_perf["recall"])
)
print(f"\n旧模型性能: {old_perf}")
print(f"新模型性能: {new_perf}")
print(f"是否替换模型: {replace_model}")
# 验证替换决策
assert replace_model
def test_retraining_impact_analysis(self, sample_model_and_data):
"""测试重训练影响分析"""
model, X, y = sample_model_and_data
# 模拟重训练前后性能
impact_data = pd.DataFrame({
"retraining": ["before", "after"],
"accuracy": [0.78, 0.82],
"fairness": [0.72, 0.75],
"robustness": [0.68, 0.72]
})
# 计算改进百分比
impact_data["accuracy_improvement"] = (
(impact_data["accuracy"] - impact_data["accuracy"].iloc[0]) /
impact_data["accuracy"].iloc[0]
)
impact_data["fairness_improvement"] = (
(impact_data["fairness"] - impact_data["fairness"].iloc[0]) /
impact_data["fairness"].iloc[0]
)
# 验证改进
print("\n重训练影响分析:")
print(impact_data)
assert impact_data["accuracy_improvement"].iloc[1] > 0
assert impact_data["fairness_improvement"].iloc[1] > 0
14.3 监控告警系统
import pytest
import pandas as pd
from datetime import datetime, timedelta
from unittest.mock import MagicMock, patch
class TestMonitoringAlerts:
"""测试监控告警系统"""
@pytest.fixture
def sample_monitoring_data(self):
# 创建模拟监控数据
dates = pd.date_range(start="2023-01-01", periods=30, freq="D")
return pd.DataFrame({
"date": dates,
"accuracy": [0.85] * 20 + [0.82, 0.81, 0.79, 0.78, 0.76, 0.75, 0.74, 0.73, 0.72, 0.71],
"throughput": [150] * 25 + [130, 125, 120, 115, 110],
"latency_ms": [45] * 22 + [50, 52, 55, 58, 60, 62, 65, 68]
})
@patch('smtplib.SMTP')
def test_performance_alert(self, mock_smtp, sample_monitoring_data):
"""测试性能告警触发"""
df = sample_monitoring_data
# 定义告警条件
df["accuracy_alert"] = df["accuracy"] < 0.8
df["throughput_alert"] = df["throughput"] < 135
df["latency_alert"] = df["latency_ms"] > 60
# 模拟发送邮件
mock_server = MagicMock()
mock_smtp.return_value.__enter__.return_value = mock_server
# 检查是否需要发送告警
last_day = df.iloc[-1]
if last_day["accuracy_alert"] or last_day["throughput_alert"] or last_day["latency_alert"]:
from smtplib import SMTP
with SMTP("smtp.example.com") as server:
message = f"""\
Subject: 模型性能告警 - {last_day['date'].date()}
检测到性能下降:
- 准确率: {last_day['accuracy']:.2f} (阈值: 0.80)
- 吞吐量: {last_day['throughput']} (阈值: 135)
- 延迟: {last_day['latency_ms']}ms (阈值: 60)
"""
server.sendmail(
"alerts@example.com",
"ml-team@example.com",
message
)
# 验证告警发送
assert last_day["accuracy_alert"] is True
assert last_day["throughput_alert"] is True
assert last_day["latency_alert"] is True
mock_smtp.assert_called_once_with("smtp.example.com")
mock_server.sendmail.assert_called_once()
@patch('requests.post')
def test_slack_alert(self, mock_post, sample_monitoring_data):
"""测试Slack告警"""
df = sample_monitoring_data
# 定义严重下降(连续3天下降)
df["accuracy_pct_change"] = df["accuracy"].pct_change()
df["rolling_drop"] = df["accuracy_pct_change"].rolling(3).sum()
severe_drop = df[df["rolling_drop"] < -0.05] # 累计下降超过5%(本组模拟数据的最大3日累计降幅约7%)
# 模拟Slack webhook调用
mock_response = MagicMock()
mock_response.status_code = 200
mock_post.return_value = mock_response
# 发送Slack告警
if not severe_drop.empty:
import requests
message = {
"text": f"模型准确率严重下降检测到! 最近3天累计下降: {severe_drop['rolling_drop'].iloc[-1]:.1%}",
"attachments": [{
"color": "#ff0000",
"fields": [
{"title": "当前准确率", "value": f"{df['accuracy'].iloc[-1]:.2%}", "short": True},
{"title": "下降幅度", "value": f"{severe_drop['rolling_drop'].iloc[-1]:.1%}", "short": True}
]
}]
}
response = requests.post(
"https://hooks.slack.com/services/...",
json=message
)
# 验证Slack调用
assert not severe_drop.empty
mock_post.assert_called_once()
assert "model" in mock_post.call_args[1]["json"]["text"].lower()
def test_alert_throttling(self):
"""测试告警限流"""
# 模拟告警历史
alert_history = pd.DataFrame({
"timestamp": pd.date_range(start="2023-01-01 00:00", periods=10, freq="h"),
"type": ["performance"] * 10,
"sent": [True] * 10
})
# 使用固定的"当前时间",避免断言依赖真实时钟(2023年的告警不可能落在真实当前时间的最近3小时内)
now = alert_history["timestamp"].max() + timedelta(minutes=30)
# 检查是否应该抑制告警(最近3小时内已有3次相同类型告警)
last_alerts = alert_history[
(alert_history["type"] == "performance") &
(alert_history["timestamp"] > (now - timedelta(hours=3)))
]
print(f"\n最近3小时同类告警数量: {len(last_alerts)}")
print(f"是否抑制告警: {should_throttle}")
# 验证限流逻辑
assert should_throttle
@patch('logging.Logger.error')
def test_error_alerting(self, mock_logger):
"""测试错误告警"""
# 模拟错误
try:
# 模拟模型预测错误
raise ValueError("Input data shape mismatch")
except Exception as e:
# 记录错误
import logging
logger = logging.getLogger("model_monitoring")
logger.error(f"模型预测错误: {str(e)}", exc_info=True)
# 验证错误日志
mock_logger.assert_called_once()
assert "Input data shape" in mock_logger.call_args[0][0]
def test_alert_escalation(self):
"""测试告警升级"""
# 模拟未解决的告警
unresolved_alerts = [
{"id": 1, "timestamp": "2023-01-01 09:00", "severity": "high", "acknowledged": False},
{"id": 2, "timestamp": "2023-01-01 10:00", "severity": "critical", "acknowledged": False},
{"id": 3, "timestamp": "2023-01-01 11:00", "severity": "medium", "acknowledged": True}
]
# 检查需要升级的告警(未确认且超过1小时)
now = datetime.strptime("2023-01-01 12:00", "%Y-%m-%d %H:%M")
for alert in unresolved_alerts:
alert_time = datetime.strptime(alert["timestamp"], "%Y-%m-%d %H:%M")
alert["needs_escalation"] = (
not alert["acknowledged"] and
(now - alert_time) > timedelta(hours=1)
)
# 验证升级逻辑
needs_escalation = [a for a in unresolved_alerts if a["needs_escalation"]]
print(f"\n需要升级的告警数量: {len(needs_escalation)}")
assert len(needs_escalation) == 2
assert needs_escalation[0]["id"] == 1
assert needs_escalation[1]["id"] == 2
15. 结论与展望
15.1 测试方案总结
class TestConclusion:
"""测试结论与总结"""
def test_summary_report(self):
"""测试总结报告生成"""
# 创建测试总结
summary = """
AI模型全方位测试方案总结
1. 测试范围覆盖:
- 基础功能测试: 模型训练、推理、保存/加载等基本功能
- 性能测试: 准确率、延迟、吞吐量等关键指标
- 鲁棒性测试: 对抗样本、噪声、缺失数据等场景
- 公平性测试: 不同群体的性能差异分析
- 安全测试: 对抗攻击、数据投毒等安全威胁
2. 关键成果:
- 实现了自动化测试流水线,覆盖模型全生命周期
- 开发了全面的评估指标体系,量化模型质量
- 建立了模型监控系统,实时检测性能下降和数据漂移
3. 主要发现:
- 模型在干净数据上表现良好(准确率85%以上)
- 对抗鲁棒性有待提高(对抗样本准确率下降15-20%)
- 某些人口统计组存在性能差异(最大差距12%)
4. 改进建议:
- 增加对抗训练提升模型鲁棒性
- 应用公平性约束减少群体差异
- 优化模型架构提高推理效率
"""
# 验证总结内容
assert "AI模型全方位测试方案总结" in summary
assert "测试范围覆盖" in summary
assert "关键成果" in summary
assert "主要发现" in summary
assert "改进建议" in summary
assert "对抗训练" in summary
def test_lessons_learned(self):
"""测试经验教训总结"""
lessons = """
经验教训总结:
1. 成功经验:
- 自动化测试框架显著提高了测试效率
- 全面的评估指标帮助识别了潜在问题
- 持续监控系统提前发现了性能退化
2. 挑战与解决方案:
- 挑战: 对抗样本生成计算成本高
解决方案: 使用更高效的攻击方法(FGSM代替C&W)
- 挑战: 公平性测试需要敏感特征
解决方案: 开发合成数据生成器进行测试
- 挑战: 模型解释性难以量化
解决方案: 采用多种解释方法并比较一致性
3. 未来改进方向:
- 扩展测试覆盖更多攻击类型
- 开发更精细的公平性评估方法
- 优化测试性能以减少资源消耗
"""
# 验证内容
assert "成功经验" in lessons
assert "挑战与解决方案" in lessons
assert "未来改进方向" in lessons
assert "公平性测试" in lessons
def test_best_practices(self):
"""测试最佳实践总结"""
practices = """
AI模型测试最佳实践:
1. 测试策略:
- 采用分层测试策略(单元测试、集成测试、系统测试)
- 实施持续测试,集成到CI/CD流水线
- 定期执行全面的回归测试
2. 工具与方法:
- 使用专用库(如ART、Fairlearn、SHAP)简化测试实现
- 结合自动化和手动测试,平衡覆盖率和深度
- 采用可视化工具增强测试结果解释
3. 团队协作:
- 建立跨功能团队(数据科学家、工程师、领域专家)
- 制定明确的测试标准和验收准则
- 定期评审测试结果并优先处理关键问题
4. 文档与知识共享:
- 维护详细的测试文档和案例库
- 记录常见问题和解决方案
- 定期进行知识分享和培训
"""
# 验证内容
assert "最佳实践" in practices
assert "测试策略" in practices
assert "工具与方法" in practices
assert "团队协作" in practices
assert "文档与知识共享" in practices
def test_roadmap(self):
"""测试未来路线图"""
roadmap = """
AI模型测试未来路线图:
短期(0-6个月):
- 扩展对抗测试覆盖更多攻击类型
- 增加模型解释性量化指标
- 优化测试性能,减少执行时间
中期(6-12个月):
- 开发自动化偏见检测与缓解工具
- 集成更多生产环境监控指标
- 建立模型测试基准数据库
长期(1年以上):
- 实现自适应测试策略,动态调整测试重点
- 开发统一的AI模型质量评分标准
- 研究AI测试AI的自动化方法
"""
# 验证内容
assert "未来路线图" in roadmap
assert "短期" in roadmap
assert "中期" in roadmap
assert "长期" in roadmap
assert "对抗测试" in roadmap
assert "自动化偏见检测" in roadmap
15.2 未来改进方向
class TestFutureImprovements:
"""测试未来改进方向"""
def test_technical_improvements(self):
"""测试技术改进方向"""
improvements = """
技术改进方向:
1. 测试覆盖扩展:
- 增加对多模态模型的测试支持(图像+文本)
- 支持更多类型的生成模型(GANs、扩散模型等)
- 开发针对强化学习模型的测试方法
2. 测试深度增强:
- 开发更精细的公平性评估指标
- 改进模型解释性的量化方法
- 增强对抗鲁棒性测试的真实性
3. 测试效率优化:
- 实现智能测试用例选择
- 开发并行化测试执行框架
- 优化资源密集型测试的计算
4. 工具链完善:
- 开发统一的测试管理平台
- 增强测试结果可视化能力
- 改进与MLOps平台的集成
"""
# 验证内容
assert "技术改进方向" in improvements
assert "测试覆盖扩展" in improvements
assert "测试深度增强" in improvements
assert "测试效率优化" in improvements
assert "工具链完善" in improvements
def test_process_improvements(self):
"""测试流程改进方向"""
process_improvements = """
流程改进方向:
1. 测试左移:
- 在模型设计阶段引入测试考量
- 开发模型可测试性评估标准
- 建立模型测试需求规范
2. 持续测试增强:
- 实现更细粒度的监控触发测试
- 开发自动化测试结果分析与分类
- 建立测试反馈闭环改进机制
3. 质量文化建设:
- 推广AI模型质量意识
- 建立跨团队的质量责任机制
- 开发AI测试培训认证体系
4. 标准化工作:
- 参与行业测试标准制定
- 开发领域特定的测试基准
- 建立测试最佳实践库
"""
# 验证内容
assert "流程改进方向" in process_improvements
assert "测试左移" in process_improvements
assert "持续测试增强" in process_improvements
assert "质量文化建设" in process_improvements
assert "标准化工作" in process_improvements
def test_research_directions(self):
"""测试研究方向"""
research = """
研究方向:
1. 新型测试方法:
- 基于因果关系的公平性测试
- 模型内在鲁棒性评估
- 可解释性的形式化验证
2. 自动化测试:
- 自动测试用例生成
- 自适应测试策略优化
- 测试预言自动生成
3. 质量度量:
- 统一的AI模型质量指标
- 质量与经济价值的关联模型
- 风险驱动的测试优先级
4. 跨领域应用:
- 安全关键领域的测试方法
- 小数据场景下的测试策略
- 联邦学习环境下的测试方案
"""
# 验证内容
assert "研究方向" in research
assert "新型测试方法" in research
assert "自动化测试" in research
assert "质量度量" in research
assert "跨领域应用" in research
def test_ecosystem_integration(self):
"""测试生态系统集成方向"""
ecosystem = """
生态系统集成方向:
1. 工具链整合:
- 与主流ML框架深度集成
- 支持云原生测试环境
- 开发IDE插件简化测试开发
2. 标准与合规:
- 支持AI伦理准则合规检查
- 开发行业特定合规测试包
- 实现自动化审计跟踪
3. 社区建设:
- 开源核心测试框架
- 建立测试案例共享平台
- 组织测试挑战赛促进创新
4. 商业应用:
- 开发SaaS化测试服务
- 提供测试认证服务
- 开发风险评分产品
"""
# 验证内容
assert "生态系统集成方向" in ecosystem
assert "工具链整合" in ecosystem
assert "标准与合规" in ecosystem
assert "社区建设" in ecosystem
assert "商业应用" in ecosystem
16. 附录
16.1 术语表
class TestGlossary:
"""测试术语表"""
def test_glossary_content(self):
"""测试术语表内容"""
glossary = """
AI模型测试术语表:
1. 对抗样本(Adversarial Example)
- 经过精心修改的输入,旨在导致模型出错
- 通常对人类观察者来说与正常输入难以区分
2. 数据漂移(Data Drift)
- 模型输入数据分布随时间发生的变化
- 可能导致模型性能下降
3. 概念漂移(Concept Drift)
- 输入与输出之间关系的变化
- 即使数据分布不变,模型也可能需要更新
4. 公平性(Fairness)
- 模型对不同群体的无偏见处理
- 通常通过统计均等性(statistical parity)等指标衡量
5. 可解释性(Interpretability)
- 理解模型预测原因的能力
- 对建立信任和调试很重要
6. 鲁棒性(Robustness)
- 模型在异常或对抗条件下的性能保持能力
- 包括对噪声、缺失值和对抗攻击的抵抗
7. 模型逆向工程(Model Inversion)
- 通过模型查询推断训练数据的攻击方法
- 可能导致隐私泄露
8. 成员推理攻击(Membership Inference)
- 判断特定数据点是否在训练集中的攻击
- 隐私风险评估的一部分
9. SHAP值(SHAP Values)
- 基于博弈论的特征重要性解释方法
- 显示每个特征对预测的贡献
10. 人口统计均等性(Demographic Parity)
- 公平性度量,要求预测结果与敏感特征无关
- 不同群体应获得相似的正类比例
"""
# 验证内容
assert "术语表" in glossary
assert "对抗样本" in glossary
assert "数据漂移" in glossary
assert "概念漂移" in glossary
assert "公平性" in glossary
assert len(glossary.split("\n")) > 20 # 验证术语数量
16.2 参考文献
class TestReferences:
"""测试参考文献"""
def test_references_content(self):
"""测试参考文献内容"""
references = """
参考文献:
1. Goodfellow, I., et al. (2015). "Explaining and Harnessing Adversarial Examples".
arXiv:1412.6572.
2. Ribeiro, M.T., et al. (2016). "'Why Should I Trust You?': Explaining the Predictions
of Any Classifier". ACM SIGKDD.
3. Lundberg, S.M., & Lee, S.I. (2017). "A Unified Approach to Interpreting Model
Predictions". NIPS.
4. Mehrabi, N., et al. (2021). "A Survey on Bias and Fairness in Machine Learning".
ACM Computing Surveys.
5. Xu, H., et al. (2020). "Adversarial Attacks and Defenses in Images, Graphs and
Text: A Review". International Journal of Automation and Computing.
6. Molnar, C. (2022). "Interpretable Machine Learning: A Guide for Making Black Box
Models Explainable".
7. Barocas, S., et al. (2019). "Fairness and Machine Learning: Limitations and
Opportunities".
8. Samek, W., et al. (2021). "Explainable AI: Interpreting, Explaining and
Visualizing Deep Learning". Springer.
9. Dong, Y., et al. (2020). "A Survey on Adversarial Attack and Defense in Deep
Learning". Neurocomputing.
10. Sculley, D., et al. (2015). "Hidden Technical Debt in Machine Learning Systems".
NIPS.
"""
# 验证内容
assert "参考文献" in references
assert "Adversarial Examples" in references
assert "Explainable AI" in references
assert len(references.split("\n")) > 15 # 验证文献数量
16.3 代码索引
class TestCodeIndex:
"""测试代码索引"""
def test_code_index(self):
"""测试代码索引内容"""
code_index = """
代码索引:
1. 测试环境搭建
- 环境检查脚本: check_environment()
- 目录结构设置: create_test_directory_structure()
2. 数据集测试
- 数据质量验证: test_data_quality()
- 数据分割策略: split_data_with_stratification()
3. 模型基础测试
- 训练收敛测试: test_training_convergence()
- 推理输出测试: test_predict_output_shape()
4. 性能测试
- 基准性能测试: test_accuracy(), test_f1_score()
- 推理速度测试: test_single_inference_latency()
5. 鲁棒性测试
- 对抗样本测试: test_fgsm_attack()
- 噪声鲁棒性测试: test_gaussian_noise()
6. 公平性测试
- 群体公平性测试: test_demographic_parity()
- 偏见缓解测试: test_exponentiated_gradient()
7. 可解释性测试
- 特征重要性测试: test_permutation_importance()
- 局部解释测试: test_lime_explanations()
8. 安全测试
- 对抗攻击测试: test_carlini_wagner_attack()
- 模型逆向测试: test_model_extraction()
9. 端到端测试
- API测试: test_api_predictions()
- 部署测试: test_model_versions()
10. 自动化框架
- 测试流水线: test_pipeline_execution()
- 报告生成: test_html_report_generation()
"""
# 验证内容
assert "代码索引" in code_index
assert "测试环境搭建" in code_index
assert "数据集测试" in code_index
assert "模型基础测试" in code_index
assert "性能测试" in code_index
assert "鲁棒性测试" in code_index
assert len(code_index.split("\n")) > 20 # 验证条目数量
16.4 测试数据集说明
class TestDatasetDocumentation:
"""测试数据集说明"""
def test_dataset_documentation(self):
"""测试数据集文档"""
dataset_docs = """
测试数据集说明:
1. 合成分类数据集
- 用途: 模型基础测试、性能测试
- 特征: 20个数值特征(10个信息性,5个冗余,5个噪声)
- 样本: 10,000个(训练70%,验证15%,测试15%)
- 类别: 2个平衡类别
2. 带有敏感特征的数据集
- 用途: 公平性测试
- 敏感特征: 性别、年龄、种族
- 样本: 5,000个
- 包含已知偏见模式
3. 对抗测试数据集
- 用途: 鲁棒性测试
- 包含: 干净样本、对抗样本、噪声样本
- 攻击类型: FGSM、PGD、C&W
- 样本: 每个类别1,000个
4. 缺失数据测试集
- 用途: 数据质量测试
- 缺失模式: 随机缺失、结构化缺失
- 缺失比例: 10%-30%
- 样本: 2,000个
5. 时间序列数据集
- 用途: 概念漂移测试
- 特征: 季节性、趋势、漂移
- 时间范围: 2年每日数据
- 包含已知概念漂移点
数据集获取:
- 合成数据: 使用sklearn.make_classification生成
- 真实数据: 从公开数据集(如UCI、Kaggle)匿名化处理
- 对抗样本: 使用ART库生成
数据预处理:
- 归一化: 数值特征缩放到[0,1]区间(min-max)
- 编码: 类别特征使用one-hot编码
- 分割: 按时间分割测试集(防止数据泄露)
"""
# 验证内容
assert "测试数据集说明" in dataset_docs
assert "合成分类数据集" in dataset_docs
assert "对抗测试数据集" in dataset_docs
assert "缺失数据测试集" in dataset_docs
assert "数据预处理" in dataset_docs
assert len(dataset_docs.split("\n")) > 20 # 验证详细程度
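按上述"合成分类数据集"的规格,可以用sklearn直接生成数据并完成70/15/15分割与[0,1]缩放。下面是一个最小示意(随机种子等参数为本示例的假设):
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# 20个特征: 10个信息性 + 5个冗余 + 5个噪声;2个平衡类别
X, y = make_classification(
    n_samples=10_000, n_features=20,
    n_informative=10, n_redundant=5,
    n_classes=2, weights=[0.5, 0.5],
    random_state=42
)
# 先切出15%测试集,再从剩余85%中切出15%验证集(0.15/0.85≈17.6%)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.15 / 0.85, stratify=y_train, random_state=42)
# 缩放到[0,1]: 仅在训练集上拟合scaler,防止数据泄露
scaler = MinMaxScaler().fit(X_train)
X_train, X_val, X_test = map(scaler.transform, (X_train, X_val, X_test))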