# %% [markdown]
# ## Optical Fiber Anomaly Detection System (Jupyter-Optimized Version)
# %% [markdown]
# ### 1. Environment Setup
# %%
# Install required libraries (if not already installed)
# !pip install pandas numpy scikit-learn matplotlib
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
import time
import matplotlib.pyplot as plt
# %% [markdown]
# ### 2. Data Loading Module
# %%
def load_otdr_data(path):
    """Load OTDR data; automatically generate synthetic sample data if the file is missing."""
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        print(f"File {path} not found, generating sample data...")
        np.random.seed(42)
        data = {
            'SNR': np.random.uniform(20, 40, 1000),
            'Position': np.random.randint(0, 100000, 1000),
        }
        for i in range(1, 31):
            data[f'P{i}'] = np.random.normal(i * 0.1, 1, 1000)
        df = pd.DataFrame(data)
        # Inject an anomalous pattern into the last 25 waveform features of every 50th sample
        df.iloc[::50, -25:] += np.random.normal(5, 1, (20, 25))
    # Waveform feature columns used by the model
    waveform_features = [f'P{i}' for i in range(1, 31)]
    return df[['SNR', 'Position'] + waveform_features], waveform_features
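# %% [markdown]
# Optional sanity check of the loader. The path below is a placeholder; since it does not
# exist, the synthetic fallback data defined above is generated and returned.
# %%
sample_df, sample_features = load_otdr_data('OTDR_sample.csv')  # placeholder path
print(sample_df.shape)
sample_df[['SNR', 'Position'] + sample_features[:3]].head()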
# %% [markdown]
# ### 3. Enhanced Bayesian Partitioning Model
# %%
class JupyterBayesianPartitioner:
    def __init__(self, m_partitions=8, alpha=1e-4, feature_weights=None):
        self.m = m_partitions
        self.alpha = alpha
        self.feature_weights = feature_weights or {f'P{i}': 1.0 for i in range(1, 31)}
        self.partition_probs = defaultdict(float)
        self.feature_probs = defaultdict(dict)
        self.scaler = StandardScaler()

    def _create_partitions(self, data):
        """Bin a sample into m percentile-based partitions (with a one-time visualization)."""
        if data.ndim == 1:
            data = data.reshape(1, -1)
        bins = np.percentile(data, np.linspace(0, 100, self.m + 1))
        counts, _ = np.histogram(data, bins=bins)
        # Show the partition visualization only once
        if not hasattr(self, '_shown_viz'):
            plt.figure(figsize=(10, 4))
            plt.title(f"Feature Partition Visualization (m={self.m})")
            for i in range(data.shape[1]):
                plt.hist(data[:, i], bins=bins, alpha=0.3)
            plt.show()
            self._shown_viz = True
        return counts

    def _apply_weights(self, sample):
        return [sample[i] * self.feature_weights.get(f'P{i+1}', 1.0)
                for i in range(len(sample))]

    def fit(self, X):
        """Training loop with progress display."""
        from IPython.display import clear_output
        # Apply feature weights
        X_weighted = np.array([self._apply_weights(x) for x in X])
        # Standardize the data
        self.scaler.fit(X_weighted)
        scaled = self.scaler.transform(X_weighted)
        # Count partition occurrences
        partition_counts = defaultdict(int)
        for i, sample in enumerate(scaled):
            partition = tuple(self._create_partitions(sample))
            partition_counts[partition] += 1
            # Show progress
            if i % 100 == 0:
                clear_output(wait=True)
                print(f"Training: {i+1}/{len(X)} samples processed")
        # Compute smoothed prior probabilities for each partition
        total = len(X)
        for k, count in partition_counts.items():
            self.partition_probs[k] = (count + self.alpha) / (total + self.alpha * len(partition_counts))
        # Per-partition feature statistics and their smoothed histograms
        for idx, (partition, p_k) in enumerate(self.partition_probs.items()):
            mask = np.array([np.array_equal(self._create_partitions(x), partition)
                             for x in scaled])
            subset = scaled[mask]
            stats = {
                'mean': subset.mean(axis=1),
                'std': subset.std(axis=1),
                'range': np.ptp(subset, axis=1),
                'wavelet': np.sum(np.abs(np.diff(subset))**2, axis=1)
            }
            self.feature_probs[partition] = {
                feat: (np.histogram(vals, bins=20, density=True)[0] + self.alpha)
                for feat, vals in stats.items()
            }
            clear_output(wait=True)
            print(f"Processing partitions: {idx+1}/{len(self.partition_probs)}")
# %% [markdown]
# ### 4. Interactive Detection Pipeline
# %%
from IPython.display import display, clear_output  # needed for live progress output below
def jupyter_detect_anomalies(data_path, config):
    """Interactive detection pipeline."""
    # Initialization
    start_time = time.time()
    plt.figure(figsize=(10, 3))
    # Dynamic configuration: up-weight the tail waveform features
    feature_weights = {f'P{i}': 1.0 for i in range(1, 31)}
    for i in range(25, 31):
        feature_weights[f'P{i}'] = config['tail_weight']
    # Load data
    raw_data, features = load_otdr_data(data_path)
    print(f"Data loaded, sample count: {len(raw_data)}")
    # Train the model
    model = JupyterBayesianPartitioner(
        m_partitions=config['m_partitions'],
        feature_weights=feature_weights
    )
    model.fit(raw_data[features].values)
    # Anomaly scoring
    scores = []
    for i, row in raw_data.iterrows():
        score = 1 - model.partition_probs.get(
            tuple(model._create_partitions(model.scaler.transform(
                [model._apply_weights(row[features].values)]
            )[0])), 0)
        scores.append(score)
        if i % 100 == 0:
            plt.clf()
            plt.plot(scores, alpha=0.7)
            plt.title('Real-time Anomaly Scores')
            plt.xlabel('Sample Index')
            plt.ylabel('Score')
            clear_output(wait=True)  # clear previous output before redrawing
            display(plt.gcf())
    raw_data['Anomaly_Score'] = scores
    # Rescale scores relative to the 75th percentile and clip to [0, 1]
    q75 = np.percentile(scores, 75)
    raw_data['Anomaly_Score'] = np.clip((raw_data.Anomaly_Score - q75) / (1 - q75), 0, 1)
    raw_data['Is_Anomaly'] = raw_data.Anomaly_Score > config['threshold']
    # Result summary
    print(f"\nDetection finished! Elapsed: {time.time()-start_time:.2f}s")
    print(f"Anomaly threshold: {config['threshold']} | Tail feature weight: {config['tail_weight']}x")
    print(f"Anomalous sample ratio: {raw_data.Is_Anomaly.mean():.2%}")
    # Visualization
    fig, ax = plt.subplots(2, 1, figsize=(12, 6))
    ax[0].hist(raw_data.Anomaly_Score, bins=20)
    ax[0].set_title('Anomaly Score Distribution')
    ax[1].scatter(raw_data.Position, raw_data.SNR, c=raw_data.Anomaly_Score, cmap='Reds')
    ax[1].set_title('Geographical Distribution')
    plt.tight_layout()
    plt.show()
    return raw_data
# %% [markdown]
# ### 5. Run the Detection (with interactive controls)
# %%
# Configuration parameters (adjustable via the Jupyter widgets sketch below)
config = {
    'm_partitions': 8,
    'threshold': 0.65,
    'tail_weight': 1.5,
    'data_path': 'D:/SF/YBSJ/OTDR_data1.csv'  # replace with the real data path
}
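# %% [markdown]
# The comment above mentions adjusting the parameters via Jupyter widgets. This is a minimal,
# optional sketch using `ipywidgets` (a separate install, not part of the imports above);
# the sliders only update the shared `config` dict, and the detection is still run in the cell below.
# %%
try:
    from ipywidgets import interact, FloatSlider, IntSlider

    @interact(m_partitions=IntSlider(value=config['m_partitions'], min=2, max=16),
              threshold=FloatSlider(value=config['threshold'], min=0.0, max=1.0, step=0.05),
              tail_weight=FloatSlider(value=config['tail_weight'], min=1.0, max=3.0, step=0.1))
    def update_config(m_partitions, threshold, tail_weight):
        # Write the slider values back into the shared config dict
        config.update(m_partitions=m_partitions, threshold=threshold, tail_weight=tail_weight)
        print(config)
except ImportError:
    print("ipywidgets not installed; edit the config dict above instead.")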
# Run the detection
results = jupyter_detect_anomalies(config['data_path'], config)
# Show the results
display(results[['SNR', 'Position', 'Anomaly_Score']].head(10))
print("\nAnomalous position statistics:")
print(results[results.Is_Anomaly].Position.describe())
# %% [markdown]
# ### 6. Text and CSV Output
# For the final deliverable, no figures need to be generated; text output and a CSV file are enough.
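# %% [markdown]
# A minimal sketch of that export, assuming the `results` DataFrame produced by the run
# above; the output filename `otdr_anomaly_results.csv` is a placeholder.
# %%
results.to_csv('otdr_anomaly_results.csv', index=False, encoding='utf-8-sig')
print(f"Saved {len(results)} rows, {int(results.Is_Anomaly.sum())} flagged as anomalies.")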