用西格玛准则剔除异常数据
我对两组数据的差值使用的是 2 西格玛(2σ)准则。
import pandas as pd
import numpy as np
def two_sigma_rule(data, n_sigma=2, *, axis=None):
    """Return the (lower, upper) bounds of the n-sigma band around the mean.

    The bounds are ``mean - n_sigma * std`` and ``mean + n_sigma * std``,
    where ``mean`` and ``std`` come from ``np.mean`` and ``np.std`` (which
    uses ``ddof=0`` unless told otherwise).

    Parameters
    ----------
    data : array-like
        Values to compute the bounds from.
    n_sigma : float, optional
        Half-width of the band in standard deviations. Defaults to 2,
        which is the classic two-sigma rule.
    axis : int or None, optional
        Axis passed through to ``np.mean`` / ``np.std``. The default
        ``None`` keeps the original behaviour of calling them without an
        explicit axis; ``axis=0`` yields one pair of bounds per column.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)``.
    """
    center = np.mean(data, axis=axis)
    spread = np.std(data, axis=axis)
    # Compute the half-width once so both bounds are exactly symmetric.
    half_width = n_sigma * spread
    return center - half_width, center + half_width
def remove_outliers(data1, data2):
# 对应列相减
residuals = data1 - data2
# 计算每一列的均值和标准差
column_means = np.mean(residuals, axis=0)
column_std_devs = np.std(residuals, axis=0)
# 计算每一列的标准化残差
standardized_residuals = (residuals - column_means) / column_std_devs
# 对于每一列,计算异常值的上下界
lower_bound, upper_bound = two_sigma_rule(standardized_residuals)
# 将异常值所在位置设为 NaN
outliers_indices = np.where((standardized_residuals < lower_bound) | (standardized_residuals > upper_bound))
outliers_rows, outliers_cols = outliers_indices
# 将异常值位置的值赋为 NaN
# 将异常值位置的值赋为 NaN
for row, col in zip(outliers_ro