'''
#################################################
# Author : 余欢
# Date : Dec 28, 2015 4:09:29 PM
#company : 南京师范大学--大数据实验室
# description : 清理异常值
#################################################
'''
'''
1 Generate random data between 0 and 1;
2 Add a few outliers to it;
3 Detect the outliers with is_outlier();
4 Plot both data sets (x and filtered) and compare them.
'''
import numpy as np
import matplotlib.pyplot as plt
def is_outlier(points, threshold=3.5):
    """
    Flag outliers using the modified z-score based on the median absolute
    deviation (MAD).

    Parameters
    ----------
    points : array_like
        Observations. A 1-D input of length n is treated as n observations
        of a single feature; for a 2-D input each row is one observation and
        the distance to the column-wise median is the Euclidean norm.
    threshold : float, optional
        A point is an outlier when its modified z-score is strictly greater
        than this value. Defaults to 3.5.

    Returns
    -------
    numpy.ndarray of bool
        One entry per observation: True for outliers, False otherwise.
    """
    # Accept lists/tuples as well as ndarrays.
    points = np.asarray(points)
    if points.ndim == 1:
        points = points[:, None]

    # Nothing to classify; avoids np.median warning about an empty slice.
    if points.size == 0:
        return np.zeros(points.shape[:-1], dtype=bool)

    median = np.median(points, axis=0)
    # Euclidean distance of every observation from the median.
    diff = np.sqrt(np.sum((points - median) ** 2, axis=-1))
    med_abs_deviation = np.median(diff)

    if med_abs_deviation == 0:
        # More than half of the points coincide with the median, so the
        # score would be 0/0 or x/0. Any point that deviates at all is
        # flagged, which is the mask the division would yield, without the
        # divide-by-zero warnings and nan/inf intermediates.
        return diff > 0

    # 0.6745 rescales the MAD so the score is comparable to a standard
    # z-score for normally distributed data.
    modified_z_score = 0.6745 * diff / med_abs_deviation
    return modified_z_score > threshold
# Number of histogram bins shared by both plots.
buckets = 50

# 100 uniform samples in [0, 1) followed by four obviously extreme values.
x = np.random.random(size=100)
x = np.r_[x, -49, 95, 100, -100]

# Keep only the points that is_outlier() does not flag.
filtered = x[np.logical_not(is_outlier(x))]

plt.figure()

# Top panel: the raw data, outliers included.
plt.subplot(2, 1, 1)
plt.hist(x, bins=buckets)
plt.xlabel('Raw')

# Bottom panel: the same data with the outliers removed.
plt.subplot(2, 1, 2)
plt.hist(filtered, bins=buckets)
plt.xlabel('Cleaned')

plt.show()