异常检测作业

题目1:检测异常服务器

代码:

import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt

def get_means_and_variance(X, Variance):
    """Estimate the Gaussian parameters of a sample matrix.

    Args:
        X: 2-D array with one sample per row and one feature per column.
        Variance: when truthy, the second return value is the full
            covariance matrix; when falsy, it is the 1-D array of
            per-feature variances.

    Returns:
        Tuple ``(means, sigma2)`` where ``means`` holds the column means
        of ``X`` and ``sigma2`` is the covariance matrix (divided by the
        number of samples) or the result of ``np.var`` along axis 0.
    """
    means = np.mean(X, axis=0)
    if not Variance:
        # Independent-feature model: only the per-feature spread is needed.
        return means, np.var(X, axis=0)

    # Full model: outer products of the centered samples, averaged over rows.
    centered = X - means
    return means, centered.T @ centered / len(X)


def gaussian(X, means, sigma2):
    """Evaluate a multivariate Gaussian density at every row of ``X``.

    Args:
        X: 2-D array with one sample per row.
        means: per-feature means, subtracted from every row of ``X``.
        sigma2: either a 1-D array of per-feature variances (expanded to a
            diagonal covariance matrix) or a square covariance matrix.

    Returns:
        Array of shape ``(m, 1)`` holding the density of each of the ``m``
        samples.
    """
    if np.ndim(sigma2) == 1:
        # Variances only: build the equivalent diagonal covariance matrix so
        # the same matrix formula applies to both cases.
        sigma2 = np.diag(sigma2)

    centered = X - means
    n = centered.shape[1]

    norm_const = np.power(2 * np.pi, -n / 2) * (np.linalg.det(sigma2) ** (-0.5))
    # Row-wise quadratic form x^T * inv(sigma2) * x. Computing it per row
    # avoids materialising the full m x m product only to keep its diagonal.
    quad_form = np.sum((centered @ np.linalg.inv(sigma2)) * centered, axis=1)
    p = norm_const * np.exp(-0.5 * quad_form)

    return p.reshape(-1, 1)


def plot_gaussian(X, means, sigma2):
    """Draw the samples and the contour lines of the fitted Gaussian.

    Plots onto the current matplotlib figure and does not call
    ``plt.show()``. The density is evaluated on a fixed grid covering
    0 (inclusive) to 30 (exclusive) in steps of 0.5 on both axes.

    Args:
        X: 2-D array of samples; columns 0 and 1 are plotted.
        means: Gaussian means, passed through to ``gaussian``.
        sigma2: variances or covariance matrix, passed through to ``gaussian``.
    """
    ticks = np.arange(0, 30, 0.5)
    grid_x, grid_y = np.meshgrid(ticks, ticks)

    # Flatten the grid into an (N, 2) list of points, evaluate the density
    # there, then fold the result back into the grid's shape for contouring.
    grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
    density = gaussian(grid_points, means, sigma2).reshape(grid_x.shape)

    plt.plot(X[:, 0], X[:, 1], 'bx')
    # Contour levels spaced three decades apart: 1e-20, 1e-17, ..., 1e-2.
    levels = [10**exponent for exponent in range(-20, 0, 3)]
    plt.contour(grid_x, grid_y, density, levels)



def select_epsilon(y_val, p):
    """Choose the density threshold that maximises F1 on labelled data.

    A sample is predicted anomalous when its density is strictly below the
    threshold. 1000 evenly spaced candidates between the smallest and the
    largest density are tried; the first candidate reaching the highest F1
    wins.

    Args:
        y_val: ground-truth labels (1 = anomaly, 0 = normal). Any shape
            with one entry per sample, e.g. ``(m,)`` or ``(m, 1)``.
        p: densities of the same samples, one entry per sample.

    Returns:
        Tuple ``(bestEpsilon, bestF1)``. Both are 0 when no candidate
        achieves a positive F1. ``bestEpsilon`` is a scalar.

    Raises:
        ValueError: if ``p`` is empty or its size differs from ``y_val``.
    """
    # Flatten both inputs so a (m, 1) array compared with a (m,) array cannot
    # silently broadcast to (m, m) and corrupt the confusion counts.
    labels = np.asarray(y_val).ravel()
    probs = np.asarray(p).ravel()
    if probs.size == 0:
        raise ValueError("p must contain at least one probability")
    if labels.size != probs.size:
        raise ValueError("y_val and p must have the same number of samples")

    # The label masks do not depend on the threshold; compute them once.
    is_anomaly = labels == 1
    is_normal = labels == 0

    bestEpsilon = 0
    bestF1 = 0
    for e in np.linspace(probs.min(), probs.max(), 1000):
        flagged = probs < e
        tp = np.count_nonzero(is_anomaly & flagged)   # anomalies correctly flagged
        fp = np.count_nonzero(is_normal & flagged)    # normal samples wrongly flagged
        fn = np.count_nonzero(is_anomaly & ~flagged)  # anomalies that were missed
        prec = tp / (tp + fp) if (tp + fp) else 0  # precision
        rec = tp / (tp + fn) if (tp + fn) else 0   # recall
        F1_e = 2 * prec * rec / (prec + rec) if (prec + rec) else 0
        if F1_e > bestF1:
            bestF1 = F1_e
            bestEpsilon = e
    return bestEpsilon, bestF1



# Load the ex8data1 data set: training samples plus a labelled validation split.
mat = sio.loadmat('./data/ex8data1.mat')
print(mat.keys())
X = mat['X']
X_val, y_val = mat['Xval'], mat['yval']
print(X.shape, X_val.shape, y_val.shape)

# Scatter plot of the raw training samples.
plt.plot(X[:, 0], X[:, 1], 'bx')
plt.show()

# Fit the Gaussian using per-feature variances and draw its contour lines.
means, sigma2 = get_means_and_variance(X, Variance=False)
plot_gaussian(X, means, sigma2)
plt.show()

# Pick the density threshold that gives the best F1 on the validation split.
pval = gaussian(X_val, means, sigma2)
bestEpsilon, bestF1 = select_epsilon(y_val, pval)
print(bestEpsilon, bestF1)

# Flag training samples whose density falls below the threshold. A boolean
# mask keeps `anoms` two-dimensional even when nothing is flagged, so the
# column indexing in the scatter call below cannot fail on an empty result.
p = gaussian(X, means, sigma2)
anoms = X[p.ravel() < bestEpsilon]
plot_gaussian(X, means, sigma2)
plt.scatter(anoms[:, 0], anoms[:, 1], c='r', marker='o')
plt.show()

输出:

dict_keys(['__header__', '__version__', '__globals__', 'X', 'Xval', 'yval'])
(307, 2) (307, 2) (307, 1)
[8.99985263e-05] 0.8750000000000001

原始数据散点图

数据的高斯分布等高线图

将异常值标记出来

题目2:高维数据的异常检测

代码:

# Load the ex8data2 data set: training samples plus a labelled validation split.
mat2 = sio.loadmat('./data/ex8data2.mat')
print(mat2.keys())
X2 = mat2['X']
X2_val, y2_val = mat2['Xval'], mat2['yval']
print(X2.shape, X2_val.shape, y2_val.shape)

# Same pipeline as for the first data set: fit with per-feature variances,
# choose the threshold on the validation split, then flag training samples.
means_2, sigma2_2 = get_means_and_variance(X2, Variance=False)
pval_2 = gaussian(X2_val, means_2, sigma2_2)
bestEpsilon_2, bestF1_2 = select_epsilon(y2_val, pval_2)
p2 = gaussian(X2, means_2, sigma2_2)
# Boolean mask selects the rows whose density is below the threshold.
anoms2 = X2[p2.ravel() < bestEpsilon_2]
print(len(anoms2))  # the author reports 122 when the covariance matrix is used

输出:

dict_keys(['__header__', '__version__', '__globals__', 'X', 'Xval', 'yval'])
(1000, 11) (100, 11) (100, 1)
117

小结:在本算法中,使用协方差矩阵还是各特征的方差,对精确率和召回率的影响都不大;但在这里,前者检测出的异常值(122个)比后者(117个)多。我想这是因为协方差矩阵还考虑了不同特征之间的相关性,而不只是各个维度上单独的偏差。

作业批改参考:https://www.bilibili.com/video/BV124411A75S?spm_id_from=333.788.videopod.episodes&vd_source=867b8ecbd62561f6cb9b4a83a368f691&p=13

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值