# 机器学习练习8 异常检验算法
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from scipy.io import loadmat
# Load the exercise data set and plot the first two columns of X.
# The path is a raw string: it is a Windows path whose backslashes would
# otherwise be read as escape sequences (`\P`, `\数` and `\e` are not valid
# escapes, and newer Python versions emit a warning for them). The resulting
# string value is unchanged.
data = loadmat(r'E:\PyCharm\数据\ex8data1.mat')
X = data['X']
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(X[:, 0], X[:, 1])
plt.show()
# Return the mean and variance of every feature (column).
def estimate_gaussian(X):
    """Compute the per-column mean and variance of ``X``.

    Args:
        X: array-like object exposing ``mean(axis=0)`` and ``var(axis=0)``;
            the reduction runs along axis 0, so each column is summarised
            independently.

    Returns:
        A ``(mu, sigma)`` tuple where ``mu`` is ``X.mean(axis=0)`` and
        ``sigma`` is ``X.var(axis=0)``.
    """
    mu = X.mean(axis=0)
    # NOTE: despite its name, `sigma` holds the variance (sigma squared),
    # not the standard deviation. Take the square root before passing it to
    # APIs that expect a standard deviation.
    sigma = X.var(axis=0)
    return mu, sigma
# Fit the per-feature Gaussian parameters on the data set loaded above.
mu, sigma = estimate_gaussian(X)
# Additional arrays stored in the same .mat file under 'Xval' and 'yval'
# (presumably the labelled validation split -- confirm against the data set).
Xval = data['Xval']
yval = data['yval']
# Use scipy's built-in normal distribution to evaluate the probability
# density of a data point.
from scipy import stats
# stats.norm expects the standard deviation as its scale argument, whereas
# estimate_gaussian returns the variance (X.var), hence the square root.
dist = stats.norm(mu[0], np.sqrt(sigma[0]))
# Density of the first feature's distribution at x = 15. The value is
# discarded when run as a script; it is only echoed in an interactive session.
dist.pdf(15)
# Find the best threshold given probability-density values and true labels.
def select_threshold(pval, yval):
    """Pick the density threshold that maximises the F1 score.

    Roughly 100 evenly spaced candidate thresholds between ``pval.min()``
    and ``pval.max()`` are tried. For each candidate, an entry is predicted
    anomalous when its density is strictly below the candidate.

    Args:
        pval: array of probability-density values.
        yval: ground-truth labels (1 = anomaly, 0 = normal). Must broadcast
            against ``pval``. When ``pval`` is 1-D, ``yval`` is flattened
            first so that an (m, 1) label column is not broadcast to (m, m).

    Returns:
        A ``(best_epsilon, best_f1)`` tuple. Both are 0 when no candidate
        produces a true positive or when all densities are identical.
    """
    pval = np.asarray(pval)
    yval = np.asarray(yval)
    if pval.ndim == 1:
        yval = yval.ravel()

    best_epsilon = 0
    best_f1 = 0

    lowest, highest = pval.min(), pval.max()
    step = (highest - lowest) / 100
    if step == 0:
        # Every density is the same, so there is no range to scan and
        # np.arange cannot be called with a zero step.
        return best_epsilon, best_f1

    is_anomaly = yval == 1
    is_normal = yval == 0
    for epsilon in np.arange(lowest, highest, step):
        preds = pval < epsilon
        tp = float(np.sum(preds & is_anomaly))
        if tp == 0:
            # Precision/recall would be 0 or 0/0 here; such a candidate can
            # never beat the current best, so skip it without dividing.
            continue
        fp = float(np.sum(preds & is_normal))
        fn = float(np.sum(~preds & is_anomaly))
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = (2 * precision * recall) / (precision + recall)
        # Strict comparison keeps the smallest threshold among ties.
        if f1 > best_f1:
            best_f1 = f1
            best_epsilon = epsilon
    return best_epsilon, best_f1
# Apply the threshold to the data set and visualise the result.
# NOTE(review): the original snippet used `p` and `epsilon` without defining
# them anywhere (NameError). They are reconstructed here from the fitted
# parameters: the joint density of each sample is the product of the
# per-feature normal densities, and the threshold is tuned on the validation
# arrays. Confirm this matches the intended model.
feature_std = np.sqrt(sigma)  # `sigma` holds variances; stats.norm wants std
p = stats.norm(mu, feature_std).pdf(X).prod(axis=1)
pval = stats.norm(mu, feature_std).pdf(Xval).prod(axis=1)
# yval is flattened so it lines up element-wise with the 1-D pval.
epsilon, _ = select_threshold(pval, yval.ravel())
outliers = np.where(p < epsilon)
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(X[:, 0], X[:, 1])
# Overplot the flagged samples in red on top of the full scatter.
ax.scatter(X[outliers[0], 0], X[outliers[0], 1], s=50, color='r', marker='o')
plt.show()
# Machine Learning -- Homework 8
# (Scraped blog footer: "latest recommended article published 2021-12-02 00:53:25")