吴恩达机器学习课程:编程练习 | (6) ex6-SVM

博客围绕Python使用支持向量机(SVM)进行垃圾邮件检测展开。内容包含邮件分类预处理,介绍了线性-SVM和高斯核函数-SVM,还涉及寻找最优参数的过程,利用SVM技术完成垃圾邮件的分类检测。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

1. 垃圾邮件检测

"""
案例:判断一封邮件是否是垃圾邮件
"""

from sklearn import svm
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import scipy.io as sio

data1 = sio.loadmat('data/spamTrain.mat')  # training data
data2 = sio.loadmat('data/spamTest.mat') # Testing data
print(data1.keys(), data2.keys())
X, y = data1['X'], data1['y']
X_test, y_test = data2['Xtest'], data2['ytest']
print(X.shape, y.shape, X_test.shape, y_test.shape)
svc = svm.SVC()
svc.fit(X, y.flatten())
pred = svc.predict(X_test)
print(svc)
print(metrics.classification_report(y_test.flatten(), pred))

# 线性回归
logit = LogisticRegression()
logit.fit(X, y.flatten())
pred_l = logit.predict(X_test)
print(logit)
print(metrics.classification_report(y_test.flatten(), pred_l))

print(X)

2. 寻找最优参数

"""
寻找最优参数C和gamma
数据集:data/ex6data3.mat
"""

import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import classification_report


def plot_data():
    """Scatter-plot the training points X, colored by their labels y.

    Relies on module-level globals X and y set in the __main__ block.
    """
    labels = y.flatten()
    plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='jet')
    plt.xlabel('x1')
    plt.ylabel('y1')


def plot_boundary(model):
    """Draw the model's decision boundary as a contour over a fixed window."""
    grid_x = np.linspace(-0.6, 0.4, 500)
    grid_y = np.linspace(-0.7, 0.6, 500)
    xx, yy = np.meshgrid(grid_x, grid_y)

    # Predict on every grid point, then reshape back to the grid for contouring.
    points = np.c_[xx.flatten(), yy.flatten()]
    zz = model.predict(points).reshape(xx.shape)
    plt.contour(xx, yy, zz)

if __name__ =="__main__":
    mat = sio.loadmat('data/ex6data3.mat')
    X, y = mat['X'], mat['y']
    X_val, y_val = mat['Xval'], mat['yval']
    print(X.shape,y.shape)
    plot_data()

    candidate = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
    combination = [(C, gamma) for C in candidate for gamma in candidate]
    print(len(combination))
    search = []
    for C, gamma in combination:
        svc = SVC(C=C, gamma=gamma)
        svc.fit(X, y.flatten())
        search.append(svc.score(X_val, y_val))

    best_score = search[np.argmax(search)] # search最大值--这里有四个相同的最大值(实际上对应四组不同的C,gamma),默认出现第一次的最大值
    best_param = combination[np.argmax(search)] # combination与search一一对应关系,search最大值对应的索引,也就是combination对应的最佳参数
    print(len(search))

   # best_svc = SVC(C=100, gamma=0.3,kernel="rbf") # 线性分类
    best_svc = SVC(C=0.3, gamma=100, kernel="rbf") # 非线性分类,径向基函数即,核函数为高斯函数
    best_svc.fit(X, y.flatten())
    ypred = best_svc.predict(X_val)

    print(classification_report(y_val.flatten(), ypred))
    plot_boundary(best_svc)
    plt.show()

3. 线性-SVM

"""
线性--支持向量机
任务:观察C取值对决策边界的影响
数据集:data/ex6data1.mat
"""

import numpy as np
import pandas as pd
import seaborn as sns
sns.set(context="notebook", style='darkgrid', palette='deep')
import scipy.io as sio
import matplotlib.pyplot as plt


def plot_data():
    """Scatter-plot the raw points from the module-level DataFrame df."""
    fig, axis = plt.subplots(figsize=(8, 6))
    axis.scatter(df['X1'], df['X2'], s=50, c=df['y'], cmap='jet')
    axis.set_title('Raw data')
    axis.set_xlabel('X1')
    axis.set_ylabel('X2')


def plot_boundary(model):
    """Plot the raw data and overlay the model's decision boundary."""
    xs = np.linspace(-0.5, 4.5, 500)
    ys = np.linspace(1.3, 5, 500)
    xx, yy = np.meshgrid(xs, ys)

    # Predict class for every grid point, then reshape for contouring.
    zz = model.predict(np.c_[xx.flatten(), yy.flatten()]).reshape(xx.shape)
    plot_data()
    plt.contour(xx, yy, zz)


if __name__ == '__main__':
    data = sio.loadmat('./data/ex6data1.mat')
    print(data['X'].shape)
    df = pd.DataFrame({'X1': data['X'][:, 0].flatten(),
                       'X2': data['X'][:, 1].flatten(),
                       'y': data['y'].flatten()})
    print(df.head())
    from sklearn.svm import SVC

    # -----------------C=1-------------------------
    clf = SVC(C=1, kernel='linear')
    clf.fit(data['X'], data['y'].flatten())
    y_pred = clf.predict(data['X'])
    print(y_pred, clf.score(data['X'], data['y'].flatten()))
    df['SVM1 Confidence'] = clf.decision_function(df[['X1', 'X2']])
    plot_boundary(clf)

    # -----------------C=100-------------------------
    clf1 = SVC(C=100, kernel='linear')
    clf1.fit(data['X'], data['y'].flatten())
    # Bug fix: the original predicted with clf (the C=1 model) here, so the
    # printed predictions did not match the C=100 score next to them.
    y_pred = clf1.predict(data['X'])
    print(y_pred, clf1.score(data['X'], data['y'].flatten()))
    plot_boundary(clf1)

    df['SVM100 Confidence'] = clf1.decision_function(df[['X1', 'X2']])
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(df['X1'], df['X2'], s=50, c=df['SVM100 Confidence'], cmap='autumn')
    ax.set_title('SVM (C=100) Decision Confidence')
    print(df.head())
    plt.show()

4. 邮件分类预处理

"""
垃圾邮件分类---预处理
"""

'''
预处理主要包括以下9个部分:
  1. 将大小写统一成小写字母;
  2. 移除所有HTML标签,只保留内容。
  3. 将所有的网址替换为字符串 “httpaddr”.
  4. 将所有的邮箱地址替换为 “emailaddr”
  5. 将所有dollar符号($)替换为“dollar”.
  6. 将所有数字替换为“number”
  7. 将所有单词还原为词源,词干提取
  8. 移除所有非文字类型
  9. 去除空字符串‘’
'''

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import loadmat
from sklearn import svm
import nltk.stem as ns
import re


def preprocessing(email):
    """Normalize a raw email body and return its list of stemmed word tokens.

    Steps: lowercase; strip HTML tags; replace URLs/email addresses/dollar
    signs/numbers with placeholder words; split on punctuation; strip
    non-alphanumeric characters from each token; drop empty tokens; stem
    each remaining token with the English Snowball stemmer.

    :param email: raw email text (str)
    :return: list of stemmed tokens (list[str])
    """
    # 1. Normalize case.
    email = email.lower()

    # 2. Strip HTML tags. Bug fix: the original pattern '<[^<>]>' only
    # matched tags whose body was a single character; '+' matches real tags.
    email = re.sub(r'<[^<>]+>', ' ', email)

    # 3. Replace URLs with the placeholder "httpaddr".
    email = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email)

    # 4. Replace email addresses with "emailaddr".
    email = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email)

    # 5. Replace dollar signs with "dollar".
    email = re.sub(r'[\$]+', 'dollar', email)

    # 6. Replace digit runs with "number".
    email = re.sub(r'[0-9]+', 'number', email)

    # 7. Split into tokens on whitespace and punctuation.
    tokens = re.split(r'[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\}\,\'\"\>\_\<\;\%]', email)

    stemmer = ns.SnowballStemmer('english')
    tokenlist = []
    for token in tokens:
        # 8. Strip non-alphanumeric characters from the token. Bug fix: the
        # original re-ran this substitution on the whole email every
        # iteration (a no-op on tokens, and wasted work).
        token = re.sub(r'[^a-zA-Z0-9]', '', token)

        # 9. Skip tokens that are now empty.
        if not token:
            continue

        tokenlist.append(stemmer.stem(token))

    return tokenlist


def email2VocabIndices(email, vocab):
    """Return the indices into `vocab` of the words that occur in the email.

    Bug fix: the original returned positions within the email's *token list*
    rather than indices into *vocab*, so the feature vector built from the
    result flagged the wrong vocabulary entries.

    :param email: raw email text (str)
    :param vocab: vocabulary words; may be an (n, 1) array from
                  DataFrame.values — it is flattened before matching
    :return: sorted list of vocabulary indices present in the email
    """
    tokens = preprocessing(email)
    print(tokens)
    token_set = set(tokens)  # O(1) membership tests instead of list scans
    words = np.ravel(vocab)  # collapse (n, 1) -> (n,)
    index = [i for i in range(len(words)) if words[i] in token_set]
    return index


def email2FeatureVector(email):
    """
    Convert an email into a 0/1 occurrence vector of length len(vocab):
    positions whose vocabulary word appears in the email are set to 1,
    everything else stays 0.
    """
    # Load the vocabulary file as an array of words.
    vocab = pd.read_table('data/vocab.txt', names=['words']).values
    vector = np.zeros(len(vocab))  # start all-zero
    vocab_indices = email2VocabIndices(email, vocab)
    print(vocab_indices)  # indices whose words were found in the email
    # Flip on the positions that were found.
    for idx in vocab_indices:
        vector[idx] = 1
    return vector


if __name__ == '__main__':
    # Read the sample email and echo its raw contents.
    with open("data/emailSample1.txt") as fh:
        sample_email = fh.read()
        print(sample_email)

    email = preprocessing(sample_email)  # tokenized + stemmed form
    vector = email2FeatureVector(sample_email)
    print('length of vector = {}\nnum of non-zero = {}'.format(len(vector), int(vector.sum())))
    print(vector.shape, vector)

5. 高斯核函数-SVM

"""
非线性--支持向量机
任务:使用高斯核函数解决线性不可分问题,并观察gamma取值对模型复杂度的影响
数据集:data/ex6data2.mat

"""

import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
from sklearn.svm import SVC


def plot_data():
    """Scatter-plot the training points X, colored by label y (module globals)."""
    colors = y.flatten()
    plt.scatter(X[:, 0], X[:, 1], c=colors, cmap='jet')
    plt.xlabel('x1')
    plt.ylabel('y1')


def plot_boundary(model):
    """Overlay the model's decision boundary on the raw-data scatter plot."""
    # Build a mesh over a fixed window; contouring needs grid-shaped inputs.
    xx, yy = np.meshgrid(np.linspace(0, 1, 500),
                         np.linspace(0.4, 1, 500))
    grid = np.c_[xx.flatten(), yy.flatten()]
    z = model.predict(grid)
    print(z.shape)
    zz = z.reshape(xx.shape)
    plot_data()
    plt.contour(xx, yy, zz)

if __name__ == "__main__":
    data = sio.loadmat('./data/ex6data2.mat')
    X, y = data['X'], data['y']
    # print(data)
    plot_data()

    clf = SVC(C=1, kernel='rbf', gamma=100)
    clf.fit(X, y.flatten())
    print(clf.score(X, y.flatten()))
    plot_boundary(clf)
    plt.show()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值