1. 垃圾邮件检测
"""
案例:判断一封邮件是否是垃圾邮件
"""
from sklearn import svm
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import scipy.io as sio
# Load the training and test splits from their .mat files.
data1 = sio.loadmat('data/spamTrain.mat')
data2 = sio.loadmat('data/spamTest.mat')
print(data1.keys(), data2.keys())

X = data1['X']
y = data1['y']
X_test = data2['Xtest']
y_test = data2['ytest']
print(X.shape, y.shape, X_test.shape, y_test.shape)

# Flatten the label arrays once; every estimator call below uses the 1-D form.
train_labels = y.flatten()
test_labels = y_test.flatten()

# Support vector classifier with scikit-learn's default settings.
svc = svm.SVC()
svc.fit(X, train_labels)
pred = svc.predict(X_test)
print(svc)
print(metrics.classification_report(test_labels, pred))

# Logistic regression (a classifier, not linear regression) on the same split.
logit = LogisticRegression()
logit.fit(X, train_labels)
pred_l = logit.predict(X_test)
print(logit)
print(metrics.classification_report(test_labels, pred_l))

print(X)
2. 寻找最优参数
"""
寻找最优参数C和gamma
数据集:data/ex6data3.mat
"""
import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import classification_report
def plot_data():
    """Scatter the first two columns of the global ``X``, coloured by ``y``.

    Relies on the module-level names ``X`` and ``y`` being assigned before
    the call; draws on the current matplotlib figure and returns nothing.
    """
    first_column = X[:, 0]
    second_column = X[:, 1]
    labels = y.flatten()
    plt.scatter(first_column, second_column, c=labels, cmap='jet')
    plt.xlabel('x1')
    plt.ylabel('y1')
def plot_boundary(model):
    """Draw contour lines of ``model``'s predictions over a fixed grid.

    The grid has 500 x 500 points spanning x in [-0.6, 0.4] and
    y in [-0.7, 0.6]. ``model`` must expose ``predict`` accepting an
    (n_points, 2) array. The contour is added to the current figure.
    """
    grid_x = np.linspace(-0.6, 0.4, 500)
    grid_y = np.linspace(-0.7, 0.6, 500)
    xx, yy = np.meshgrid(grid_x, grid_y)

    # One row per grid point: (x, y).
    grid_points = np.column_stack((xx.flatten(), yy.flatten()))
    predictions = model.predict(grid_points)

    # Back to the grid layout so contour() can pair each value with its cell.
    plt.contour(xx, yy, predictions.reshape(xx.shape))
if __name__ == "__main__":
    # Load the training split (X, y) and the validation split (Xval, yval).
    mat = sio.loadmat('data/ex6data3.mat')
    X, y = mat['X'], mat['y']
    X_val, y_val = mat['Xval'], mat['yval']
    print(X.shape, y.shape)
    plot_data()

    # Grid search: every (C, gamma) pair drawn from the same candidate list.
    candidate = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
    combination = [(C, gamma) for C in candidate for gamma in candidate]
    print(len(combination))

    # Fit on the training split and record the validation accuracy of each
    # pair. ``search`` stays index-aligned with ``combination``.
    search = []
    for C, gamma in combination:
        svc = SVC(C=C, gamma=gamma)
        svc.fit(X, y.flatten())
        search.append(svc.score(X_val, y_val.flatten()))

    # np.argmax returns the first index on ties, so when several pairs share
    # the top score the earliest one in ``combination`` is selected.
    best_index = int(np.argmax(search))
    best_score = search[best_index]
    best_param = combination[best_index]
    print(len(search))
    print(best_score, best_param)

    # Refit with the pair the search actually selected, instead of constants
    # copied by hand, so the final model always matches the search result.
    best_C, best_gamma = best_param
    best_svc = SVC(C=best_C, gamma=best_gamma, kernel="rbf")
    best_svc.fit(X, y.flatten())
    ypred = best_svc.predict(X_val)
    print(classification_report(y_val.flatten(), ypred))
    plot_boundary(best_svc)
    plt.show()
3. 线性-SVM
"""
线性--支持向量机
任务:观察C取值对决策边界的影响
数据集:data/ex6data1.mat
"""
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(context="notebook", style='darkgrid', palette='deep')
import scipy.io as sio
import matplotlib.pyplot as plt
def plot_data():
    """Open a new 8x6 figure and scatter the global ``df`` coloured by label.

    Reads the ``X1``, ``X2`` and ``y`` columns of the module-level ``df``,
    which must be assigned before the call. Returns nothing.
    """
    _, axes = plt.subplots(figsize=(8, 6))
    axes.scatter(df['X1'], df['X2'], s=50, c=df['y'], cmap='jet')
    axes.set_title('Raw data')
    axes.set_xlabel('X1')
    axes.set_ylabel('X2')
def plot_boundary(model):
    """Plot the raw data, then overlay contour lines of ``model``'s predictions.

    Predictions are evaluated on a 500 x 500 grid spanning x in [-0.5, 4.5]
    and y in [1.3, 5]. ``model`` must expose ``predict`` accepting an
    (n_points, 2) array.
    """
    grid_x = np.linspace(-0.5, 4.5, 500)
    grid_y = np.linspace(1.3, 5, 500)
    xx, yy = np.meshgrid(grid_x, grid_y)

    # One row per grid point: (x, y).
    grid_points = np.column_stack((xx.flatten(), yy.flatten()))
    predictions = model.predict(grid_points)
    zz = predictions.reshape(xx.shape)

    # plot_data() opens the figure that the contour is then drawn onto.
    plot_data()
    plt.contour(xx, yy, zz)
if __name__ == '__main__':
    data = sio.loadmat('./data/ex6data1.mat')
    print(data['X'].shape)
    # Tabular view of the same samples; plot_data() reads this global.
    df = pd.DataFrame({
        'X1': data['X'][:, 0].flatten(),
        'X2': data['X'][:, 1].flatten(),
        'y': data['y'].flatten(),
    })
    print(df.head())
    from sklearn.svm import SVC

    # Both models are fitted and evaluated on the same raw arrays. Passing the
    # arrays (rather than DataFrame columns) to decision_function keeps the
    # input type consistent with fit() and avoids scikit-learn's
    # feature-name mismatch warning.
    features = data['X']
    labels = data['y'].flatten()

    # ----------------- C=1 -------------------------
    clf = SVC(C=1, kernel='linear')
    clf.fit(features, labels)
    y_pred = clf.predict(features)
    print(y_pred, clf.score(features, labels))
    df['SVM1 Confidence'] = clf.decision_function(features)
    plot_boundary(clf)

    # ----------------- C=100 -----------------------
    clf1 = SVC(C=100, kernel='linear')
    clf1.fit(features, labels)
    # Predict with the C=100 model so the printed predictions belong to the
    # same classifier as the printed score.
    y_pred = clf1.predict(features)
    print(y_pred, clf1.score(features, labels))
    plot_boundary(clf1)
    df['SVM100 Confidence'] = clf1.decision_function(features)

    # Colour each sample by its signed decision value under the C=100 model.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(df['X1'], df['X2'], s=50, c=df['SVM100 Confidence'], cmap='autumn')
    ax.set_title('SVM (C=100) Decision Confidence')
    print(df.head())
    plt.show()
4. 邮件分类预处理
"""
垃圾邮件分类---预处理
"""
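'''
Preprocessing consists of the following 9 steps:
1. Convert all letters to lower case.
2. Strip all HTML tags, keeping only their content.
3. Replace every URL with the string "httpaddr".
4. Replace every e-mail address with "emailaddr".
5. Replace every dollar sign ($) with "dollar".
6. Replace every number with "number".
7. Reduce every word to its stem (stemming).
8. Remove all non-alphanumeric characters.
9. Drop empty strings ''.
'''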
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import loadmat
from sklearn import svm
import nltk.stem as ns
import re
def preprocessing(email, stemmer=None):
    """Normalise an e-mail body and return its list of stemmed tokens.

    Steps, in order: lower-case; replace HTML tags with a space; replace
    URLs with ``httpaddr``; replace e-mail addresses with ``emailaddr``;
    replace runs of ``$`` with ``dollar``; replace runs of digits with
    ``number``; split on whitespace and punctuation; strip remaining
    non-alphanumeric characters from each token; drop empty tokens; stem.

    Args:
        email: Raw e-mail text.
        stemmer: Optional object exposing ``stem(str) -> str``. When omitted,
            an NLTK English ``SnowballStemmer`` is created.

    Returns:
        List of stemmed, non-empty tokens in order of appearance.
    """
    if stemmer is None:
        stemmer = ns.SnowballStemmer('english')

    # 1. Lower-case everything.
    email = email.lower()
    # 2. Strip HTML tags. The ``+`` matters: without it only one-character
    #    tags such as <b> would match.
    email = re.sub(r'<[^<>]+>', ' ', email)
    # 3. URLs -> "httpaddr".
    email = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email)
    # 4. E-mail addresses -> "emailaddr".
    email = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email)
    # 5. Dollar signs -> "dollar".
    email = re.sub(r'[$]+', 'dollar', email)
    # 6. Runs of digits -> "number".
    email = re.sub(r'[0-9]+', 'number', email)

    # 7. Tokenise on whitespace (including newlines and tabs) and punctuation,
    #    so words on adjacent lines are not glued together in step 8.
    tokens = re.split(r'[\s@$/#.\-:&*+=\[\]?!(){},\'">_<;%]', email)

    tokenlist = []
    for token in tokens:
        # 8. Strip non-alphanumeric characters from this token (not from the
        #    whole e-mail, which has already been split).
        token = re.sub(r'[^a-zA-Z0-9]', '', token)
        # 9. Skip tokens that are empty after cleaning.
        if not token:
            continue
        tokenlist.append(stemmer.stem(token))
    return tokenlist
def email2VocabIndices(email, vocab):
    """Return the indices of the vocabulary words that occur in ``email``.

    Args:
        email: Raw e-mail text; it is tokenised with ``preprocessing``.
        vocab: Sequence or array of vocabulary words. It is flattened, so a
            single-column 2-D array (e.g. ``DataFrame.values``) is accepted.

    Returns:
        Ascending list of 0-based positions ``i`` such that the ``i``-th
        vocabulary word is one of the e-mail's tokens.
    """
    token = preprocessing(email)
    print(token)
    # Index into the vocabulary, not into the token list: callers use these
    # values to address a vector whose length equals the vocabulary size.
    present = set(token)
    index = [i for i, word in enumerate(np.ravel(vocab)) if word in present]
    return index
def email2FeatureVector(email):
    """Build a 0/1 vector for ``email`` with one slot per vocabulary row.

    The vocabulary is read from ``data/vocab.txt``. Every index reported by
    ``email2VocabIndices`` is set to 1 and all other slots stay 0.

    Returns:
        1-D float array whose length is the number of vocabulary rows.
    """
    vocab_frame = pd.read_table('data/vocab.txt', names=['words'])
    vocab = vocab_frame.values  # ndarray view of the table

    vocab_indices = email2VocabIndices(email, vocab)
    print(vocab_indices)

    # Start from all zeros and switch on the reported positions.
    vector = np.zeros(len(vocab))
    for position in vocab_indices:
        vector[position] = 1
    return vector
if __name__ == '__main__':
    # Read the sample e-mail, then show its raw text.
    with open("data/emailSample1.txt") as handle:
        sample_email = handle.read()
    print(sample_email)

    # Token list and feature vector for the same sample.
    email = preprocessing(sample_email)
    vector = email2FeatureVector(sample_email)

    summary = 'length of vector = {}\nnum of non-zero = {}'
    print(summary.format(len(vector), int(vector.sum())))
    print(vector.shape, vector)
5. 高斯核函数-SVM
"""
非线性--支持向量机
任务:使用高斯核函数解决线性不可分问题,并观察gamma取值对模型复杂度的影响
数据集:data/ex6data2.mat
"""
import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
from sklearn.svm import SVC
def plot_data():
    """Scatter the first two columns of the global ``X``, coloured by ``y``.

    Relies on the module-level names ``X`` and ``y`` being assigned before
    the call; draws on the current matplotlib figure and returns nothing.
    """
    first_column = X[:, 0]
    second_column = X[:, 1]
    labels = y.flatten()
    plt.scatter(first_column, second_column, c=labels, cmap='jet')
    plt.xlabel('x1')
    plt.ylabel('y1')
def plot_boundary(model):
    """Plot the data, then overlay contour lines of ``model``'s predictions.

    Predictions are evaluated on a 500 x 500 grid spanning x in [0, 1] and
    y in [0.4, 1]. ``model`` must expose ``predict`` accepting an
    (n_points, 2) array. The shape of the flat prediction array is printed.
    """
    grid_x = np.linspace(0, 1, 500)
    grid_y = np.linspace(0.4, 1, 500)
    # contour() needs values laid out on a mesh, so build the mesh first.
    xx, yy = np.meshgrid(grid_x, grid_y)

    # One row per grid point: (x, y).
    grid_points = np.column_stack((xx.flatten(), yy.flatten()))
    predictions = model.predict(grid_points)
    print(predictions.shape)
    zz = predictions.reshape(xx.shape)

    plot_data()
    plt.contour(xx, yy, zz)
if __name__ == "__main__":
    data = sio.loadmat('./data/ex6data2.mat')
    X = data['X']
    y = data['y']
    plot_data()

    # RBF-kernel SVM with C=1 and gamma=100, fitted and scored on the same data.
    labels = y.flatten()
    clf = SVC(C=1, kernel='rbf', gamma=100)
    clf.fit(X, labels)
    print(clf.score(X, labels))

    plot_boundary(clf)
    plt.show()