Support Vector Machines
The cost function of a support vector machine differs from that of logistic regression.
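Concretely, in the course's formulation the two logistic loss terms are replaced by hinge-like surrogates $\text{cost}_1$ and $\text{cost}_0$, and the regularization trade-off is controlled by $C$ instead of $\lambda$:

$$\min_\theta\; C\sum_{i=1}^{m}\left[y^{(i)}\,\text{cost}_1(\theta^Tx^{(i)})+(1-y^{(i)})\,\text{cost}_0(\theta^Tx^{(i)})\right]+\frac{1}{2}\sum_{j=1}^{n}\theta_j^2$$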
An SVM can be adapted to complex non-linear classification by using **kernels**. For example, to train a classifier on samples like the ones below, one option is a high-order polynomial decision boundary:
$$\theta_0+\theta_1x_1+\theta_2x_2+\theta_3x_1x_2+\theta_4x_1^2+\theta_5x_2^2+\dots\geq 0$$
The expression above can also be written as
$$\theta_0+\theta_1f_1+\theta_2f_2+\dots\geq 0\qquad(f_1=x_1,\;f_2=x_2,\;\dots)$$
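With a kernel, the features $f_i$ are no longer polynomial terms but similarities between $x$ and landmarks $l^{(i)}$ (in practice, the training examples themselves). For the Gaussian kernel implemented below:

$$f_i=\text{similarity}(x,l^{(i)})=\exp\left(-\frac{\lVert x-l^{(i)}\rVert^2}{2\sigma^2}\right)$$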
Programming Assignment
This week's programming assignment uses some natural language processing packages I had not worked with before, so I consulted quite a few blog posts.
ex6.py
'''
%% Machine Learning Online Class
% Exercise 6 | Support Vector Machines
%
% Instructions
% ------------
%
% This file contains code that helps you get started on the
% exercise. You will need to complete the following functions:
%
% gaussianKernel.m
% dataset3Params.m
% processEmail.m
% emailFeatures.m
%
% For this exercise, you will not need to change any code in this file,
% or any other files other than those mentioned above.
%
'''
'''
%% =============== Part 1: Loading and Visualizing Data ================
% We start the exercise by first loading and visualizing the dataset.
% The following code will load the dataset into your environment and plot
% the data.
%
'''
import numpy as np
import matplotlib.pyplot as plt
from plotData import *
import scipy.io as scio
print('Part1: Loading and Visualizing Data ...')
# Load from ex6data1:
# You will have X, y in your environment
data = scio.loadmat('D:\课程相关\吴恩达机器学习\Andrew-NG-Meachine-Learning-master\Andrew-NG-Meachine-Learning-master\machine'
'-learning-ex6\machine-learning-ex6\ex6\ex6data1.mat')
X = data['X']
y = data['y'].flatten()
# Plot training data
plotData(X, y)
plt.show()
input('Program paused. Press enter to continue.')
'''
%% ==================== Part 2: Training Linear SVM ====================
% The following code will train a linear SVM on the dataset and plot the
% decision boundary learned.
%
'''
from sklearn.svm import SVC
from visualizeBoundaryLinear import *
print('Part2: Training Linear SVM ...')
# You should try to change the C value below and see how the decision
# boundary varies (e.g., try C = 1000)
#https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
model = SVC(C=1000, kernel='linear')  # max_iter=200
model.fit(X, y)
visualizeBoundaryLinear(X, y, model)
plt.show()
input('Program paused. Press enter to continue.')
'''
%% =============== Part 3: Implementing Gaussian Kernel ===============
% You will now implement the Gaussian kernel to use
% with the SVM. You should complete the code in gaussianKernel.m
%
'''
from gaussianKernel import *
print('Part3: Evaluating the Gaussian Kernel ...')
x1 = np.array([1, 2, 1])
x2 = np.array([0, 4, -1])
sigma = 2
sim = gaussianKernel(x1, x2, sigma)
print('Gaussian Kernel between x1 = [1; 2; 1], x2 = [0; 4; -1], sigma = {} :'
'\n\t{}\n(for sigma = 2, this value should be about 0.324652)'.format(sigma, sim))
input('Program paused. Press enter to continue.')
'''
%% =============== Part 4: Visualizing Dataset 2 ================
% The following code will load the next dataset into your environment and
% plot the data.
%
'''
print('Part4: Loading and Visualizing Data ...')
# Load from ex6data2:
# You will have X, y in your environment
data = scio.loadmat('D:\课程相关\吴恩达机器学习\Andrew-NG-Meachine-Learning-master\Andrew-NG-Meachine-Learning-master\machine'
'-learning-ex6\machine-learning-ex6\ex6\ex6data2.mat')
X = data['X']
y = data['y'].flatten()
# Plot training data
plotData(X, y)
plt.show()
input('Program paused. Press enter to continue.')
'''
%% ========== Part 5: Training SVM with RBF Kernel (Dataset 2) ==========
% After you have implemented the kernel, we can now use it to train the
% SVM classifier.
%
'''
from visualizeBoundary import *
print('Part5: Training SVM with RBF Kernel (this may take 1 to 2 minutes) ...')
# SVM Parameters
C = 1
sigma = 0.1
# We set the tolerance and max_passes lower here so that the code will run
# faster. However, in practice, you will want to run the training to
# convergence.
# sklearn's RBF kernel is exp(-gamma * ||x - x'||^2), so gamma = 1 / (2 * sigma^2)
model = SVC(C=C, kernel='rbf', gamma=1 / (2 * sigma ** 2))
model.fit(X, y)
# The hard-coded plot region below suits this dataset well
visualizeBoundary(0, 1, 0.4, 1.0, X, y, model)
plt.show()
input('Program paused. Press enter to continue.')
'''
%% =============== Part 6: Visualizing Dataset 3 ================
% The following code will load the next dataset into your environment and
% plot the data.
%
'''
print('Part6: Loading and Visualizing Data ...')
# Load from ex6data3:
# You will have X, y in your environment
data = scio.loadmat('D:\课程相关\吴恩达机器学习\Andrew-NG-Meachine-Learning-master\Andrew-NG-Meachine-Learning-master\machine-learning-e'
'x6\machine-learning-ex6\ex6\ex6data3.mat')
X = data['X']
y = data['y'].flatten()
# Plot training data
plotData(X, y)
plt.show()
input('Program paused. Press enter to continue.')
'''
%% ========== Part 7: Training SVM with RBF Kernel (Dataset 3) ==========
% This is a different dataset that you can use to experiment with. Try
% different values of C and sigma here.
%
'''
# Train the SVM
model = SVC(C=C, kernel='rbf', gamma=1 / (2 * sigma ** 2))
model.fit(X, y)
visualizeBoundary(-0.6, 0.3, -0.8, 0.6, X, y, model)
plt.show()
input('Program paused. Press enter to continue.')
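The assignment's dataset3Params step (choosing C and sigma on a cross-validation set) is not implemented above; Part 7 simply reuses the Part 5 values. A minimal sketch, continuing the script and assuming ex6data3.mat also contains Xval and yval as in the original exercise:
# Grid search over the candidate values suggested by the exercise PDF
Xval, yval = data['Xval'], data['yval'].flatten()
values = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]
best = (0, None, None)  # (validation accuracy, C, sigma)
for C_try in values:
    for sigma_try in values:
        m = SVC(C=C_try, kernel='rbf', gamma=1 / (2 * sigma_try ** 2))
        m.fit(X, y)
        acc = np.mean(m.predict(Xval) == yval)
        if acc > best[0]:
            best = (acc, C_try, sigma_try)
print('Best C = {}, sigma = {} (validation accuracy {:.3f})'.format(best[1], best[2], best[0]))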
gaussianKernel.py
'''
%RBFKERNEL returns a radial basis function kernel between x1 and x2
% sim = gaussianKernel(x1, x2) returns a gaussian kernel between x1 and x2
% and returns the value in sim
% ====================== YOUR CODE HERE ======================
% Instructions: Fill in this function to return the similarity between x1
% and x2 computed using a Gaussian kernel with bandwidth
% sigma
%
%
'''
import numpy as np
def gaussianKernel(x1, x2, sigma):
    # Flatten the inputs to 1-D vectors
    x1 = x1.flatten()
    x2 = x2.flatten()
    x = x1 - x2
    # sim = exp(-||x1 - x2||^2 / (2 * sigma^2))
    sim = np.exp(-np.dot(x, x) / (2 * sigma * sigma))
    return sim
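sklearn can also consume the hand-written kernel directly instead of going through the gamma equivalence: SVC accepts a callable that returns the Gram matrix of its two arguments. A sketch, with gaussianGram as a hypothetical vectorized helper:
import numpy as np
from sklearn.svm import SVC

def gaussianGram(X1, X2, sigma=0.1):
    # Pairwise squared Euclidean distances via broadcasting, then the
    # same formula as gaussianKernel above, applied elementwise
    d2 = ((X1[:, None, :] - X2[None, :, :]) ** 2).sum(axis=-1)
    return np.exp(-d2 / (2 * sigma ** 2))

model = SVC(C=1, kernel=lambda A, B: gaussianGram(A, B, sigma=0.1))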
plotData.py
'''
%PLOTDATA Plots the data points X and y into a new figure
% PLOTDATA(x,y) plots the data points with + for the positive examples
% and o for the negative examples. X is assumed to be a Mx2 matrix.
%
% Note: This was slightly modified such that it expects y = 1 or y = 0
'''
import matplotlib.pyplot as plt
def plotData(X, y):
    # Find indices of positive and negative examples
    pos = X[y == 1]
    neg = X[y == 0]
    # Plot examples
    plt.scatter(pos[:, 0], pos[:, 1], marker='+', label='y == 1')
    plt.scatter(neg[:, 0], neg[:, 1], marker='o', label='y == 0')
    plt.legend()
visualizeBoundary.py
'''
%VISUALIZEBOUNDARY plots a non-linear decision boundary learned by the SVM
% VISUALIZEBOUNDARYLINEAR(X, y, model) plots a non-linear decision
% boundary learned by the SVM and overlays the data on it
'''
from plotData import *
import matplotlib.pyplot as plt
import numpy as np
def visualizeBoundary(x_min, x_max, y_min, y_max, X, y, model):
    h = .02
    plotData(X, y)
    # Generate a grid of points over the plot region with spacing h
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the 0/1 class of every grid point
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    # Reshape the predictions back into the grid's shape
    Z = Z.reshape(xx.shape)
    # Draw the decision boundary as the 0.5 contour of the predicted classes
    plt.contour(xx, yy, Z, levels=[0.5], colors='r')
Assignment 2
ex6_spam.py
'''
%% Machine Learning Online Class
% Exercise 6 | Spam Classification with SVMs
%
% Instructions
% ------------
%
% This file contains code that helps you get started on the
% exercise. You will need to complete the following functions:
%
% gaussianKernel.m
% dataset3Params.m
% processEmail.m
% emailFeatures.m
%
% For this exercise, you will not need to change any code in this file,
% or any other files other than those mentioned above.
%
'''
import numpy as np
from processEmail import processEmail
import scipy.io as scio
from sklearn import svm
'''
%% ==================== Part 1: Email Preprocessing ====================
% To use an SVM to classify emails into Spam v.s. Non-Spam, you first need
% to convert each email into a vector of features. In this part, you will
% implement the preprocessing steps for each email. You should
% complete the code in processEmail.m to produce a word indices vector
% for a given email.
'''
print('Part1: Preprocessing sample email (emailSample1.txt)')
# Extract Features
file_contents = open('D:\课程相关\吴恩达机器学习\Andrew-NG-Meachine-Learning-master\Andrew-NG-Meachine-Learning-maste'
'r\machine-learning-ex6\machine-learning-ex6\ex6\emailSample1.txt').read()
word_indices = processEmail(file_contents)
# Print Stats
print('Word Indices: {}'.format(word_indices))
input('Program paused. Press enter to continue.')
'''
%% ==================== Part 2: Feature Extraction ====================
% Now, you will convert each email into a vector of features in R^n.
% You should complete the code in emailFeatures.m to produce a feature
% vector for a given email.
'''
from emailFeatures import *
print('Part2: Extracting features from sample email (emailSample1.txt)')
# Extract Features
file_contents = open('D:\课程相关\吴恩达机器学习\Andrew-NG-Meachine-Learning-master\Andrew-NG-Meachine-Learning-master\mach'
'ine-learning-ex6\machine-learning-ex6\ex6\emailSample1.txt').read()
word_indices = processEmail(file_contents)
features = emailFeatures(word_indices)
# Print Stats
print('Length of feature vector: {}'.format(features.size))
print('Number of non-zero entries: {}'.format(np.sum(features > 0)))
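# Per the exercise PDF, the vector should have length 1899 with about 45 non-zero entries.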
input('Program paused. Press enter to continue.')
'''
%% =========== Part 3: Train Linear SVM for Spam Classification ========
% In this section, you will train a linear classifier to determine if an
% email is Spam or Not-Spam.
'''
# Load the Spam Email dataset
# You will have X, y in your environment
data = scio.loadmat('D:\课程相关\吴恩达机器学习\Andrew-NG-Meachine-Learning-master\Andrew-NG-Meachine-Learning-master\mach'
'ine-learning-ex6\machine-learning-ex6\ex6\spamTrain.mat')
X = data['X']
y = data['y'].flatten()
print('Part3: Training Linear SVM (Spam Classification)')
print('(this may take 1 to 2 minutes) ...')
C = 0.1
model = svm.SVC(C=C, kernel='linear')
model.fit(X, y)
p = model.predict(X)
print('Training Accuracy: {}'.format(np.mean(p == y) * 100))
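# The exercise PDF reports a training accuracy of about 99.8% with these settings.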
'''
%% =================== Part 4: Test Spam Classification ================
% After training the classifier, we can evaluate it on a test set. We have
% included a test set in spamTest.mat
'''
# Load the test dataset
# You will have Xtest, ytest in your environment
data = scio.loadmat('D:\课程相关\吴恩达机器学习\Andrew-NG-Meachine-Learning-master\Andrew-NG-Meachine-Learning-master\mac'
'hine-learning-ex6\machine-learning-ex6\ex6\spamTest.mat')
Xtest = data['Xtest']
ytest = data['ytest'].flatten()
print('Part4: Evaluating the trained Linear SVM on a test set ...')
p = model.predict(Xtest)
print('Test Accuracy: {}'.format(np.mean(p == ytest) * 100))
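# The exercise PDF reports a test accuracy of about 98.5%.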
'''
%% ================= Part 5: Top Predictors of Spam ====================
% Since the model we are training is a linear SVM, we can inspect the
% weights learned by the model to understand better how it is determining
% whether an email is spam or not. The following code finds the words with
% the highest weights in the classifier. Informally, the classifier
% 'thinks' that these words are the most likely indicators of spam.
%
'''
from processEmail import *
# Sort the weights and obtain the vocabulary list
# coef_ column j corresponds to vocab index j + 1 (vocab.txt is 1-based)
indices = np.argsort(model.coef_).flatten()[::-1]
vocabList = getVocabList()
print('Part5: Top predictors of spam: ')
for i in range(15):
    print('{} ({:0.6f})'.format(vocabList[indices[i] + 1], model.coef_.flatten()[indices[i]]))
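# The exercise PDF lists words such as 'our', 'click', 'remov', and 'guarante' among the top predictors.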
input('Program paused. Press enter to continue.')
'''
%% =================== Part 6: Try Your Own Emails =====================
% Now that you've trained the spam classifier, you can use it on your own
% emails! In the starter code, we have included spamSample1.txt,
% spamSample2.txt, emailSample1.txt and emailSample2.txt as examples.
% The following code reads in one of these emails and then uses your
% learned SVM classifier to determine whether the email is Spam or
% Not Spam
'''
# Set the file to be read in (change this to spamSample2.txt,
# emailSample1.txt or emailSample2.txt to see different predictions on
# different emails types). Try your own emails as well!
filename = 'D:\课程相关\吴恩达机器学习\Andrew-NG-Meachine-Learning-master\Andrew-NG-Meachine-Learning-master\machine-lear' \
'ning-ex6\machine-learning-ex6\ex6\spamSample1.txt'
# Read and predict
file_contents = open(filename).read()
word_indices = processEmail(file_contents)
x = emailFeatures(word_indices)
p = model.predict(x.reshape(1, -1))  # sklearn expects a 2-D array of samples
print('Processed {}\nSpam Classification: {}'.format(filename, p))
print('(1 indicates spam, 0 indicates not spam)')
emailFeatures.py
import numpy as np
def emailFeatures(word_indices):
    #EMAILFEATURES takes in a word_indices vector and produces a feature vector
    #from the word indices
    # x = EMAILFEATURES(word_indices) takes in a word_indices vector and
    # produces a feature vector from the word indices.
    # Total number of words in the dictionary
    n = 1899
    # You need to return the following variables correctly.
    x = np.zeros(n)
    for i in word_indices:
        # vocab.txt indices are 1-based, numpy arrays are 0-based
        x[i - 1] = 1
    return x
'''
% ====================== YOUR CODE HERE ======================
% Instructions: Fill in this function to return a feature vector for the
% given email (word_indices). To help make it easier to
% process the emails, we have already pre-processed each
% email and converted each word in the email into an index in
% a fixed dictionary (of 1899 words). The variable
% word_indices contains the list of indices of the words
% which occur in one email.
%
% Concretely, if an email has the text:
%
% The quick brown fox jumped over the lazy dog.
%
% Then, the word_indices vector for this text might look
% like:
%
% 60 100 33 44 10 53 60 58 5
%
% where, we have mapped each word onto a number, for example:
%
% the -- 60
% quick -- 100
% ...
%
% (note: the above numbers are just an example and are not the
% actual mappings).
%
% Your task is take one such word_indices vector and construct
% a binary feature vector that indicates whether a particular
% word occurs in the email. That is, x(i) = 1 when word i
% is present in the email. Concretely, if the word 'the' (say,
% index 60) appears in the email, then x(60) = 1. The feature
% vector should look like:
%
% x = [ 0 0 0 0 1 0 0 0 ... 0 0 0 0 1 ... 0 0 0 1 0 ..];
'''
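A quick sanity check of the 1-based-to-0-based index mapping (a sketch; the indices here are made up):
import numpy as np
from emailFeatures import emailFeatures

x = emailFeatures(np.array([60, 100, 33]))
assert x.size == 1899
assert x[59] == x[99] == x[32] == 1  # vocab index i marks feature x[i - 1]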
processEmail.py
import re
import nltk
import numpy as np
def getVocabList():
    # Load the contents of vocab.txt into a dict of {index: word}
    vocab_dict = {}
    with open('D:\\课程相关\\吴恩达机器学习\\Andrew-NG-Meachine-Learning-master\\Andrew-NG-Meachine-Learning-master\\machine-le'
              'arning-ex6\\machine-learning-ex6\\ex6\\vocab.txt') as f:
        for line in f:
            (val, key) = line.split()
            vocab_dict[int(val)] = key
    return vocab_dict
'''
%PROCESSEMAIL preprocesses a the body of an email and
%returns a list of word_indices
% word_indices = PROCESSEMAIL(email_contents) preprocesses
% the body of an email and returns a list of indices of the
% words contained in the email.
%
The preprocessing steps:
  - convert everything to lower case
  - strip HTML tags
  - replace every URL with 'httpaddr'
  - replace every email address with 'emailaddr'
  - replace every number with 'number'
  - replace every dollar amount with 'dollar'
  - reduce words to their root (stemmed) form
  - remove non-words and punctuation
  - collapse all whitespace (tabs, newlines, spaces) into a single space
'''
def processEmail(email_contents):
    # Build the dict of all vocabulary words to check tokens against
    vocabList = getVocabList()
    # Init return value
    word_list = []
    # ========================== Preprocess Email ===========================
    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers
    # hdrstart = strfind(email_contents, ([char(10) char(10)]));
    # email_contents = email_contents(hdrstart(1):end);
    # Lower case
    email_contents = email_contents.lower()
    # Strip all HTML
    # Looks for any expression that starts with < and ends with > and
    # does not have any < or > in the tag, and replaces it with a space
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)
    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.sub(r'[0-9]+', 'number', email_contents)
    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email_contents)
    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)
    # Handle $ sign
    email_contents = re.sub(r'[$]+', 'dollar', email_contents)
    # ========================== Tokenize Email ===========================
    # Output the email to screen as well
    print('==== Processed Email ====')
    stemmer = nltk.stem.porter.PorterStemmer()
    # Split on the delimiter set of the original MATLAB strtok call; '-' is kept
    # at the end of the character class so it is a literal, not a range
    tokens = re.split(r"[@$/#.:&*+=\[\]?!(){},'\">_<;% -]", email_contents)
    for token in tokens:
        # Remove any remaining non-alphanumeric characters
        token = re.sub(r'[^a-zA-Z0-9]', '', token)
        # Reduce the token to its root form with the Porter stemmer
        token = stemmer.stem(token)
        if len(token) < 1:
            continue
        if token in vocabList.values():
            # Record the (1-based) vocab index of the recognized token
            word_list.append(list(vocabList.keys())[list(vocabList.values()).index(token)])
    word_indices = np.array(word_list, dtype=np.int64)
    return word_indices
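A quick, hypothetical smoke test of the pipeline (the sample sentence is made up; vocab.txt must exist at the path above):
if __name__ == '__main__':
    # The URL becomes 'httpaddr', the digits 'number', and '$' 'dollar';
    # after stemming, only tokens found in vocab.txt are mapped to indices.
    print(processEmail('Buy now for only $9.99 at http://deals.example.com!'))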