Support Vector Machines
The cost function of a support vector machine differs from that of logistic regression.
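Concretely, in the course's formulation the two logistic loss terms are replaced by hinge-like surrogates $\text{cost}_1$ and $\text{cost}_0$, and the regularization trade-off is controlled by $C$ instead of $\lambda$:

$$\min_\theta\; C\sum_{i=1}^{m}\left[y^{(i)}\,\text{cost}_1(\theta^Tx^{(i)})+(1-y^{(i)})\,\text{cost}_0(\theta^Tx^{(i)})\right]+\frac{1}{2}\sum_{j=1}^{n}\theta_j^2$$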
An SVM can be adapted to complex non-linear classification by using **kernels**. For example, to train a classifier on samples like the ones below, one option is a high-order polynomial decision boundary:
$$\theta_0+\theta_1x_1+\theta_2x_2+\theta_3x_1x_2+\theta_4x_1^2+\theta_5x_2^2+\dots\geq 0$$
The expression above can also be written as
$$\theta_0+\theta_1f_1+\theta_2f_2+\dots\geq 0\qquad(f_1=x_1,\;f_2=x_2,\;\dots)$$
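With a kernel, the features $f_i$ are no longer polynomial terms but similarities between $x$ and landmarks $l^{(i)}$ (in practice, the training examples themselves). For the Gaussian kernel implemented below:

$$f_i=\text{similarity}(x,l^{(i)})=\exp\left(-\frac{\lVert x-l^{(i)}\rVert^2}{2\sigma^2}\right)$$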
Programming Assignment
This week's programming assignment uses some natural language processing packages I had not worked with before, so I consulted quite a few blog posts.
ex6.py
'''
%% Machine Learning Online Class
% Exercise 6 | Support Vector Machines
%
% Instructions
% ------------
%
% This file contains code that helps you get started on the
% exercise. You will need to complete the following functions:
%
% gaussianKernel.m
% dataset3Params.m
% processEmail.m
% emailFeatures.m
%
% For this exercise, you will not need to change any code in this file,
% or any other files other than those mentioned above.
%
'''
'''
%% =============== Part 1: Loading and Visualizing Data ================
% We start the exercise by first loading and visualizing the dataset.
% The following code will load the dataset into your environment and plot
% the data.
%
'''
import numpy as np
import matplotlib.pyplot as plt
from plotData import *
import scipy.io as scio
print('Part1: Loading and Visualizing Data ...')
# Load from ex6data1:
# You will have X, y in your environment
data = scio.loadmat('D:\课程相关\吴恩达机器学习\Andrew-NG-Meachine-Learning-master\Andrew-NG-Meachine-Learning-master\machine'
'-learning-ex6\machine-learning-ex6\ex6\ex6data1.mat')
X = data['X']
y = data['y'].flatten()
# Plot training data
plotData(X, y)
plt.show()
input('Program paused. Press enter to continue.')
'''
%% ==================== Part 2: Training Linear SVM ====================
% The following code will train a linear SVM on the dataset and plot the
% decision boundary learned.
%
'''
from sklearn.svm import SVC
from visualizeBoundaryLinear import *
print('Part2: Training Linear SVM ...')
# You should try to change the C value below and see how the decision
# boundary varies (e.g., try C = 1000)
#https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
model = SVC(C=1000, kernel='linear')  # max_iter=200
model.fit(X, y)
visualizeBoundaryLinear(X, y, model)
plt.show()
input('Program paused. Press enter to continue.')
'''
%% =============== Part 3: Implementing Gaussian Kernel ===============
% You will now implement the Gaussian kernel to use
% with the SVM. You should complete the code in gaussianKernel.m
%
'''
from gaussianKernel import *
print('Part3: Evaluating the Gaussian Kernel ...')
x1 = np.array([1, 2, 1])
x2 = np.array([0, 4, -1])
sigma = 2
sim = gaussianKernel(x1, x2, sigma)
print('Gaussian Kernel between x1 = [1; 2; 1], x2 = [0; 4; -1], sigma = {} :'
'\n\t{}\n(for sigma = 2, this value should be about 0.324652)'.format(sigma, sim))
input('Program paused. Press enter to continue.')
'''
%% =============== Part 4: Visualizing Dataset 2 ================
% The following code will load the next dataset into your environment and
% plot the data.
%
'''
print('Part4: Loading and Visualizing Data ...')
# Load from ex6data2:
# You will have X, y in your environment
data = scio.loadmat('D:\课程相关\吴恩达机器学习\Andrew-NG-Meachine-Learning-master\Andrew-NG-Meachine-Learning-master\machine'
'-learning-ex6\machine-learning-ex6\ex6\ex6data2.mat')
X = data['X']
y = data['y'].flatten()
# Plot training data
plotData(X, y)
plt.show()
input('Program paused. Press enter to continue.')
'''
%% ========== Part 5: Training SVM with RBF Kernel (Dataset 2) ==========
% After you have implemented the kernel, we can now use it to train the
% SVM classifier.
%
'''
from visualizeBoundary import *
print('Part5: Training SVM with RBF Kernel (this may take 1 to 2 minutes) ...')
# SVM Parameters
C = 1
sigma = 0.1
# We set the tolerance and max_passes lower here so that the code will run
# faster. However, in practice, you will want to run the training to
# convergence.
# sklearn's RBF kernel is exp(-gamma * ||x - x'||^2), so gamma = 1 / (2 * sigma^2)
model = SVC(C=C, kernel='rbf', gamma=1 / (2 * sigma ** 2))
model.fit(X, y)
# The hard-coded plot region below suits this dataset well
visualizeBoundary(0, 1, 0.4, 1.0, X, y, model)
plt.show()
input('Program paused. Press enter to continue.')
'''
%% =============== Part 6: Visualizing Dataset 3 ================
% The following code will load the next dataset into your environment and
% plot the data.
%
'''
print('Part6: Loading and Visualizing Data ...')
# Load from ex6data3:
# You will have X, y in your environment
data = scio.loadmat('D:\课程相关\吴恩达机器学习\Andrew-NG-Meachine-Learning-master\Andrew-NG-Meachine-Learning-master\machine-learning-e'
'x6\machine-learning-ex6\ex6\ex6data3.mat')
X = data['X']
y = data['y'].flatten()
# Plot training data
plotData(X, y)
plt.show()
input('Program paused. Press enter to continue.')
'''
%% ========== Part 7: Training SVM with RBF Kernel (Dataset 3) ==========
% This is a different dataset that you can use to experiment with. Try
% different values of C and sigma here.
%
'''
# Train the SVM
model = SVC(C=C, kernel='rbf', gamma=1 / (2 * sigma ** 2))
model.fit(X, y)
visualizeBoundary(-0.6, 0.3, -0.8, 0.6, X, y, model)
plt.show()
input('Program paused. Press enter to continue.')
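The assignment's dataset3Params step (choosing C and sigma on a cross-validation set) is not implemented above; Part 7 simply reuses the Part 5 values. A minimal sketch, continuing the script and assuming ex6data3.mat also contains Xval and yval as in the original exercise:
# Grid search over the candidate values suggested by the exercise PDF
Xval, yval = data['Xval'], data['yval'].flatten()
values = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]
best = (0, None, None)  # (validation accuracy, C, sigma)
for C_try in values:
    for sigma_try in values:
        m = SVC(C=C_try, kernel='rbf', gamma=1 / (2 * sigma_try ** 2))
        m.fit(X, y)
        acc = np.mean(m.predict(Xval) == yval)
        if acc > best[0]:
            best = (acc, C_try, sigma_try)
print('Best C = {}, sigma = {} (validation accuracy {:.3f})'.format(best[1], best[2], best[0]))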
gaussianKernel.py
'''
%RBFKERNEL returns a radial basis function kernel between x1 and x2
% sim = gaussianKernel(x1, x2) returns a gaussian kernel between x1 and x2
% and returns the value in sim
% ====================== YOUR CODE HERE ======================
% Instructions: Fill in this function to return the similarity between x1
% and x2 computed using a Gaussian kernel with bandwidth
% sigma
%
%
'''
import numpy as np
def gaussianKernel(x1, x2, sigma):
    # Flatten the inputs to 1-D vectors
    x1 = x1.flatten()
    x2 = x2.flatten()
    x = x1 - x2
    # sim = exp(-||x1 - x2||^2 / (2 * sigma^2))
    sim = np.exp(-np.dot(x, x) / (2 * sigma * sigma))
    return sim
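sklearn can also consume the hand-written kernel directly instead of going through the gamma equivalence: SVC accepts a callable that returns the Gram matrix of its two arguments. A sketch, with gaussianGram as a hypothetical vectorized helper:
import numpy as np
from sklearn.svm import SVC

def gaussianGram(X1, X2, sigma=0.1):
    # Pairwise squared Euclidean distances via broadcasting, then the
    # same formula as gaussianKernel above, applied elementwise
    d2 = ((X1[:, None, :] - X2[None, :, :]) ** 2).sum(axis=-1)
    return np.exp(-d2 / (2 * sigma ** 2))

model = SVC(C=1, kernel=lambda A, B: gaussianGram(A, B, sigma=0.1))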
plotData.py
'''
%PLOTDATA Plots the data points X and y into a new figure
% PLOTDATA(x,y) plots the data points with + for the positive examples
% and o for the negative examples. X is assumed to be a Mx2 matrix.
%
% Note: This was slightly modified such that it expects y = 1 or y = 0
'''
import matplotlib.pyplot as plt
def plotData(X, y):
    # Find indices of positive and negative examples
    pos = X[y == 1]
    neg = X[y == 0]
    # Plot examples
    plt.scatter(pos[:, 0], pos[:, 1], marker='+', label='y == 1')
    plt.scatter(neg[:, 0], neg[:, 1], marker='o', label='y == 0')
    plt.legend()
visualizeBoundary.py
'''
%VISUALIZEBOUNDARY plots a non-linear decision boundary learned by the SVM
% VISUALIZEBOUNDARYLINEAR(X, y, model) plots a non-linear decision
% boundary learned by the SVM and overlays the data on it
'''
from plotData import *
import matplotlib.pyplot as plt
import numpy as np
def visualizeBoundary(x_min, x_max, y_min, y_max, X, y, model):
    h = .02
    plotData(X, y)
    # Generate a grid of points over the plot region with spacing h
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the 0/1 class of every grid point
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    # Reshape the predictions back into the grid's shape
    Z = Z.reshape(xx.shape)
    # Draw the decision boundary as the 0.5 contour of the predicted classes
    plt.contour(xx, yy, Z, levels=[0.5], colors='r')
Assignment 2
ex6_spam.py
'''
%% Machine Learning Online Class
% Exercise 6 | Spam Classification with SVMs
%
% Instructions
% ------------
%
% This file contains code that helps you get started on the
% exercise. You will need to complete the following functions:
%
% gaussianKernel.m
% dataset3Params.m
% processEmail.m
% emailFeatures.m
%
% For this exercise, you will not need to change any code in this file,
% or any other files other than those mentioned above.
%
'''
import numpy as np
from processEmail import processEmail
import scipy.io as scio
from sklearn import svm
'''
%% ==================== Part 1: Email Preprocessing ====================
% To use an SVM to classify emails into Spam v.s. Non-Spam, you first need
% to convert each email into a vector of features. In this part, you will
% implement the preprocessing steps for each email. You should
% complete the code in processEmail.m to produce a word indices vector
% for a given email.
'''
print('Part1: Preprocessing sample email (emailSample1.txt)')
# Extract Features
file_contents = open('D:\课程相关\吴恩达机器学习\Andrew-NG-Meachine-Learning-master\Andrew-NG-Meachine-Learning-maste'
'r\machine-learning-ex6\machine-learning-ex6\ex6\emailSample1.txt').read()
word_indices = processEmail(file_contents)
# Print Stats
print('Word Indices: {}'.format(word_indices))
input('Program paused. Press enter to continue.')
'''
%% ==================== Part 2: Feature Extraction ====================
% Now, you will convert each email into a vector of features in R^n.
% You should complete the code in emailFeatures.m to produce a feature
% vector for a given email.
'''
from emailFeatures import *
print('Part2: Extracting features from sample email (emailSample1.txt)')
# Extract Features
file_contents = open('D:\课程相关\吴恩达机器学习\Andrew-NG-Meachine-Learning-master\Andrew-NG-Meachine-Learning-master\mach'
'ine-learning-ex6\machine-learning-ex6\ex6\emailSample1.txt').read()
word_indices = processEmail(file_contents)
features = emailFeatures(word_indices)
# Print Stats
print('Length of feature vector: {}'.format(features.size))
print('Number of non-zero entries: {}'.format(np.sum(features > 0)))
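# Per the exercise PDF, the vector should have length 1899 with about 45 non-zero entries.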
input('Program paused. Press enter to continue.')
'''
%% =========== Part 3: Train Linear SVM for Spam Classification ========
% In this section, you will train a linear classifier to determine if an
% email is Spam or Not-Spam.
'''
# Load the Spam Email dataset
# You will have X, y in your environment
data = scio.loadmat('D:\课程相关\吴恩达机器学习\Andrew-NG-Meachine-Learning-master\Andrew-NG-Meachine-Learning-master\mach'
'ine-learning-ex6\machine-learning-ex6\ex6\spamTrain.mat')
X = data['X']
y = data['y'].flatten()
print('Part3: Training Linear SVM (Spam Classification)')
print('(this may take 1 to 2 minutes) ...')
C = 0.1
model = svm.SVC(C=C, kernel='linear')
model.fit(X, y)
p = model.predict(X)
print('Training Accuracy: {}'.format(np.mean(p == y) * 100))
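# The exercise PDF reports a training accuracy of about 99.8% with these settings.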
'''
%% =================== Part 4: Test Spam Classification ================
% After training the classifier, we can evaluate it on a test set. We have
% included a test set in spamTest.mat
'''
# Load the test dataset
# You will have Xtest, ytest in your environment
data = scio.loadmat('D:\课程相关\吴恩达机器学习\Andrew-NG-Meachine-Learning-master\Andrew-NG-Meachine-Learning-master\mac'
'hine-learning-ex6\machine-learning-ex6\ex6\spamTest.mat')
Xtest = data['Xtest']
ytest = data['ytest'].flatten()
print('Part4: Evaluating the trained Linear SVM on a test set ...')
p = model.predict(Xtest)
print('Test Accuracy: {}'.format(np.mean(p == ytest) * 100))
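# The exercise PDF reports a test accuracy of about 98.5%.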
'''
%% ================= Part 5: Top Predictors of Spam ====================
% Since the model we are training is a linear SVM, we can inspect the
% weights learned by the model to understand better how it is determining
% whether an email is spam or not. The following code finds the words with
% the highest weights in the classifier. Informally, the classifier
% 'thinks' that these words are the most likely indicators of spam.
%
'''
from processEmail import *
# Sort the weights and obtain the vocabulary list
# coef_ column j corresponds to vocab index j + 1 (vocab.txt is 1-based)
indices = np.argsort(model.coef_).flatten()[::-1]
vocabList = getVocabList()
print('Part5: Top predictors of spam: ')
for i in range(15):
    print('{} ({:0.6f})'.format(vocabList[indices[i] + 1], model.coef_.flatten()[indices[i]]))
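# The exercise PDF lists words such as 'our', 'click', 'remov', and 'guarante' among the top predictors.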
input('Program paused. Press enter to continue.')
'''
%% =================== Part 6: Try Your Own Emails =====================
% Now that you've trained the spam classifier, you can use it on your own
% emails! In the starter code, we have included spamSample1.txt,
% spamSample2.txt, emailSample1.txt and emailSample2.txt as examples.
% The following code reads in one of these emails and then uses your
% learned SVM classifier to determine whether the email is Spam or
% Not Spam
'''
# Set the file to be read in (change this to spamSample2.txt,
# emailSample1.txt or emailSample2.txt to see different predictions on
# different emails types). Try your own emails as well!
filename = 'D:\课程相关\吴恩达机器学习\Andrew-NG-Meachine-Learning-master\Andrew-NG-Meachine-Learning-master\machine-lear' \
'ning-ex6\machine-learning-ex6\ex6\spamSample1.txt'
# Read and predict
file_contents = open(filename).read()
word_indices = processEmail(file_contents)
x = emailFeatures(word_indices)
p = model.predict(x.reshape(1, -1))  # sklearn expects a 2-D array of samples
print('Processed {}\nSpam Classification: {}'.format(filename, p))
print('(1 indicates spam, 0 indicates not spam)')
emailFeatures.py
import numpy as np
def emailFeatures(word_indices):
    #EMAILFEATURES takes in a word_indices vector and produces a feature vector
    #from the word indices
    # x = EMAILFEATURES(word_indices) takes in a word_indices vector and
    # produces a feature vector from the word indices.
    # Total number of words in the dictionary
    n = 1899
    # You need to return the following variables correctly.
    x = np.zeros(n)
    for i in word_indices:
        # vocab.txt indices are 1-based, numpy arrays are 0-based
        x[i - 1] = 1
    return x
'''
% ====================== YOUR CODE HERE ======================
% Instructions: Fill in this function to return a feature vector for the
% given email (word_indices). To help make it easier to
% process the emails, we have already pre-processed each
% email and converted each word in the email into an index in
% a fixed dictionary (of 1899 words). The variable
% word_indices contains the list of indices of the words
% which occur in one email.
%
% Concretely, if an email has the text:
%
% The quick brown fox jumped over the lazy dog.
%
% Then, the word_indices vector for this text might look
% like:
%
% 60 100 33 44 10 53 60 58 5
%
% where, we have mapped each word onto a number, for example:
%
% the -- 60
% quick -- 100
% ...
%
% (note: the above numbers are just an example and are not the
% actual mappings).
%
% Your task is take one such word_indices vector and construct
% a binary feature vector that indicates whether a particular
% word occurs in the email. That is, x(i) = 1 when word i
% is present in the email. Concretely, if the word 'the' (say,
% index 60) appears in the email, then x(60) = 1. The feature
% vector should look like:
%
% x = [ 0 0 0 0 1 0 0 0 ... 0 0 0 0 1 ... 0 0 0 1 0 ..];
'''
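A quick sanity check of the 1-based-to-0-based index mapping (a sketch; the indices here are made up):
import numpy as np
from emailFeatures import emailFeatures

x = emailFeatures(np.array([60, 100, 33]))
assert x.size == 1899
assert x[59] == x[99] == x[32] == 1  # vocab index i marks feature x[i - 1]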
processEmail.py
import re
import nltk
import numpy as np
def getVocabList():
    # Load the contents of vocab.txt into a dict of {index: word}
    vocab_dict = {}
    with open('D:\\课程相关\\吴恩达机器学习\\Andrew-NG-Meachine-Learning-master\\Andrew-NG-Meachine-Learning-master\\machine-le'
              'arning-ex6\\machine-learning-ex6\\ex6\\vocab.txt') as f:
        for line in f:
            (val, key) = line.split()
            vocab_dict[int(val)] = key
    return vocab_dict
'''
%PROCESSEMAIL preprocesses a the body of an email and
%returns a list of word_indices
% word_indices = PROCESSEMAIL(email_contents) preprocesses
% the body of an email and returns a list of indices of the
% words contained in the email.
%
The preprocessing steps:
  - convert everything to lower case
  - strip HTML tags
  - replace every URL with 'httpaddr'
  - replace every email address with 'emailaddr'
  - replace every number with 'number'
  - replace every dollar amount with 'dollar'
  - reduce words to their root (stemmed) form
  - remove non-words and punctuation
  - collapse all whitespace (tabs, newlines, spaces) into a single space
'''
def processEmail(email_contents):
    # Build the dict of all vocabulary words to check tokens against
    vocabList = getVocabList()
    # Init return value
    word_list = []
    # ========================== Preprocess Email ===========================
    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers
    # hdrstart = strfind(email_contents, ([char(10) char(10)]));
    # email_contents = email_contents(hdrstart(1):end);
    # Lower case
    email_contents = email_contents.lower()
    # Strip all HTML
    # Looks for any expression that starts with < and ends with > and
    # does not have any < or > in the tag, and replaces it with a space
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)
    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.sub(r'[0-9]+', 'number', email_contents)
    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email_contents)
    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)
    # Handle $ sign
    email_contents = re.sub(r'[$]+', 'dollar', email_contents)
    # ========================== Tokenize Email ===========================
    # Output the email to screen as well
    print('==== Processed Email ====')
    stemmer = nltk.stem.porter.PorterStemmer()
    # Split on the delimiter set of the original MATLAB strtok call; '-' is kept
    # at the end of the character class so it is a literal, not a range
    tokens = re.split(r"[@$/#.:&*+=\[\]?!(){},'\">_<;% -]", email_contents)
    for token in tokens:
        # Remove any remaining non-alphanumeric characters
        token = re.sub(r'[^a-zA-Z0-9]', '', token)
        # Reduce the token to its root form with the Porter stemmer
        token = stemmer.stem(token)
        if len(token) < 1:
            continue
        if token in vocabList.values():
            # Record the (1-based) vocab index of the recognized token
            word_list.append(list(vocabList.keys())[list(vocabList.values()).index(token)])
    word_indices = np.array(word_list, dtype=np.int64)
    return word_indices
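A quick, hypothetical smoke test of the pipeline (the sample sentence is made up; vocab.txt must exist at the path above):
if __name__ == '__main__':
    # The URL becomes 'httpaddr', the digits 'number', and '$' 'dollar';
    # after stemming, only tokens found in vocab.txt are mapped to indices.
    print(processEmail('Buy now for only $9.99 at http://deals.example.com!'))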