支持向量机

支持向量机

支持向量机(Support Vector Machine, SVM)的理论知识在上篇转载的博客中讲解的很详细


SVM一般流程

  1. 收集数据:任意方法
  2. 准备数据:需要数值型数据
  3. 分析数据:有助于可视化分割超平面
  4. 训练算法:SVM的大部分时间都源自训练,该过程主要实现两个参数的调优
  5. 测试算法:测试样本带入分隔超平面,通过符号判断类别
  6. 使用算法

在SVM算法中,最关键的部分在于拉格朗日乘系数的解,该系数未知量的大小和样本成正比,传统的梯度下降优化解法计算开销太大,应用最广的是SMO算法。


SMO算法伪代码

创建一个alpha向量初始化为0
当迭代次数小于最大迭代次数(外循环)
    对数据集中的每个数据向量(内循环)
    如果该数据向量可以被优化
        随机选择另外一个数据向量
        同时优化这两个向量
        如果两个向量都不能被优化,退出内循环

示例代码

下段代码为通用的带核函数和软间隔的代码

# 核矩阵计算
def kernelTrans(X, A, kTup):
    m,n  = shape(X)
    K = mat(zeros((m, 1)))
    if kTup[0] == 'lin':
        K = X*A.T
    elif kTup[0] == 'rbf':
        for j in range(m):
            deltaRow = X[j,:] - A
            K[j] = deltaRow*deltaRow.T
        K = exp(K / (-1*kTup[1]**2))
    else: raise NameError('Houston We Have a Problem -- That Kernel in not recognized')
    return K

# 应对于核变换的数据结构
class optStructK:
    def __init__(self, dataMatIn, classLabels, C, toler, kTup):
        self.X = dataMatIn
        self.labelMat = classLabels
        self.C = C
        self.tol = toler
        self.m = shape(dataMatIn)[0]
        self.alphas = mat(zeros((self.m, 1)))
        self.b = 0
        self.eCache = mat(zeros((self.m, 2)))
        self.K = mat(zeros((self.m, self.m)))
        for i in range(self.m):
            self.K[:, i] = kernelTrans(self.X, self.X[i,:], kTup)

# Platt SMO内循环代码-核变换
def innerLK(i, oS):
    Ei = calcEkK(oS, i)
    if (oS.labelMat[i]*Ei < -oS.tol and oS.alphas[i] < oS.C) or (oS.labelMat[i]*Ei > oS.tol and oS.alphas[i] > 0):
        j, Ej = selectJK(i, oS, Ei)
        alphaIold = oS.alphas[i].copy()
        alphaJold = oS.alphas[j].copy()
        if oS.labelMat[i] != oS.labelMat[j]:
            L = max(0, oS.alphas[j] - oS.alphas[i])
            H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
        else:
            L = max(0, oS.alphas[j] + oS.alphas[i] -oS.C)
            H = min(oS.C, oS.alphas[j] + oS.alphas[i])
        if L == H:
            print('L==H')
            return 0
        eta = 2.0 * oS.K[i,j] - oS.K[i,i] - oS.K[j, j]
        if eta >= 0:
            print('eta >= 0')
            return 0
        oS.alphas[j] -= oS.labelMat[j]*(Ei - Ej)/eta
        oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
        updateEkK(oS, j)
        if abs(oS.alphas[j] - alphaJold) < 0.00001:
            print('j not moving enough')
            return 0
        oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j])
        updateEkK(oS, i)
        b1 = oS.b - Ei - oS.labelMat[i]*(oS.alphas[i] - alphaIold)*oS.K[i,i] - \
             oS.labelMat[j]*(oS.alphas[j] - alphaJold)*oS.K[i, j]
        b2 = oS.b - Ej - oS.labelMat[i]*(oS.alphas[i] - alphaIold)*oS.K[i,j] - \
             oS.labelMat[j]*(oS.alphas[j] - alphaJold)*oS.K[j, j]
        if 0 < oS.alphas[i] and oS.C > oS.alphas[i]:
            oS.b = b1
        elif 0 < oS.alphas[j] and oS.C > oS.alphas[j]:
            oS.b = b2
        else:
            oS.b = (b1 + b2)/2.0
        return 1
    else:
        return 0

# alpha_j选择
def selectJK(i, oS, Ei):
    maxK = -1
    maxDeltaE = 0
    Ej = 0
    oS.eCache[i] = [1, Ei]
    validEcacheList = nonzero(oS.eCache[:, 0].A)[0]
    if len(validEcacheList) > 1:
        for k in validEcacheList:
            if k == i: continue
            Ek = calcEkK(oS, k)
            deltaE = abs(Ei - Ek)
            if deltaE > maxDeltaE:
                maxK = k
                maxDeltaE = deltaE
                Ej = Ek
        return maxK, Ej
    else:
        j = selectJrand(i, oS.m)
        Ej = calcEkK(oS, j)
    return j, Ej

def updateEkK(oS, k):
    Ek = calcEkK(oS, k)
    oS.eCache[k] = [1, Ek]
# 核变换计算差值
def calcEkK(oS, k):
    fXk = float(multiply(oS.alphas, oS.labelMat).T*oS.K[:,k]+ oS.b)
    Ek = fXk - float(oS.labelMat[k])
    return Ek

# Platt SMO外循环代码
def smoPK(dataMatIn, classLabels, C, toler, maxIter, kTup = ('lin', 0)):
    oS = optStructK(mat(dataMatIn), mat(classLabels).transpose(), C, toler, kTup)
    iter = 0
    entireSet = True; alphaPairsChanged = 0
    while iter < maxIter and (alphaPairsChanged > 0 or entireSet):
        alphaPairsChanged = 0
        if entireSet:
            for i in range(oS.m):
                alphaPairsChanged += innerLK(i, oS)
            print('fullSet, iter: %d i: %d, pairs changed %d' %(iter, i, alphaPairsChanged))
            iter += 1
        else:
            nonBoundIs = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
            for i in nonBoundIs:
                alphaPairsChanged += innerLK(i, oS)
                print('non-bound, iterL %d i:%d, pair changed %d' %(iter, i, alphaPairsChanged))
            iter += 1
        if entireSet:
            entireSet = False
        elif alphaPairsChanged == 0:
            entireSet = True
        print('iteration number: %d' %iter)
    return oS.b, oS.alphas

def testRbf(k1 = 1.3):
    dataArr, labelArr = loadDataSet('testSetRBF.txt')
    b, alphas = smoPK(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', k1))
    dataMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    svInd = nonzero(alphas.A>0)[0]
    sVs = dataMat[svInd]
    labelSV = labelMat[svInd]
    print('there are %d Support Vectors' % shape(sVs)[0])
    m,n = shape(dataMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs, dataMat[i,:], ('rbf', k1))
        predict = kernelEval.T*multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print('the training error rate is: %f' %(float(errorCount/m)))
    dataArr, labelArr = loadDataSet('testSetRBF2.txt')
    errorCount = 0
    datMat = mat(dataArr); labelMat = mat(labelArr).transpose()
    m,n = shape(datMat)
    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i, :], ('rbf', k1))
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print('the test error rate is: %f' % (float(errorCount)/m))

# 图像转向量
def img2vector(filename):
    returnVect = zeros((1, 1024))
    fr = open(filename)
    for i in range(32):
        lineStr = fr.readline()
        for j in range(32):
            returnVect[0, 32*i + j] = int(lineStr[j])
    return returnVect

# 加载图像,图像以文本形式给出
def loadImages(dirName):
    from os import listdir
    hwLabels = []
    trainingFileList = listdir(dirName)
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        if classNumStr == 9:
            hwLabels.append(-1)
        else:
            hwLabels.append(1)
        trainingMat[i, :] = img2vector('%s/%s' %(dirName, fileNameStr))
    return trainingMat, hwLabels

# 数据测试, 输出训练错误率和测试错误率
def testDigits(kTup = ('rbf', 10)):
    dataArr, labelArr = loadImages('trainingDigits')
    b, alphas = smoPK(dataArr, labelArr, 200, 0.0001, 10000, kTup)
    dataMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    svInd = nonzero(alphas.A > 0)[0]
    sVs = dataMat[svInd]
    labelSV = labelMat[svInd];
    print('there are % d Surpport Vectors' % shape(sVs)[0])
    m,n = shape(dataMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs, dataMat[i,:], kTup)
        predict = kernelEval.T*multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print ('the training error rae is: %f' %(float(errorCount)/m))
    dataArr, labelArr = loadImages('testDigits')
    dataMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    m, n = shape(dataMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs, dataMat[i, :], kTup)
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print('the test error rae is: %f' % (float(errorCount) / m))

>>>j not moving enough
fullSet, iter: 5 i: 401, pairs changed 0
iteration number: 6
there are  49 Surpport Vectors
the training error rae is: 0.000000
the test error rae is: 0.016129

算法特点

优点:泛化错误率低,计算开销不大,结果易计算
缺点:对参数调节和核函数的选择敏感,原始分类器不加修改仅适用于二分类情况
使用数据类型:数值型和标称型

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值