Support Vector Machine
The theory behind the Support Vector Machine (SVM) is explained in detail in the blog post reposted in the previous article.
General workflow of an SVM
- Collect data: any method
- Prepare data: numeric values are required
- Analyze data: helpful for visualizing the separating hyperplane
- Train the algorithm: most of the time is spent on training, which mainly involves tuning two parameters
- Test the algorithm: plug the test samples into the separating hyperplane and decide the class from the sign of the result
- Use the algorithm
The crucial part of the SVM algorithm is solving for the Lagrange multipliers. The number of these unknowns grows with the number of samples (one multiplier per sample), so a traditional gradient-descent style solver is computationally too expensive; the most widely used solver is the SMO (Sequential Minimal Optimization) algorithm.
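For context, the multipliers α_i are the solution of the standard soft-margin dual problem, stated here in its kernelized form:

$$
\max_{\alpha}\ \sum_{i=1}^{m}\alpha_i \;-\; \frac{1}{2}\sum_{i=1}^{m}\sum_{j=1}^{m}\alpha_i\alpha_j\,y_i y_j\,K(x_i,x_j)
\qquad \text{s.t.}\quad 0 \le \alpha_i \le C,\;\; \sum_{i=1}^{m}\alpha_i y_i = 0
$$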
SMO pseudocode
Create an alpha vector and initialize it to all zeros
While the number of iterations is less than the maximum number of iterations (outer loop):
    For every data vector in the dataset (inner loop):
        If this data vector can be optimized:
            Randomly select another data vector
            Optimize the two vectors together
            If neither vector can be optimized, exit the inner loop
Example code
The code below is a general implementation with kernel functions and a soft margin.
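Note: the listing relies on from numpy import * throughout and calls three small helpers (loadDataSet, selectJrand, clipAlpha) that are not shown in the original post; a minimal sketch of them, assuming the usual Machine Learning in Action versions and a tab-separated data file with two feature columns plus one label column:

from numpy import *

# Read a tab-separated file: two feature columns and one label column (assumed format)
def loadDataSet(fileName):
    dataMat = []; labelMat = []
    for line in open(fileName):
        lineArr = line.strip().split('\t')
        dataMat.append([float(lineArr[0]), float(lineArr[1])])
        labelMat.append(float(lineArr[2]))
    return dataMat, labelMat

# Pick a random index j in [0, m) that differs from i
def selectJrand(i, m):
    j = i
    while j == i:
        j = int(random.uniform(0, m))
    return j

# Clip alpha_j into the box [L, H]
def clipAlpha(aj, H, L):
    if aj > H:
        aj = H
    if L > aj:
        aj = L
    return aj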
# Compute one column of the kernel matrix: K(x_j, A) for every row x_j of X
def kernelTrans(X, A, kTup):
    m, n = shape(X)
    K = mat(zeros((m, 1)))
    if kTup[0] == 'lin':            # linear kernel: plain inner product
        K = X * A.T
    elif kTup[0] == 'rbf':          # radial basis function kernel
        for j in range(m):
            deltaRow = X[j, :] - A
            K[j] = deltaRow * deltaRow.T
        K = exp(K / (-1 * kTup[1]**2))   # element-wise exp(-||x_j - A||^2 / sigma^2)
    else:
        raise NameError('Houston We Have a Problem -- That Kernel is not recognized')
    return K
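A quick sanity check of kernelTrans on a made-up 3x2 matrix (the values are purely illustrative):

X = mat([[1.0, 2.0], [2.0, 3.0], [3.0, 3.0]])
print(kernelTrans(X, X[0, :], ('lin', 0)))    # column of inner products with row 0
print(kernelTrans(X, X[0, :], ('rbf', 1.3)))  # RBF similarities; the entry for row 0 itself is 1.0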
# Data structure that caches everything the kernelized SMO needs
class optStructK:
    def __init__(self, dataMatIn, classLabels, C, toler, kTup):
        self.X = dataMatIn
        self.labelMat = classLabels
        self.C = C                              # soft-margin penalty parameter
        self.tol = toler                        # tolerance for the KKT check
        self.m = shape(dataMatIn)[0]
        self.alphas = mat(zeros((self.m, 1)))   # Lagrange multipliers
        self.b = 0
        self.eCache = mat(zeros((self.m, 2)))   # error cache: [valid flag, E value]
        self.K = mat(zeros((self.m, self.m)))   # precomputed kernel matrix
        for i in range(self.m):
            self.K[:, i] = kernelTrans(self.X, self.X[i, :], kTup)
# Inner loop of Platt SMO with kernel transform: try to optimize the pair (alpha_i, alpha_j)
def innerLK(i, oS):
    Ei = calcEkK(oS, i)
    # proceed only if alpha_i violates the KKT conditions by more than the tolerance
    if (oS.labelMat[i]*Ei < -oS.tol and oS.alphas[i] < oS.C) or \
       (oS.labelMat[i]*Ei > oS.tol and oS.alphas[i] > 0):
        j, Ej = selectJK(i, oS, Ei)
        alphaIold = oS.alphas[i].copy()
        alphaJold = oS.alphas[j].copy()
        # bounds L, H for alpha_j from the box constraint and the sum constraint
        if oS.labelMat[i] != oS.labelMat[j]:
            L = max(0, oS.alphas[j] - oS.alphas[i])
            H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
        else:
            L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
            H = min(oS.C, oS.alphas[j] + oS.alphas[i])
        if L == H:
            print('L==H')
            return 0
        eta = 2.0 * oS.K[i, j] - oS.K[i, i] - oS.K[j, j]
        if eta >= 0:
            print('eta >= 0')
            return 0
        oS.alphas[j] -= oS.labelMat[j]*(Ei - Ej)/eta
        oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
        updateEkK(oS, j)
        if abs(oS.alphas[j] - alphaJold) < 0.00001:
            print('j not moving enough')
            return 0
        # update alpha_i by the same amount in the opposite direction
        oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j])
        updateEkK(oS, i)
        b1 = oS.b - Ei - oS.labelMat[i]*(oS.alphas[i] - alphaIold)*oS.K[i, i] - \
             oS.labelMat[j]*(oS.alphas[j] - alphaJold)*oS.K[i, j]
        b2 = oS.b - Ej - oS.labelMat[i]*(oS.alphas[i] - alphaIold)*oS.K[i, j] - \
             oS.labelMat[j]*(oS.alphas[j] - alphaJold)*oS.K[j, j]
        if 0 < oS.alphas[i] and oS.C > oS.alphas[i]:
            oS.b = b1
        elif 0 < oS.alphas[j] and oS.C > oS.alphas[j]:
            oS.b = b2
        else:
            oS.b = (b1 + b2)/2.0
        return 1
    else:
        return 0
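For reference, the pair update applied in innerLK is the standard SMO step; this just restates what the code does:

$$
\eta = 2K_{ij} - K_{ii} - K_{jj},\qquad
\alpha_j \leftarrow \mathrm{clip}\!\Big(\alpha_j - \frac{y_j(E_i - E_j)}{\eta},\,L,\,H\Big),\qquad
\alpha_i \leftarrow \alpha_i + y_i y_j\,(\alpha_j^{old} - \alpha_j)
$$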
# Heuristic choice of alpha_j: pick the j with the largest |Ei - Ej|
def selectJK(i, oS, Ei):
    maxK = -1
    maxDeltaE = 0
    Ej = 0
    oS.eCache[i] = [1, Ei]
    validEcacheList = nonzero(oS.eCache[:, 0].A)[0]
    if len(validEcacheList) > 1:
        for k in validEcacheList:
            if k == i:
                continue
            Ek = calcEkK(oS, k)
            deltaE = abs(Ei - Ek)
            if deltaE > maxDeltaE:
                maxK = k
                maxDeltaE = deltaE
                Ej = Ek
        return maxK, Ej
    else:
        # no valid cached errors yet: fall back to a random j
        j = selectJrand(i, oS.m)
        Ej = calcEkK(oS, j)
        return j, Ej
# Recompute the error for sample k and mark it valid in the cache
def updateEkK(oS, k):
    Ek = calcEkK(oS, k)
    oS.eCache[k] = [1, Ek]
# Error of the current kernelized model on sample k
def calcEkK(oS, k):
    fXk = float(multiply(oS.alphas, oS.labelMat).T * oS.K[:, k] + oS.b)
    Ek = fXk - float(oS.labelMat[k])
    return Ek
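In formula form, the cached error is the difference between the current kernelized decision value and the true label:

$$
E_k = \Big(\sum_{i=1}^{m}\alpha_i y_i K(x_i, x_k) + b\Big) - y_k
$$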
# Outer loop of Platt SMO: alternate between full passes and non-bound passes
def smoPK(dataMatIn, classLabels, C, toler, maxIter, kTup=('lin', 0)):
    oS = optStructK(mat(dataMatIn), mat(classLabels).transpose(), C, toler, kTup)
    iter = 0
    entireSet = True; alphaPairsChanged = 0
    while iter < maxIter and (alphaPairsChanged > 0 or entireSet):
        alphaPairsChanged = 0
        if entireSet:
            # full pass over every sample
            for i in range(oS.m):
                alphaPairsChanged += innerLK(i, oS)
                print('fullSet, iter: %d i: %d, pairs changed %d' % (iter, i, alphaPairsChanged))
            iter += 1
        else:
            # pass over the non-bound alphas only (0 < alpha < C)
            nonBoundIs = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
            for i in nonBoundIs:
                alphaPairsChanged += innerLK(i, oS)
                print('non-bound, iter: %d i: %d, pairs changed %d' % (iter, i, alphaPairsChanged))
            iter += 1
        if entireSet:
            entireSet = False
        elif alphaPairsChanged == 0:
            entireSet = True
        print('iteration number: %d' % iter)
    return oS.b, oS.alphas
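A quick usage sketch of smoPK with the linear kernel (the file name testSet.txt and the hyper-parameters below are placeholders, not from the original):

# Hypothetical example: train with a linear kernel and classify the first training point
dataArr, labelArr = loadDataSet('testSet.txt')          # placeholder file name
b, alphas = smoPK(dataArr, labelArr, C=0.6, toler=0.001, maxIter=40)
dataMat = mat(dataArr); labelMat = mat(labelArr).transpose()
# for the linear kernel the weight vector is explicit: w = sum_i alpha_i * y_i * x_i
w = dataMat.T * multiply(alphas, labelMat)
print(sign(dataMat[0] * w + b))                         # predicted label of the first sample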
def testRbf(k1=1.3):
    dataArr, labelArr = loadDataSet('testSetRBF.txt')
    b, alphas = smoPK(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', k1))
    dataMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    svInd = nonzero(alphas.A > 0)[0]
    sVs = dataMat[svInd]              # keep only the support vectors
    labelSV = labelMat[svInd]
    print('there are %d Support Vectors' % shape(sVs)[0])
    m, n = shape(dataMat)
    errorCount = 0
    for i in range(m):
        # prediction only needs the kernel between x_i and the support vectors
        kernelEval = kernelTrans(sVs, dataMat[i, :], ('rbf', k1))
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print('the training error rate is: %f' % (float(errorCount)/m))
    dataArr, labelArr = loadDataSet('testSetRBF2.txt')
    errorCount = 0
    datMat = mat(dataArr); labelMat = mat(labelArr).transpose()
    m, n = shape(datMat)
    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i, :], ('rbf', k1))
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print('the test error rate is: %f' % (float(errorCount)/m))
# Convert a 32x32 text image into a 1x1024 vector
def img2vector(filename):
    returnVect = zeros((1, 1024))
    fr = open(filename)
    for i in range(32):
        lineStr = fr.readline()
        for j in range(32):
            returnVect[0, 32*i + j] = int(lineStr[j])
    return returnVect
# Load handwritten-digit images stored as text files; label digit 9 as -1 and everything else as +1
def loadImages(dirName):
    from os import listdir
    hwLabels = []
    trainingFileList = listdir(dirName)
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        if classNumStr == 9:
            hwLabels.append(-1)
        else:
            hwLabels.append(1)
        trainingMat[i, :] = img2vector('%s/%s' % (dirName, fileNameStr))
    return trainingMat, hwLabels
# Train and test on the digit data; print the training and test error rates
def testDigits(kTup=('rbf', 10)):
    dataArr, labelArr = loadImages('trainingDigits')
    b, alphas = smoPK(dataArr, labelArr, 200, 0.0001, 10000, kTup)
    dataMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    svInd = nonzero(alphas.A > 0)[0]
    sVs = dataMat[svInd]
    labelSV = labelMat[svInd]
    print('there are %d Support Vectors' % shape(sVs)[0])
    m, n = shape(dataMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs, dataMat[i, :], kTup)
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print('the training error rate is: %f' % (float(errorCount)/m))
    dataArr, labelArr = loadImages('testDigits')
    dataMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    m, n = shape(dataMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs, dataMat[i, :], kTup)
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print('the test error rate is: %f' % (float(errorCount)/m))
>>>j not moving enough
fullSet, iter: 5 i: 401, pairs changed 0
iteration number: 6
there are 49 Support Vectors
the training error rate is: 0.000000
the test error rate is: 0.016129
Algorithm characteristics
Pros: low generalization error, modest computational cost, and easy-to-interpret results
Cons: sensitive to parameter tuning and to the choice of kernel; without modification the basic classifier only handles binary classification (a one-vs-rest sketch follows below)
Data types: numeric and nominal values
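Regarding that last drawback: one common way to extend the binary classifier to K classes is a one-vs-rest wrapper around smoPK. The sketch below is purely illustrative; the function names and hyper-parameters are assumptions, not part of the original.

# Sketch: one-vs-rest extension of the binary kernel SVM above (illustrative only)
def oneVsRestTrain(dataArr, labels, classes, kTup=('rbf', 10)):
    models = {}
    for c in classes:
        # relabel the data: +1 for class c, -1 for every other class, then train a binary SVM
        binLabels = [1 if y == c else -1 for y in labels]
        b, alphas = smoPK(dataArr, binLabels, 200, 0.0001, 10000, kTup)
        models[c] = (b, alphas, mat(binLabels).transpose())
    return models

def oneVsRestPredict(models, dataArr, x, kTup=('rbf', 10)):
    dataMat = mat(dataArr)
    best, bestScore = None, -inf
    for c, (b, alphas, labelMat) in models.items():
        svInd = nonzero(alphas.A > 0)[0]
        kernelEval = kernelTrans(dataMat[svInd], x, kTup)
        score = float(kernelEval.T * multiply(labelMat[svInd], alphas[svInd]) + b)
        if score > bestScore:
            best, bestScore = c, score
    return best          # class whose binary SVM gives the largest decision value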