Support Vector Machines
For the theoretical background, see this blog post: https://blog.youkuaiyun.com/c406495762/article/details/78072313
as well as the Stanford University course.
1 Solving a Linear SVM Programmatically
1.1 Visualizing the Dataset
import matplotlib.pyplot as plt
import numpy as np

# Load the data: each line holds two tab-separated feature values and a label
def loadDataSet(fileName):
    dataMat = []
    labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = line.strip().split('\t')
        dataMat.append([float(lineArr[0]), float(lineArr[1])])
        labelMat.append(float(lineArr[2]))
    return dataMat, labelMat

# Visualize the data
def showDataSet(dataMat, labelMat):
    data_plus = []    # positive samples
    data_minus = []   # negative samples
    for i in range(len(dataMat)):
        if labelMat[i] > 0:
            data_plus.append(dataMat[i])
        else:
            data_minus.append(dataMat[i])
    data_plus_np = np.array(data_plus)
    data_minus_np = np.array(data_minus)
    plt.scatter(data_plus_np.T[0], data_plus_np.T[1])    # note the transpose
    plt.scatter(data_minus_np.T[0], data_minus_np.T[1])
    plt.show()

# Plot
path = 'H:/机器学习课程资料/machinelearninginaction/Ch06/testSet.txt'
dataMat, labelMat = loadDataSet(path)
showDataSet(dataMat, labelMat)
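As a quick sanity check after loading (an illustrative snippet, assuming the Ch06 testSet.txt layout of two tab-separated features plus a ±1 label per line):

dataMat, labelMat = loadDataSet(path)
print(len(dataMat), len(labelMat))   # number of samples in each list
print(dataMat[0], labelMat[0])       # first feature pair and its +1/-1 label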
1.2 Simplified SMO Algorithm
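Before the code, recall the standard simplified-SMO update rules that the step comments below follow (the usual textbook formulas, summarized here for reference):

\begin{aligned}
f(x_k) &= \sum_{i=1}^{m} \alpha_i y_i x_i^{T} x_k + b, \qquad E_k = f(x_k) - y_k \\
\eta &= 2\,x_i^{T} x_j - x_i^{T} x_i - x_j^{T} x_j \\
\alpha_j^{\text{new}} &= \mathrm{clip}\!\left(\alpha_j^{\text{old}} - \frac{y_j (E_i - E_j)}{\eta},\; L,\; H\right) \\
\alpha_i^{\text{new}} &= \alpha_i^{\text{old}} + y_i y_j \left(\alpha_j^{\text{old}} - \alpha_j^{\text{new}}\right)
\end{aligned}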
import random

# Helper functions for the SMO algorithm
# Randomly select an index j in [0, m) that differs from i
def selectJrand(i, m):
    j = i
    while j == i:
        j = int(random.uniform(0, m))
    return j

# Clip aj so that it lies within the interval [L, H]
def clipAlpha(aj, H, L):
    if aj > H:
        aj = H
    if L > aj:
        aj = L
    return aj
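A quick illustration of how these helpers behave (values chosen arbitrarily):

random.seed(0)                        # only to make the illustration reproducible
print(selectJrand(2, 5))              # some index in [0, 5) other than 2
print(clipAlpha(1.5, H=1.0, L=0.0))   # 1.0: clipped to the upper bound
print(clipAlpha(-0.2, H=1.0, L=0.0))  # 0.0: clipped to the lower bound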
# Simplified SMO algorithm
def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
    dataMatrix = np.mat(dataMatIn)
    labelMat = np.mat(classLabels).T
    # Initialization
    b = 0
    m, n = np.shape(dataMatrix)
    alphas = np.mat(np.zeros((m, 1)))
    iter_num = 0
    # Stop after maxIter consecutive passes without any alpha update
    while iter_num < maxIter:
        alphasPairsChange = 0
        for i in range(m):
            # Step 1: compute the error Ei
            # np.multiply performs element-wise multiplication
            fXi = float(np.multiply(alphas, labelMat).T * (dataMatrix * dataMatrix[i, :].T)) + b
            Ei = fXi - float(labelMat[i])
            # Optimize alpha_i only if it violates the KKT conditions beyond the tolerance
            if ((labelMat[i] * Ei < -toler) and (alphas[i] < C)) or ((labelMat[i] * Ei > toler) and (alphas[i] > 0)):
                j = selectJrand(i, m)  # randomly pick the second variable alpha_j
                # Compute the error Ej
                fXj = float(np.multiply(alphas, labelMat).T * (dataMatrix * dataMatrix[j, :].T)) + b
                Ej = fXj - float(labelMat[j])
                # Save the old alpha values
                alphaIold = alphas[i].copy()  # copy() gives a real copy, not a reference, so the saved value is not changed by later updates
                alphaJold = alphas[j].copy()
                # Step 2: compute the bounds L and H
                if labelMat[i] != labelMat[j]:
                    L = max(0, alphas[j] - alphas[i])
                    H = min(C, C + alphas[j] - alphas[i])
                else:
                    L = max(0, alphas[j] + alphas[i] - C)
                    H = min(C, alphas[j] + alphas[i])
                if L == H:
                    print('L==H')
                    continue
                # Step 3: compute eta
                eta = 2.0 * dataMatrix[i, :] * dataMatrix[j, :].T - dataMatrix[i, :] * dataMatrix[i, :].T - dataMatrix[j, :] * dataMatrix[j, :].T
                if eta >= 0:
                    print('eta>=0')
                    continue
                # Step 4: update alpha_j
                alphas[j] -= labelMat[j] * (Ei - Ej) / eta
                # Step 5: clip alpha_j to [L, H]
                alphas[j] = clipAlpha(alphas[j], H, L)
                if abs(alphas[j] - alphaJold) < 0.00001:
                    print('alpha_j changed too little')
                    continue
                # Step 6: update alpha_i in the opposite direction
                alphas[i] += labelMat[j] * labelMat[i] * (alphaJold - alphas[j])
                # Step 7: compute b1 and b2
                b1 = b - Ei - labelMat[i] * (alphas[i] - alphaIold) * dataMatrix[i, :] * dataMatrix[i, :].T - \
                     labelMat[j] * (alphas[j] - alphaJold) * dataMatrix[i, :] * dataMatrix[j, :].T
                b2 = b - Ej - labelMat[i] * (alphas[i] - alphaIold) * dataMatrix[i, :] * dataMatrix[j, :].T - \
                     labelMat[j] * (alphas[j] - alphaJold) * dataMatrix[j, :] * dataMatrix[j, :].T
                # Step 8: update b from b1 and b2
                if (0 < alphas[i]) and (alphas[i] < C):
                    b = b1
                elif (0 < alphas[j]) and (alphas[j] < C):
                    b = b2
                else:
                    b = (b1 + b2) / 2.0
                # Count the number of optimized pairs
                alphasPairsChange += 1
                print('iteration: %d, sample: %d, alpha pairs changed: %d' % (iter_num, i, alphasPairsChange))
        if alphasPairsChange == 0:
            iter_num += 1
        else:
            iter_num = 0
        print('iteration number: %d' % iter_num)
    return b, alphas
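Running the simplified SMO on the data loaded above (C=0.6, tolerance 0.001, and at most 40 quiet passes are the usual Machine Learning in Action settings; adjust as needed):

b, alphas = smoSimple(dataMat, labelMat, 0.6, 0.001, 40)
print(b)                    # learned bias (a 1x1 matrix here)
print(alphas[alphas > 0])   # nonzero alphas mark the support vectors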
# Compute w from the learned alphas: w = sum_i alpha_i * y_i * x_i
def get_w(dataMat, labelMat, alphas):
    alphas, dataMat, labelMat = np.array(alphas), np.array(dataMat), np.array(labelMat)
    # Tile the (m,1) label column across both feature columns, weight each sample, then sum via the dot product
    w = np.dot((np.tile(labelMat.reshape(1, -1).T, (1, 2)) * dataMat).T, alphas)
    return w.tolist()
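One way to inspect the result is to draw the separating line w1*x + w2*y + b = 0 over the scatter plot. This is a minimal sketch, assuming b and alphas come from the smoSimple call above:

w = get_w(dataMat, labelMat, alphas)        # [[w1], [w2]] as nested lists
w1, w2 = float(w[0][0]), float(w[1][0])
b_val = float(np.asarray(b).ravel()[0])     # unwrap the 1x1 matrix returned by smoSimple
data_np = np.array(dataMat)
plt.scatter(data_np[:, 0], data_np[:, 1], c=labelMat)
xs = np.array([data_np[:, 0].min(), data_np[:, 0].max()])
plt.plot(xs, (-b_val - w1 * xs) / w2)       # y = (-b - w1*x) / w2 on the boundary
plt.show()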