Part1-Chapter6-核函数及手写测试

本文探讨了核函数在低维数据高维映射中的作用,以实现数据可区分性。通过高斯函数示例展示了核函数在支持向量机(SVM)中的应用,用于手写数字识别。实验结果显示,参数调整对模型性能有显著影响,参数约为10时,测试错误率最低,有效缓解过拟合问题。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

核函数法,简单来说,就是将低维的数据投射到高维,一方面这样可以降低计算难度,另一方面这样可以将低维不可区分的数据,变得可区分。常用的核函数有很多,下面使用的是高斯函数。

import matplotlib.pyplot as plt
import numpy as np
import random

class optStruct:
   def __init__(self,dataMatIn,classLabels,C,toler,kTup):
     self.X = dataMatIn
 	 self.labelMat = classLabels
  	 self.C = C
   	 self.tol = toler
    	self.m = np.shape(dataMatIn)[0]
   	 self.alphas = np.mat(np.zeros((self.m,1)))
   	 self.b = 0
    	self.eCache = np.mat(np.zeros((self.m,2)))
    	self.K = np.mat(np.zeros((self.m,self.m)))
    	for i in range(self.m):
        	self.K[:,i] = kernelTrans(self.X,self.X[i,:],kTup)

def kernelTrans(X,A,kTup):               #根据核函数类型计算xi*xj
    m,n=np.shape(X)
	K = np.mat(np.zeros((m,1)))
	if kTup[0] == 'lin' :K= A.T * X
	elif kTup[0] == 'rbf':
    	for j in range(m):
        	deltaRow = X[j,:] - A
        	K[j] = deltaRow*deltaRow.T
    	K = np.exp(K/(-1*kTup[1]**2))
	else:raise NameError('核函数无法识别')
	return K
        
def loadDataSet(fileName):               #载入文件
    dataMat = [];labelMat = []
	fr = open(fileName)
	for line in fr.readlines():
   	lineArr = line.strip().split('\t')
    	dataMat.append([float(lineArr[0]),float(lineArr[1])])
   	 labelMat.append(float(lineArr[-1]))
 return dataMat,labelMat

def calcEk(oS,k):                        #计算误差
    fXk = float(np.multiply(oS.alphas,oS.labelMat).T*(oS.K[:,k]) + oS.b)
    Ek = fXk - float(oS.labelMat[k])
    return Ek

def selectJrand(i,m):                    #随机选取aj
    j = i
    while(j == i):
        j = int(random.uniform(0,m))
        return j

def selectJ(i,oS,Ei):                    #选取一个步长最大的aj
    maxK = -1
    maxDeltaE = 0
    Ej = 0
    oS.eCache[i] = [1,Ei]
    validEcacheList = np.nonzero(oS.eCache[:,0].A)[0]
    if(len(validEcacheList) > 1):
        for k in validEcacheList:
            if k==i:continue
            Ek = calcEk(oS,k)
            deltaE = abs(Ei-Ek)
            if(deltaE > maxDeltaE):
                maxK = k
                maxDeltaE = deltaE
                Ej = Ek
        return maxK,Ej
    else:
        j = selectJrand(i,oS.m)
        Ej = calcEk(oS,j)
    return j,Ej

def updateEk(oS,k):                       #更新Ek
    Ek = calcEk(oS,k)
    oS.eCache[k] = [1,Ek]

def clipAlpha(aj,H,L):
     if aj > H:
        aj = H
    if aj < L:
        aj = L
    return aj

def innerL(i,oS):                         #计算aiNew,ajNew,b1,b2并返回是否更新
    Ei = calcEk(oS,i)
    if((oS.labelMat[i] * Ei< -oS.tol)and(oS.alphas[i] \
    < oS.C)) or ((oS.labelMat[i]*Ei> oS.tol)and(oS.alphas[i]> 0)):
        j,Ej = selectJ(i,oS,Ei)
        alphaIold = oS.alphas[i].copy()
        alphaJold = oS.alphas[j].copy()
        if(oS.labelMat[i]!= oS.labelMat[j]):
            L=max(0,oS.alphas[j] - oS.alphas[i])
            H=min(oS.C,oS.C+oS.alphas[j] - oS.alphas[i])
        else:
            L=max(0,oS.alphas[j]+oS.alphas[i] - oS.C)
            H=min(oS.C,oS.alphas[j]+oS.alphas[i])
        if L == H:
            print("L==H")
            return 0
        eta = 2.0*oS.K[i,j]- oS.K[i,i]- oS.K[j,j]
        if eta >= 0:
            print("eta>=0")
        oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej)/eta
        oS.alphas[j] = clipAlpha(oS.alphas[j],H,L)
        updateEk(oS,j)
        if(abs(oS.alphas[j] - alphaJold) < 0.00001):
            print("AlphaJ变化太小")
            return 0
        oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j])
        updateEk(oS,i)
        b1 = oS.b - Ei- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,i] - oS.labelMat[j]*  (oS.alphas[j]-alphaJold)*oS.K[i,j]
        b2 = oS.b - Ej- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,j] - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[j,j]
        if(0<oS.alphas[i])and(oS.C>oS.alphas[i]):oS.b = b1
        elif(oS.alphas[j]>0)and(oS.C>oS.alphas[j]):oS.b = b2
        else:oS.b = (b1+b2)/2
        return 1
    else:
        return 0

def smoP(dataMatIn,classLabels,C,\        #smo改进算法,应用核函数
toler,maxIter,kTup = ('lin',0)):  
    oS = optStruct(np.mat(dataMatIn),np.mat(classLabels).transpose(),C,toler,kTup)
    iteration = 0
    entireSet = True
    alphaPairsChanged = 0
    while(iteration<maxIter)and((alphaPairsChanged>0)or(entireSet)):
        alphaPairsChanged = 0
        if entireSet:
            for i in range(oS.m):
                alphaPairsChanged += innerL(i,oS)
                print("全部遍历:第%d次迭代 样本:%d,alpha优化次数:%d"%(iteration,i,alphaPairsChanged))
            iteration += 1
        else:
            nonBoundIs = np.nonzero((oS.alphas.A > 0) * (oS.alphas.A<C))[0]
        for i in nonBoundIs :
            alphaPairsChanged += innerL(i,oS)
            print("非边界遍历:第%d次迭代 样本:%d,alpha优化次数:%d"%(iteration,i,alphaPairsChanged))
        iteration +=1
    if entireSet:
        entireSet = False
    elif(alphaPairsChanged == 0):
        entireSet = True
    print("迭代次数:%d"%iteration)
return oS.b,oS.alphas

def showClassifer(dataMat,classLabels,\    #绘图
w,b,alphas):
    dataPlus = []
    dataMinus = []
    for i in range(len(dataMat)):
        if classLabels[i] > 0:
            dataPlus.append(dataMat[i])
        else:
            dataMinus.append(dataMat[i])
    dataPlusNp = np.array(dataPlus)
    dataMinusNp = np.array(dataMinus)
    plt.scatter(np.transpose(dataPlusNp)[0],np.transpose(dataPlusNp)[1],s=30,alpha=0.7)
    plt.scatter(np.transpose(dataMinusNp)[0],np.transpose(dataMinusNp)[1],s=30,alpha=0.7)
    x1 = max(dataMat)[0]
    x2 = min(dataMat)[0]
    a1,a2=w
    b=float(b)
    a1 = float(a1[0])
    a2 = float(a2[0])
    y1,y2 = (-b-a1*x1)/a2,(-b-a1*x2)/a2
    plt.plot([x1,x2],[y1,y2])
    for i,alpha in enumerate(alphas):
        if abs(alpha)>0:
            x,y=dataMat[i]
            plt.scatter([x],[y],s=150,c='none',alpha=0.7,linewidth=1.5,edgecolor='red')
    plt.show()

def calcWs(alphas,dataArr,classLabels):   
    X = np.mat(dataArr)
    labelMat = np.mat(classLabels).transpose()
    m,n = np.shape(X)
    w = np.zeros((n,1))
    for i in range(m):
        w += np.multiply(alphas[i]*labelMat[i],X[i,:].T)
    return w


def testRbf(k1=1.3):                   #计算训练集及测试集的错误率
    dataArr,labelArr = loadDataSet('testSetRBF.txt')
    b,alphas = smoP(dataArr,labelArr,200,0.0001,100,('rbf',k1))
    datMat = np.mat(dataArr)
    labelMat = np.mat(labelArr).transpose()
    svInd = np.nonzero(alphas.A > 0)[0]
    sVs = datMat[svInd]
    labelSV = labelMat[svInd]
    print("支持向量个数:%d"%np.shape(sVs)[0])
    m,n = np.shape(datMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs,datMat[i,:],('rbf',k1))
        predict = kernelEval.T*np.multiply(labelSV,alphas[svInd]+b)
        if np.sign(predict)!=np.sign(labelArr[i]):errorCount += 1
        print("训练集错误率%f"%(float((errorCount)/m)*100))
    dataArr,labelArr = loadDataSet('testSetRBF2.txt')
    errorCount = 0
    datMat = np.mat(dataArr)
    labelMat = np.mat(labelArr).transpose()
    m,n = np.shape(datMat)
    for i in range(m):
        kernelEval = kernelTrans(sVs,datMat[i,:],('rbf',k1))
         predict = kernelEval.T*np.multiply(labelSV,alphas[svInd]+b)
        if np.sign(predict)!=np.sign(labelArr[i]):errorCount += 1
        print("测试集错误率%f"%((float(errorCount)/m)*100))

def get_w(dataMat, labelMat, alphas):
    alphas, dataMat, labelMat = np.array(alphas), np.array(dataMat), np.array(labelMat)
    w = np.dot((np.tile(labelMat.reshape(1, -1).T, (1, 2)) * dataMat).T, alphas)
    return w.tolist()

最终结果如下:
在这里插入图片描述
这是k1=1.3的图从图中来看划分的还是比较科学的,蓝色类基本都在一起,不过不知道为什么,训练集和测试集的错误率都在50左右

下面是一个使用核函数的svm方法的实例:识别一个用32*32大小图像保存的手写数字

import matplotlib.pyplot as plt
import numpy as np
import random

class optStruct:
    def __init__(self,dataMatIn,classLabels,C,toler,kTup):
        self.X = dataMatIn
        self.labelMat = classLabels
        self.C = C
        self.tol = toler
        self.m = np.shape(dataMatIn)[0]
        self.alphas = np.mat(np.zeros((self.m,1)))
        self.b = 0
        self.eCache = np.mat(np.zeros((self.m,2)))
        self.K = np.mat(np.zeros((self.m,self.m)))
        for i in range(self.m):
            self.K[:,i] = kernelTrans(self.X,self.X[i,:],kTup)

def kernelTrans(X,A,kTup):
    m,n=np.shape(X)
    K = np.mat(np.zeros((m,1)))
    if kTup[0] == 'lin' :K= A.T * X
    elif kTup[0] == 'rbf':
        for j in range(m):
            deltaRow = X[j,:] - A
            K[j] = deltaRow*deltaRow.T
        K = np.exp(K/(-1*kTup[1]**2))
    else:raise NameError('核函数无法识别')
    return K
        
def loadDataSet(fileName):
    dataMat = [];labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = line.strip().split('\t')
        dataMat.append([float(lineArr[0]),float(lineArr[1])])
        labelMat.append(float(lineArr[-1]))
    return dataMat,labelMat

def calcEk(oS,k):
    fXk = float(np.multiply(oS.alphas,oS.labelMat).T*(oS.K[:,k]) + oS.b)
    Ek = fXk - float(oS.labelMat[k])
    return Ek

def selectJrand(i,m):
    j = i
    while(j == i):
        j = int(random.uniform(0,m))
        return j

def selectJ(i,oS,Ei):
    maxK = -1
    maxDeltaE = 0
    Ej = 0
    oS.eCache[i] = [1,Ei]
    validEcacheList = np.nonzero(oS.eCache[:,0].A)[0]
    if(len(validEcacheList) > 1):
        for k in validEcacheList:
            if k==i:continue
            Ek = calcEk(oS,k)
            deltaE = abs(Ei-Ek)
            if(deltaE > maxDeltaE):
                maxK = k
                maxDeltaE = deltaE
                Ej = Ek
        return maxK,Ej
    else:
        j = selectJrand(i,oS.m)
        Ej = calcEk(oS,j)
    return j,Ej

def updateEk(oS,k):
    Ek = calcEk(oS,k)
    oS.eCache[k] = [1,Ek]

def clipAlpha(aj,H,L):
    if aj > H:
        aj = H
    if aj < L:
        aj = L
    return aj

def innerL(i,oS):
    Ei = calcEk(oS,i)
    if((oS.labelMat[i] * Ei< -oS.tol)and(oS.alphas[i] < oS.C)) or ((oS.labelMat[i]*Ei> oS.tol)and(oS.alphas[i]> 0)):
        j,Ej = selectJ(i,oS,Ei)
        alphaIold = oS.alphas[i].copy()
        alphaJold = oS.alphas[j].copy()
        if(oS.labelMat[i]!= oS.labelMat[j]):
            L=max(0,oS.alphas[j] - oS.alphas[i])
            H=min(oS.C,oS.C+oS.alphas[j] - oS.alphas[i])
        else:
            L=max(0,oS.alphas[j]+oS.alphas[i] - oS.C)
            H=min(oS.C,oS.alphas[j]+oS.alphas[i])
        if L == H:
            print("L==H")
            return 0
        eta = 2.0*oS.K[i,j]- oS.K[i,i]- oS.K[j,j]
        if eta >= 0:
            print("eta>=0")
        oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej)/eta
        oS.alphas[j] = clipAlpha(oS.alphas[j],H,L)
        updateEk(oS,j)
        if(abs(oS.alphas[j] - alphaJold) < 0.00001):
            print("AlphaJ变化太小")
            return 0
        oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j])
        updateEk(oS,i)
        b1 = oS.b - Ei- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,i] - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[i,j]
        b2 = oS.b - Ej- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,j] - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[j,j]
        if(0<oS.alphas[i])and(oS.C>oS.alphas[i]):oS.b = b1
        elif(oS.alphas[j]>0)and(oS.C>oS.alphas[j]):oS.b = b2
        else:oS.b = (b1+b2)/2
        return 1
    else:
        return 0

def smoP(dataMatIn,classLabels,C,toler,maxIter,kTup = ('lin',0)):
    oS = optStruct(np.mat(dataMatIn),np.mat(classLabels).transpose(),C,toler,kTup)
    iteration = 0
    entireSet = True
    alphaPairsChanged = 0
    while(iteration<maxIter)and((alphaPairsChanged>0)or(entireSet)):
        alphaPairsChanged = 0
        if entireSet:
            for i in range(oS.m):
                alphaPairsChanged += innerL(i,oS)
                print("全部遍历:第%d次迭代 样本:%d,alpha优化次数:%d"%(iteration,i,alphaPairsChanged))
            iteration += 1
        else:
            nonBoundIs = np.nonzero((oS.alphas.A > 0) * (oS.alphas.A<C))[0]
            for i in nonBoundIs :
                alphaPairsChanged += innerL(i,oS)
                print("非边界遍历:第%d次迭代 样本:%d,alpha优化次数:%d"%(iteration,i,alphaPairsChanged))
            iteration +=1
        if entireSet:
           entireSet = False
        elif(alphaPairsChanged == 0):
            entireSet = True
        print("迭代次数:%d"%iteration)
    return oS.b,oS.alphas

def showClassifer(dataMat,classLabels,w,b,alphas):
    dataPlus = []
    dataMinus = []
    for i in range(len(dataMat)):
        if classLabels[i] > 0:
            dataPlus.append(dataMat[i])
        else:
            dataMinus.append(dataMat[i])
    dataPlusNp = np.array(dataPlus)
    dataMinusNp = np.array(dataMinus)
    plt.scatter(np.transpose(dataPlusNp)[0],np.transpose(dataPlusNp)[1],s=30,alpha=0.7)
    plt.scatter(np.transpose(dataMinusNp)[0],np.transpose(dataMinusNp)[1],s=30,alpha=0.7)
    x1 = max(dataMat)[0]
    x2 = min(dataMat)[0]
    a1,a2=w
    b=float(b)
    a1 = float(a1[0])
    a2 = float(a2[0])
    y1,y2 = (-b-a1*x1)/a2,(-b-a1*x2)/a2
    plt.plot([x1,x2],[y1,y2])
    for i,alpha in enumerate(alphas):
        if abs(alpha)>0:
            x,y=dataMat[i]
            plt.scatter([x],[y],s=150,c='none',alpha=0.7,linewidth=1.5,edgecolor='red')
    plt.show()

def calcWs(alphas,dataArr,classLabels):
    X = np.mat(dataArr)
    labelMat = np.mat(classLabels).transpose()
    m,n = np.shape(X)
    w = np.zeros((n,1))
    for i in range(m):
        w += np.multiply(alphas[i]*labelMat[i],X[i,:].T)
    return w


def testRbf(k1=1.3):
    dataArr,labelArr = loadDataSet('testSetRBF.txt')
    b,alphas = smoP(dataArr,labelArr,200,0.0001,100,('rbf',k1))
    datMat = np.mat(dataArr)
    labelMat = np.mat(labelArr).transpose()
    svInd = np.nonzero(alphas.A > 0)[0]
    sVs = datMat[svInd]
    labelSV = labelMat[svInd]
    print("支持向量个数:%d"%np.shape(sVs)[0])
    m,n = np.shape(datMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs,datMat[i,:],('rbf',k1))
        predict = kernelEval.T*np.multiply(labelSV,alphas[svInd]+b)
        if np.sign(predict)!=np.sign(labelArr[i]):errorCount += 1
        print("训练集错误率%f"%(float((errorCount)/m)*100))
    dataArr,labelArr = loadDataSet('testSetRBF2.txt')
    errorCount = 0
    datMat = np.mat(dataArr)
    labelMat = np.mat(labelArr).transpose()
    m,n = np.shape(datMat)
    for i in range(m):
        kernelEval = kernelTrans(sVs,datMat[i,:],('rbf',k1))
        predict = kernelEval.T*np.multiply(labelSV,alphas[svInd]+b)
        if np.sign(predict)!=np.sign(labelArr[i]):errorCount += 1
        print("测试集错误率%f"%((float(errorCount)/m)*100))

def get_w(dataMat, labelMat, alphas):
    alphas, dataMat, labelMat = np.array(alphas), np.array(dataMat), np.array(labelMat)
    w = np.dot((np.tile(labelMat.reshape(1, -1).T, (1, 2)) * dataMat).T, alphas)
    return w.tolist()

def loadImages(dirName):
    from os import listdir
    trainingFileList= listdir
    hwLabels = []
    trainingFileList = listdir(dirName)
    m = len(trainingFileList)
    trainingMat = np.zeros((m,1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')
        classNumStr = int(fileNameStr.split('_')[0])
        if classNumStr == 9:hwLabels.append(-1)
        else:hwLabels.append(1)
        trainingMat[i,:] = img2vector('%s/%s'%(dirName,fileNameStr))
    return trainingMat,hwLabels

def testDigits(kTup=('rbf',10)):
    dataArr,labelArr = loadImages('trainingDigits')
    b,alphas = smoP(dataArr,labelArr,200,0.0001,10000,kTup)
    dataMat=np.mat(dataArr)
    labelMat=np.mat(labelArr).transpose()
    svInd=np.nonzero(alphas.A>0)[0]
    sVs= dataMat[svInd]
    labelSV = labelMat[svInd]
    print("有%d个支持向量"%np.shape(sVs)[0])
    m,n=np.shape(dataMat)
    errorCount = 0
    for i in range(m):
        kernelEval = kernelTrans(sVs,dataMat[i,:],kTup)
        predict=kernelEval.T*np.multiply(labelSV,alphas[svInd])+b
        if np.sign(predict)!=np.sign(labelArr[i]):errorCount +=1
    print("训练集错误率为%f"%(float(errorCount/m)))
    dataArr,labelArr=loadImages('testDigits')
    errorCount = 0
    datMat=np.mat(dataArr)
    labelMat=np.mat(labelArr).transpose()
    m,n=np.shape(datMat)
    for i in range(m):
        kernelEval=kernelTrans(sVs,datMat[i,:],kTup)
        predict=kernelEval.T*np.multiply(labelSV,alphas[svInd])+b
        if(np.sign(predict)!=np.sign(labelArr[i])):errorCount+=1
    print("测试集错误率为%f"%(float(errorCount/m)))

def img2vector(filename):
    returnVect=np.zeros((1,1024))
    fr=open(filename)
    for i in range(32):
        lineStr=fr.readline()
        for j in range(32):
            returnVect[0,32*i+j]=int(lineStr[j])
    return returnVect

这个代码和之前的几乎没有区别,除了它的数据来源不是txt文件,而是一个图片。且这个代码的RBF方法里,其参数不再固定,而是在调用时确定。
当使用不同的参数时,其结果如下:
在这里插入图片描述
可以看出,当参数上升时,支持向量书下降,训练错误率先上升再下降,而测试错误率则呈一个卧倒的S型。
所以当参数设置非常小时,会产生过拟合的情况,随着参数的提升,过拟合的情况虽然有波动,但相比较而言大幅度缓解了。
该数据的最佳参数设置在10左右。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值