用Python实现了留一法和十折交叉检验,对iris和wine数据集进行三分类。算法实现起来不难,但由于是第一次写,迭代函数的封装性不好(因为假设了类别在最后一列)。主要需要注意的是十折交叉检验时的分层抽样:两个数据集都比较整齐,一、二、三类依次排成三块,所以这里用数据集每一行元组的下标除以10的余数作为其所处的块号。其余就没有什么了。只是因为先做的是iris,其样本数量特别规整,就特别蠢地假设了总数是10和类别数的倍数,故对wine进行分类的时候,又重写了这个方法。
另外,迭代次数只是简单设置为5次就得到了很好的模型,就没有再对其进行调整。
在调试过程中遇到了最后模型出现nan的问题,主要是因为迭代次数太多:不断迭代使得模型中的数值不断变小,而计算机精度有限,最终在迭代过程中出现0作为除数的情况,因而得到nan。
对wine数据集分类代码如下:
import os
import numpy
import string
import math
def xz(x):
    """Return ``x`` with one extra row holding the constant 1 appended.

    ``x`` is expected to be a column vector of shape ``(n, 1)``; the result
    has shape ``(n + 1, 1)``.  The trailing 1 is presumably the input that
    pairs with the intercept entry of the coefficient vector.
    """
    # numpy.vstack is the supported spelling; numpy.row_stack is only a
    # deprecated alias of it.
    return numpy.vstack((x, 1))
def betaInit(m):
    """Build the starting coefficient vector for a data matrix ``m``.

    Returns a float column vector of zeros with one row per column of ``m``
    (shape ``(m.shape[1], 1)``).
    """
    n_coefficients = m.shape[1]
    return numpy.zeros((n_coefficients, 1))
def p1(xz, beta):
    """Return the logistic probability ``exp(s) / (1 + exp(s))``.

    ``s`` is the scalar ``beta.T . xz``; both arguments are column vectors
    of the same length.  The result is a Python float in ``[0, 1]``.
    """
    mul = numpy.dot(beta.T, xz)[0][0]
    # Only ever exponentiate a non-positive number so math.exp cannot raise
    # OverflowError for large |mul|; it underflows harmlessly to 0.0 instead.
    if mul >= 0:
        return 1 / (1 + math.exp(-mul))
    t = math.exp(mul)
    return t / (1 + t)
def p0(xz, beta):
    """Return the complementary logistic probability ``1 / (1 + exp(s))``.

    ``s`` is the scalar ``beta.T . xz``; both arguments are column vectors
    of the same length.  The result is a Python float in ``[0, 1]``.
    """
    mul = numpy.dot(beta.T, xz)[0][0]
    # Only ever exponentiate a non-positive number so math.exp cannot raise
    # OverflowError for large |mul|; it underflows harmlessly to 0.0 instead.
    if mul >= 0:
        t = math.exp(-mul)
        return t / (1 + t)
    return 1 / (1 + math.exp(mul))
def arrayToMatrix(a):
    """Convert a flat sequence of numbers into a float column vector.

    Returns a new array of shape ``(len(a), 1)``; an empty input gives shape
    ``(0, 1)``.  The result never shares memory with ``a``.
    """
    # One copy plus a reshape replaces stacking the elements one at a time,
    # which re-allocated the whole array on every step.
    return numpy.array(a, dtype=float).reshape(-1, 1)
def iteration(beta, m, start, end, rPos):
    """Perform one Newton step of logistic-regression fitting.

    Parameters
    ----------
    beta : column vector of current coefficients; its last entry multiplies
        the constant 1 appended to every sample.
    m : 2-D data matrix, one sample per row.
    start, end : the feature columns are ``m[:, start:end]`` (ordinary slice
        semantics, so an ``end`` past the last column is clamped).
    rPos : index of the column holding the 0/1 target.

    Returns
    -------
    The updated coefficient column vector ``beta - H^-1 . g``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the second-derivative matrix ``H`` is singular.
    """
    data = numpy.asarray(m, dtype=float)
    features = data[:, start:end]
    # Append the constant-1 column that pairs with the last entry of beta.
    design = numpy.hstack((features, numpy.ones((features.shape[0], 1))))
    target = data[:, rPos].reshape(-1, 1)
    # Sigmoid written as exp(-log(1 + exp(-s))) so large |s| cannot overflow.
    prob = numpy.exp(-numpy.logaddexp(0.0, -numpy.dot(design, beta)))
    # First derivative: -sum_i x_i * (y_i - p_i).
    dbeta = -numpy.dot(design.T, target - prob)
    # Second derivative: sum_i x_i x_i^T * p_i * (1 - p_i).
    d2beta = numpy.dot(design.T, design * (prob * (1.0 - prob)))
    # Solve H . step = g directly instead of forming the explicit inverse.
    return beta - numpy.linalg.solve(d2beta, dbeta)
def y(beta, x):
    """Return the model output ``1 / (1 + exp(-s))`` for one sample.

    ``s`` is the scalar ``beta.T . x``; ``beta`` and ``x`` are column vectors
    of the same length.  The result is a Python float in ``[0, 1]``.
    """
    score = numpy.dot(beta.T, x)[0][0]
    # Only ever exponentiate a non-positive number so math.exp cannot raise
    # OverflowError for very negative scores.
    if score >= 0:
        return 1 / (1 + math.exp(-score))
    t = math.exp(score)
    return t / (1 + t)
def train(m, start, end, rPos, n_iter=5):
    """Fit coefficients by repeating the Newton update a fixed number of times.

    Parameters
    ----------
    m : 2-D data matrix, one sample per row.
    start, end, rPos : passed unchanged to ``iteration``.
    n_iter : number of update steps to run.  Defaults to 5, the count that
        was previously hard-coded.

    Returns
    -------
    The coefficient column vector after ``n_iter`` updates, starting from the
    all-zero vector produced by ``betaInit``.
    """
    beta = betaInit(m)
    for _ in range(n_iter):
        beta = iteration(beta, m, start, end, rPos)
    return beta
def trainNum(m, num, start, end, rPos):
    """Train a one-vs-rest model for the class labelled ``num``.

    Column ``rPos`` of a copy of ``m`` is rewritten to 1 where it equals
    ``num`` and 0 elsewhere; the relabelled copy is then handed to ``train``
    together with ``start``, ``end`` and ``rPos``.  Returns whatever
    ``train`` returns.
    """
    # Work on a copy so the caller's data set keeps its original labels.
    relabelled = m.copy()
    relabelled[:, rPos] = numpy.where(relabelled[:, rPos] != num, 0, 1)
    return train(relabelled, start, end, rPos)
def newKFold(m, k):
    """Reorder the rows of ``m`` so rows with equal ``index % k`` are adjacent.

    The output lists rows 0, k, 2k, ... first, then rows 1, k+1, 2k+1, ...,
    and so on.  It is a new float array with the same number of columns as
    ``m``; for ``k <= 0`` it has zero rows.
    """
    groups = [m[offset::k] for offset in range(k)]
    # The empty float head keeps the result float-typed and keeps the
    # (0, columns) shape when there are no groups; a single concatenate
    # replaces re-allocating the array once per row.
    return numpy.concatenate([numpy.empty((0, m.shape[1]))] + groups)
# Load the data set: one comma-separated record of 14 numeric fields per line.
# The context manager closes the file handle as soon as the lines are read.
with open('E:\\wine.txt') as file:
    mylist = file.readlines()
rows = []
for line in mylist:
    # Stop at the first line that contains no comma (e.g. a blank tail line).
    if line.find(",") < 0:
        break
    rows.append([float(field) for field in line.split(",")])
# Stack all records once; the empty (0, 14) head keeps the 14-column shape
# even when no record was read and rejects records of a different width.
m = numpy.vstack([numpy.empty((0, 14))] + rows)
for a in m:
    print(a)
# k-fold cross-validation with one one-vs-rest model per class (1, 2, 3).
k = 10
c = 3  # number of classes
numeach = m.shape[0] / k  # nominal fold size; may be fractional
# newKFold groups rows by index modulo k, so each contiguous fold below
# draws from every part of the original row order.
mt = newKFold(m, k)
n_rows, n_cols = mt.shape
resultAray1 = numpy.zeros(shape=(n_rows, 1))
E = 0
err = 0
for i in range(k):
    # Integer fold boundaries: floating-point rounding of i * (n_rows / k)
    # can never drop or misplace a row, and the last fold ends at n_rows.
    lo = i * n_rows // k
    hi = (i + 1) * n_rows // k
    testm1 = mt[lo:hi]
    trainm1 = numpy.vstack((mt[:lo], mt[hi:]))
    # Features are columns 1 .. n_cols-1, the label is column 0.  The end
    # bound is the column count (the row count only worked via clamping).
    beta1 = trainNum(trainm1, 1, 1, n_cols, 0)
    beta2 = trainNum(trainm1, 2, 1, n_cols, 0)
    beta3 = trainNum(trainm1, 3, 1, n_cols, 0)
    for j in range(testm1.shape[0]):
        # Build the sample column (features plus trailing 1) once per row.
        sample = numpy.vstack((arrayToMatrix(testm1[j, 1:n_cols]), 1))
        r1 = y(beta1, sample)
        r2 = y(beta2, sample)
        r3 = y(beta3, sample)
        # NOTE(review): isMax is not defined in the visible part of this
        # file; presumably isMax(a, b, c) == 0 means a is the largest.
        # NOTE(review): E compares a model output in [0, 1] with the raw
        # class label (1, 2 or 3) - confirm this is the intended measure.
        if isMax(r1, r2, r3) == 0:
            E = E + (r1 - testm1[j, 0]) ** 2
            resultAray1[lo + j] = 1
        elif isMax(r2, r1, r3) == 0:
            E = E + (r2 - testm1[j, 0]) ** 2
            resultAray1[lo + j] = 2
        else:
            E = E + (r3 - testm1[j, 0]) ** 2
            resultAray1[lo + j] = 3
# Count and show every row whose predicted class differs from its label.
for i in range(n_rows):
    if mt[i, 0] != resultAray1[i]:
        err = err + 1
        print(mt[i], resultAray1[i])
print("k折检验法,E:", E / n_rows, "错误率: ", err / n_rows)
# Leave-one-out validation: each row in turn is the single test sample and
# the three one-vs-rest models are trained on all remaining rows.
loo_rows, loo_cols = m.shape
resultAray2 = numpy.zeros(shape=(loo_rows, 1))
E = 0
err = 0
for i in range(loo_rows):
    testm2 = m[i:i + 1]
    trainm2 = numpy.vstack((m[:i], m[i + 1:]))
    # Features are columns 1 .. loo_cols-1, the label is column 0.
    beta1 = trainNum(trainm2, 1, 1, loo_cols, 0)
    beta2 = trainNum(trainm2, 2, 1, loo_cols, 0)
    beta3 = trainNum(trainm2, 3, 1, loo_cols, 0)
    # Build the sample column (features plus trailing 1) once.
    sample = numpy.vstack((arrayToMatrix(testm2[0, 1:loo_cols]), 1))
    r1 = y(beta1, sample)
    r2 = y(beta2, sample)
    r3 = y(beta3, sample)
    # NOTE(review): isMax is not defined in the visible part of this file;
    # presumably isMax(a, b, c) == 0 means a is the largest.
    # NOTE(review): E compares a model output in [0, 1] with the raw class
    # label (1, 2 or 3) - confirm this is the intended measure.
    if isMax(r1, r2, r3) == 0:
        E = E + (r1 - m[i, 0]) ** 2
        resultAray2[i] = 1
    elif isMax(r2, r1, r3) == 0:
        E = E + (r2 - m[i, 0]) ** 2
        resultAray2[i] = 2
    else:
        E = E + (r3 - m[i, 0]) ** 2
        resultAray2[i] = 3
# Count and show every row whose predicted class differs from its label.
for i in range(loo_rows):
    if m[i, 0] != resultAray2[i]:
        err = err + 1
        print(m[i], resultAray2[i])
print("留一法,E:", E / loo_rows, "错误率: ", err / loo_rows)