实现 PLA(Perceptron Learning Algorithm,感知机学习算法)
import pandas as pd
import numpy as np
# Default location of the whitespace-separated data file that the PLA code in
# this section reads (opened by PLA() and PLA2(), and passed to the PLA class
# in the __main__ block).
dataPath='../data/data.dat'
def PLA(path=None):
    """Run the cyclic perceptron learning algorithm (PLA) on a data file.

    Each line of the file holds four whitespace-separated features followed
    by a label in {+1, -1}.  A constant bias input ``x0 = -1`` is prepended
    to every sample and the weight vector starts at zero.  Samples are
    visited cyclically in file order.  A sample is treated as misclassified
    when ``y * (w . x) < 0``, or when ``w . x == 0`` and ``y == 1`` (a zero
    score counts as predicting -1).  Training stops as soon as every sample
    has been classified correctly in a row since the last update, so no
    extra pass over the data is made.

    Note: the loop only terminates when the data is linearly separable.

    Args:
        path: Data file to read.  Defaults to the module-level ``dataPath``.

    Returns:
        The number of weight updates performed (the value is also printed).
    """
    if path is None:
        path = dataPath
    # sep=r'\s+' is the supported replacement for the deprecated
    # delim_whitespace=True.
    df = pd.read_csv(path, header=None, sep=r'\s+',
                     names=['x1', 'x2', 'x3', 'x4', 'y'])
    df.insert(0, 'x0', -1)
    y = df['y'].values
    data = df.drop(columns='y').values.astype(float)

    n = len(data)
    # Plain ``float`` instead of ``np.float`` (removed in NumPy 1.24).
    w = np.zeros(data.shape[1], dtype=float)
    count = 0   # number of weight updates
    streak = 0  # consecutive correct samples since the last update
    i = 0       # cursor into the data

    # Stop only after *all* n samples were correct in a row.  The original
    # compared against n - 1 and wrapped the cursor one step early, so the
    # last sample in the file was never examined.
    while streak < n:
        score = np.dot(w, data[i])
        if score * y[i] < 0 or (score == 0 and y[i] == 1):
            w += data[i] * y[i]
            count += 1
            streak = 0
        else:
            streak += 1
        i = (i + 1) % n

    print(count)
    return count
def PLA2(path=None):
    """Run PLA in whole passes, stopping after one pass without any update.

    Each line of the file holds whitespace-separated features followed by a
    label in {+1, -1}.  A bias input ``x0 = -1`` is prepended to every sample
    and the weights start at zero.  A sample is treated as misclassified when
    ``y * sign(w . x) < 0``, or when ``w . x == 0`` and ``y == 1``.  Unlike
    ``PLA``, convergence is only detected by one additional complete pass in
    which no sample triggers an update.

    Note: the loop only terminates when the data is linearly separable.

    Args:
        path: Data file to read.  Defaults to the module-level ``dataPath``.

    Returns:
        The number of weight updates performed (the value is also printed).
    """
    if path is None:
        path = dataPath
    # Parse the file once up front; the original re-opened and re-parsed it
    # on every pass and split the feature string by hand.
    samples = pd.read_csv(path, header=None, sep=r'\s+').values.astype(float)
    X = np.insert(samples[:, :-1], 0, -1, axis=1)  # prepend bias column x0 = -1
    Y = samples[:, -1]

    # Plain ``float`` instead of ``np.float`` (removed in NumPy 1.24).
    w = np.zeros(X.shape[1], dtype=float)
    count = 0
    while True:
        halt = True
        for x, y in zip(X, Y):
            score = np.dot(w, x)
            if score * y < 0 or (score == 0 and y == 1):
                w += x * y
                count += 1
                halt = False
        if halt:
            break
    print(count)
    return count
在这里实现了两个版本的 PLA。第一个版本将所有数据加载入内存,同时对已遍历的数据进行计数标记,保证在找到 hypothesis 之后只会再完整遍历一遍数据集,不会有多余的遍历。
第二个版本代码上较为简洁,只是在找到 hypothesis 之后,仍需要从头再完整遍历一遍数据集(且期间没有任何更新)才能作为终止条件。
但是上面的代码实在是写得太垃圾了,之后想更改什么东西都不太好更改,所以还是需要重构一下,进行模块化:
class DataSet:
    """In-memory copy of a whitespace-separated perceptron data file.

    Each line of the file holds four features followed by a label.  After
    loading, ``X`` is the feature matrix with a constant bias column
    ``x0 = -1`` prepended and ``Y`` is the label vector.
    """

    def __init__(self, dataPath):
        """Remember ``dataPath`` and load the file immediately."""
        self.dataPath = dataPath
        self.X = []
        self.Y = []
        self._loadData()

    def _loadData(self):
        """Read ``self.dataPath`` into ``self.X`` and ``self.Y``."""
        # Read from self.dataPath: the original passed the module-level
        # ``dataPath`` here, silently ignoring the constructor argument.
        # sep=r'\s+' replaces the deprecated delim_whitespace=True.
        df = pd.read_csv(self.dataPath, header=None, sep=r'\s+',
                         names=['x1', 'x2', 'x3', 'x4', 'y'])
        df.insert(0, 'x0', -1)
        # np.array() copies, so the arrays are always writable for the
        # in-place shuffle (pandas may hand out read-only views).
        self.Y = np.array(df['y'].values)
        self.X = np.array(df.drop(columns='y').values)

    def _shuffleInUnion(self, a, b):
        """Shuffle ``a`` and ``b`` in place with the same permutation.

        The global NumPy random state is captured before shuffling ``a`` and
        restored before shuffling ``b`` so both draws are identical.

        Raises:
            ValueError: if the two sequences differ in length, because the
                replayed random state would then not keep pairs together.
        """
        if len(a) != len(b):
            raise ValueError('a and b must have the same length')
        randomState = np.random.get_state()
        np.random.shuffle(a)
        np.random.set_state(randomState)
        np.random.shuffle(b)

    def shuffleData(self):
        """Shuffle samples and labels together, in place."""
        self._shuffleInUnion(self.X, self.Y)
class PLA:
    """Perceptron learning algorithm driver on top of a ``DataSet``."""

    def __init__(self, dataPath):
        """Load the data file at ``dataPath`` into ``self.data``."""
        self.dataPath = dataPath
        self.data = DataSet(dataPath)

    def naviePLA(self, eta=1, printOut=False):
        """Run cyclic PLA over ``self.data`` in its current order.

        The weights start at zero.  A sample is misclassified when
        ``np.sign(w . x) != y`` (so a zero score always triggers an update).
        Training stops once every sample has been classified correctly in a
        row since the last update.

        Note: the loop only terminates when the data is linearly separable.

        Args:
            eta: Learning rate applied to each update.
            printOut: When true, print the number of updates on completion.

        Returns:
            The number of weight updates performed.
        """
        X = np.asarray(self.data.X, dtype=float)
        Y = self.data.Y
        n = len(Y)
        # Size the weights from the data instead of hard-coding 5, and use
        # plain ``float`` (``np.float`` was removed in NumPy 1.24).
        w = np.zeros(X.shape[-1], dtype=float)
        count = 0   # number of weight updates
        streak = 0  # consecutive correct samples since the last update
        i = 0       # cursor into the data

        # Require all n samples to be correct in a row.  The original used
        # n - 1 for both the wrap-around and the stop test, so the last
        # sample was never examined.
        while streak < n:
            if np.sign(np.dot(w, X[i])) != Y[i]:
                w += eta * X[i] * Y[i]
                count += 1
                streak = 0
            else:
                streak += 1
            i = (i + 1) % n

        if printOut:
            print('after %d PLA is finished' % (count))
        return count

    def CyclePLA(self, epochTime=2000, eta=1, printOut=False):
        """Average the PLA update count over several shuffled runs.

        Args:
            epochTime: Number of runs; the data is reshuffled before each.
            eta: Learning rate forwarded to ``naviePLA``.
            printOut: Forwarded to ``naviePLA``.

        Returns:
            The mean number of updates per run (also printed, together with
            the elapsed wall-clock time).

        Raises:
            ValueError: if ``epochTime`` is not positive.
        """
        # Local import: the module never imported ``time``.
        # ``time.clock`` was removed in Python 3.8; ``perf_counter`` is the
        # replacement for interval timing.
        import time

        if epochTime <= 0:
            raise ValueError('epochTime must be a positive integer')
        startTime = time.perf_counter()
        result = 0
        for _ in range(epochTime):
            self.data.shuffleData()
            result += self.naviePLA(eta, printOut)
        meanUpdates = result / epochTime
        print('mean time is ', meanUpdates)
        print('runnning time is ', time.perf_counter() - startTime)
        return meanUpdates
# Script entry point: average the PLA update count over the default number
# of shuffled runs on the default data file, printing each run's result.
if __name__ == '__main__':
    PLA(dataPath).CyclePLA(printOut=True)
这样看上去好了一些,之后如果继续增加其他功能,代码也更具可维护性。
对于 Pocket 算法也采用同样的模块化写法:
import numpy as np
import pandas as pd
# Data files handed to Pocket in the __main__ block: the first is loaded as
# the training set, the second as the test set.
trainData='../data/train.csv'
testData='../data/test.csv'
class DataSet:
    """In-memory copy of a whitespace-separated perceptron data file.

    Each line of the file holds four features followed by a label.  After
    loading, ``X`` is the feature matrix with a constant bias column
    ``x0 = -1`` prepended and ``Y`` is the label vector.
    """

    def __init__(self, dataPath):
        """Remember ``dataPath`` and load the file immediately."""
        self.dataPath = dataPath
        self.X = []
        self.Y = []
        self._loadData()

    def _loadData(self):
        """Read ``self.dataPath`` into ``self.X`` and ``self.Y``."""
        # pandas opens the path itself, so the extra ``open`` whose handle
        # was never used is dropped.  sep=r'\s+' replaces the deprecated
        # delim_whitespace=True.
        df = pd.read_csv(self.dataPath, header=None, sep=r'\s+',
                         names=['x1', 'x2', 'x3', 'x4', 'y'])
        df.insert(0, 'x0', -1)
        # np.array() copies, so the arrays are always writable for the
        # in-place shuffle (pandas may hand out read-only views).
        self.Y = np.array(df['y'].values)
        self.X = np.array(df.drop(columns='y').values)

    def _shuffleInUnion(self, a, b):
        """Shuffle ``a`` and ``b`` in place with the same permutation.

        The global NumPy random state is captured before shuffling ``a`` and
        restored before shuffling ``b`` so both draws are identical.

        Raises:
            ValueError: if the two sequences differ in length, because the
                replayed random state would then not keep pairs together.
        """
        if len(a) != len(b):
            raise ValueError('a and b must have the same length')
        randomState = np.random.get_state()
        np.random.shuffle(a)
        np.random.set_state(randomState)
        np.random.shuffle(b)

    def shuffleData(self):
        """Shuffle samples and labels together, in place."""
        self._shuffleInUnion(self.X, self.Y)
def sign(w, x):
    """Classify ``x`` with weights ``w``: 1 if ``w . x`` is positive, else -1.

    A score of exactly zero is mapped to -1.
    """
    return 1 if np.sign(np.dot(w, x)) > 0 else -1
class Pocket:
    """Pocket perceptron: PLA that keeps the best weights seen so far.

    Predictions are +1 when ``w . x`` is positive and -1 otherwise (a zero
    score maps to -1).
    """

    def __init__(self, trainPath, testPath):
        """Load the training and test files into ``DataSet`` objects."""
        self.trainDataSet = DataSet(trainPath)
        self.testDataSet = DataSet(testPath)

    @staticmethod
    def _predict(w, X):
        """Return +1/-1 predictions for one sample or a matrix of samples."""
        scores = np.dot(np.asarray(X, dtype=float), np.ravel(w))
        return np.where(scores > 0, 1, -1)

    def iterationPocket(self, iterationNum):
        """Run PLA for at most ``iterationNum`` updates and keep the best weights.

        After every update the training accuracy is measured; the weights
        with the highest accuracy so far are kept "in the pocket".

        Args:
            iterationNum: Maximum number of weight updates.

        Returns:
            The pocket weights as an array of shape ``(1, n_features)``.
        """
        X = np.asarray(self.trainDataSet.X)
        Y = self.trainDataSet.Y
        # Size the weights from the data instead of hard-coding 5, and use
        # plain ``float`` (``np.float`` was removed in NumPy 1.24).
        w = np.zeros((1, X.shape[1]), dtype=float)
        saveW = w.copy()
        # Score the initial weights too, so a later, worse candidate can
        # never replace them.
        accSave = self.evaluate(w, 'train')
        while iterationNum > 0:
            updated = False
            for x, y in zip(X, Y):
                if self._predict(w, x) != y:
                    w += x * y
                    updated = True
                    iterationNum -= 1
                    acc = self.evaluate(w, 'train')
                    if acc > accSave:
                        accSave = acc
                        saveW = w.copy()
                    if iterationNum == 0:
                        break
            # A full pass without a mistake means w separates the training
            # data; without this exit the original looped forever because
            # iterationNum could no longer decrease.
            if not updated:
                break
        print('acc is', accSave)
        # Return the pocket weights.  The original returned the last ``w``,
        # which made ``saveW`` dead code and turned this into plain PLA.
        return saveW

    def pocket(self, epoch, updates=50):
        """Average the test accuracy of the pocket weights over several runs.

        Args:
            epoch: Number of runs; the training data is reshuffled before each.
            updates: Update budget passed to ``iterationPocket`` on each run.

        Returns:
            The mean test accuracy (also printed).

        Raises:
            ValueError: if ``epoch`` is not positive.
        """
        if epoch <= 0:
            raise ValueError('epoch must be a positive integer')
        acc = 0
        for _ in range(epoch):
            self.trainDataSet.shuffleData()
            w = self.iterationPocket(updates)
            acc += self.evaluate(w, 'test')
        meanAcc = acc / epoch
        print(meanAcc)
        return meanAcc

    def evaluate(self, w, type):
        """Return the fraction of samples that ``w`` classifies correctly.

        Args:
            w: Weight vector of shape ``(n_features,)`` or ``(1, n_features)``.
            type: ``'train'`` or ``'test'``, selecting the data set to score.

        Raises:
            ValueError: if ``type`` is neither ``'train'`` nor ``'test'``, or
                if the selected data set is empty.
        """
        if type == 'train':
            dataset = self.trainDataSet
        elif type == 'test':
            dataset = self.testDataSet
        else:
            # The original fell through and returned None for other values.
            raise ValueError("type must be 'train' or 'test', got %r" % (type,))
        total = len(dataset.X)
        if total == 0:
            raise ValueError('cannot evaluate on an empty data set')
        hits = np.count_nonzero(self._predict(w, dataset.X) == np.asarray(dataset.Y))
        return int(hits) / total
# Script entry point: run the pocket experiment for 200 shuffled rounds on
# the configured train/test files.
if __name__ == '__main__':
    Pocket(trainData, testData).pocket(200)