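# Dataset: https://archive.ics.uci.edu/ml/datasets/Adult
# The dataset itself poses a binary-classification task.
# This file defines its own DataLoader, Tester and related classes, kept simple and easy to follow.
# The imported libraries are not needed by the algorithm, which is written entirely by hand; only matplotlib is used, for the final bar chart.
# The tester uses rows 31000 to 32000 of the data; feel free to change that range.
# Each instance of the Party class represents one of the multiple participating parties.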
import torch
import torch.nn.modules
import torch.nn
import numpy as np
from torch.autograd import Variable #torch的基本变量
import torch.nn.functional as F #里面有很多torch的函数
import matplotlib.pyplot as plt
# Experiment-size constants. None of them is referenced by the code visible in
# this file; the script section at the bottom uses its own literals.
NUM_OF_TEST_SET=2000  # presumably the intended number of test rows -- TODO confirm
NUM_OF_PARTY=5  # presumably the intended number of parties -- TODO confirm
PARTY_DATA=6000  # presumably the number of rows handed to each party -- TODO confirm
NUM_OF_ALL_DATA=32561  # presumably the total number of rows in the data file -- TODO confirm
class Party:
    """One participant of the simulated multi-party setup.

    A party owns a list of integer-encoded rows (``Data``) whose last element
    is the class label, and classifies new rows with a hand-written naive
    Bayes estimate built from those rows only.
    """

    def __init__(self):
        self.Data = []  # encoded rows owned by this party; last column is the label
        self.GetClass = {}  # raw value -> code lookup consulted by ConvertFeature
        self.NUMOFFEATURE = 0  # row width (label included), refreshed by Update()
        self.NUMOFDATA = 0  # number of rows, refreshed by Update()

    def Update(self):
        """Refresh the cached row count and row width after ``Data`` changed.

        An empty ``Data`` yields 0 for both counters instead of raising.
        """
        self.NUMOFDATA = len(self.Data)
        self.NUMOFFEATURE = len(self.Data[0]) if self.Data else 0

    def _ClassScore(self, feature, label):
        """Return the unnormalised naive-Bayes score of ``label`` for ``feature``.

        The score is the product of the per-feature match frequencies inside
        the class, multiplied by the class prior.  A class without any
        training row scores 0.
        """
        rows = [row for row in self.Data if row[-1] == label]
        if not rows:
            # No evidence for this class: it must not win by default.
            return 0.0
        wanted = feature[:-1]  # the trailing slot of ``feature`` is the label
        matches = [0] * len(wanted)
        for row in rows:
            for index, (seen, target) in enumerate(zip(row[:-1], wanted)):
                if seen == target:
                    matches[index] += 1
        class_size = len(rows)
        score = 1.0
        # Likelihood: product of P(feature_i == value_i | class).
        for count in matches:
            score *= count / class_size
        # Prior: P(class).  len(self.Data) is used instead of the cached
        # NUMOFDATA so a forgotten Update() cannot skew the result.
        return score * (class_size / len(self.Data))

    def BayesPredict(self, feature):
        """Classify one encoded row with naive Bayes over this party's data.

        Args:
            feature: an encoded row laid out like the rows in ``Data``; its
                last element (the label slot) is ignored.

        Returns:
            1 if the label-1 class (">50K") scores higher, 0 if the label-0
            class ("<=50K") scores higher, and -1 when the two scores differ
            by less than 1e-11 and no decision is made.

        No smoothing is applied, so a feature value never seen within a class
        drives that class's score to 0.
        """
        high = self._ClassScore(feature, 1)
        low = self._ClassScore(feature, 0)
        # The tie test must be symmetric; a one-sided comparison would treat
        # every "low wins" case as undecided.
        if abs(high - low) < 1e-11:
            return -1
        return 1 if high > low else 0

    def ConvertFeature(self, feature):
        """Translate raw values into their codes using ``GetClass``.

        Returns:
            The list of codes, or an empty list as soon as one value has no
            entry in ``GetClass``.
        """
        res = []
        for item in feature:
            if item not in self.GetClass:
                return []
            res.append(self.GetClass[item])
        return res
class DataLoader():
    """Read a comma-separated data file and integer-encode every value."""

    def __init__(self):
        self.RawData = []  # rows as lists of stripped strings
        self.Data = []  # rows as lists of integer codes, built by WashData()
        self.GetClass = {}  # raw string -> integer code
        self.NUMOFFEATURE = 0  # number of columns, set by WashData()
        self.NUMOFDATA = 0  # number of raw rows, set by LoadData()

    def LoadData(self, PATH):
        """Append the rows of the comma-separated file ``PATH`` to ``RawData``.

        Every field is stripped of surrounding whitespace.  Blank lines are
        skipped, so a trailing empty line does not produce a bogus row and a
        file without one does not lose its last record.
        """
        with open(PATH, 'r') as f:
            for line in f:
                if not line.strip():
                    continue
                self.RawData.append([thing.strip() for thing in line.split(',')])
        self.NUMOFDATA = len(self.RawData)

    def ShowRawData(self):
        """Print every raw row."""
        for item in self.RawData:
            print(item)

    def WashData(self):
        """Encode ``RawData`` into integer codes and store the result in ``Data``.

        Codes are handed out per column in first-seen order starting at 0.
        ``GetClass`` is one flat dict shared by all columns, so a string that
        already received a code in an earlier column keeps it; new strings in
        a column skip any code such a string already occupies there, so that
        two different values of one column never share a code.

        ``Data`` is rebuilt from scratch on every call.  With no raw rows the
        method leaves ``Data`` empty and ``NUMOFFEATURE`` at 0.
        """
        if not self.RawData:
            self.Data = []
            self.NUMOFFEATURE = 0
            return
        width = len(self.RawData[0])
        for column in range(width):
            values = [row[column] for row in self.RawData]
            # Codes already taken in this column by strings seen in earlier columns.
            taken = {self.GetClass[value] for value in values if value in self.GetClass}
            next_code = 0
            for value in values:
                if value in self.GetClass:
                    continue
                while next_code in taken:
                    next_code += 1
                self.GetClass[value] = next_code
                next_code += 1
        self.NUMOFFEATURE = width
        self.Data = [[self.GetClass[value] for value in row] for row in self.RawData]

    def ShowData(self):
        """Print every encoded row."""
        for item in self.Data:
            print(item)

    def AssignData(self, Party, start, end):
        """Give ``Party`` the encoded rows ``Data[start:end]``.

        ``Party`` is any object with a ``Data`` attribute.  The receiver's
        cached counters are not refreshed here; call its ``Update()``.
        """
        Party.Data = self.Data[start:end]
class Tester:
    """Holds a set of encoded rows and counts a predictor's mistakes on them."""

    def __init__(self):
        self.Data, self.NUMOFFEATURE, self.NUMOFDATA = [], 0, 0

    def Update(self):
        """Refresh the cached row count and row width from ``Data``."""
        rows = self.Data
        self.NUMOFDATA = len(rows)
        first_row = rows[0]
        self.NUMOFFEATURE = len(first_row)

    def Test(self, A):
        """Return how many rows ``A`` classifies wrongly.

        ``A`` is any object exposing ``BayesPredict(row)``.  Each row is
        passed in full; a prediction of -1 is skipped, and any other
        prediction that differs from the row's last element counts as one
        error.
        """
        outcomes = ((A.BayesPredict(row), row[-1]) for row in self.Data)
        return sum(1 for guess, truth in outcomes if guess != -1 and guess != truth)
if __name__ == "__main__":
    import sys

    # Location of adult.data; an optional first command-line argument
    # overrides the built-in default path.
    FilePath = sys.argv[1] if len(sys.argv) > 1 else r'C:\Users\Lenovo\Desktop\adult.data'

    # Simulation of federated learning: load and encode the whole file once.
    loader = DataLoader()
    loader.LoadData(FilePath)
    loader.WashData()

    # Hand each of the six parties its own consecutive 6000-row slice.
    parties = [Party() for _ in range(6)]
    last = 0
    for party in parties:
        loader.AssignData(party, last, last + 6000)
        party.Update()
        last += 6000

    # Rows 31000-32000 serve as the test set.
    tester = Tester()
    loader.AssignData(tester, 31000, 32000)

    # Measure the error count of one party as its training slice grows by
    # 6000 rows per step.  The slice end has to advance and the party's
    # cached counters have to be refreshed each round; otherwise every bar
    # repeats the same experiment.
    # NOTE(review): the final step's slice (0:36000) extends past row 32000
    # and therefore contains the test rows themselves -- confirm intended.
    y = []
    last = 6000
    for _ in range(6):
        loader.AssignData(parties[0], 0, last)
        parties[0].Update()
        y.append(tester.Test(parties[0]))
        last += 6000

    x = list(range(1, len(y) + 1))
    plt.bar(x, y)
    plt.show()