This post records the various problems I ran into while implementing the algorithms, with the full source code attached at the end.
It draws on the following posts:
FP-growth algorithm reference:
https://blog.youkuaiyun.com/sinat_17196995/article/details/71191869
Apriori algorithm reference:
https://blog.youkuaiyun.com/qq_32126633/article/details/78351726
Problem 1
freqs = [v[0] for v in sorted(headerTable.items(), key=lambda p:p[1][0])] # 根据频繁项的总频次排序
AttributeError: 'NoneType' object has no attribute 'items'
Under Python 3.x, this error means the loop (or the .items() call) is running over a value that is None: in the line above, headerTable never received a real dictionary (for example, create_fptree returned None, None because nothing met the minimum support), so calling .items() on it fails.
Beyond that, you will also frequently run into AttributeError: 'NoneType' object has no attribute 'iteritems'.
That is because iteritems() no longer exists in Python 3.x; change the code so that every iteritems() call becomes items().
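A quick sketch of both fixes (the values here are made up for illustration; create_fptree returning None, None mirrors the source code below):
headerTable = None  # what you get when create_fptree finds nothing that meets the minimum support
if headerTable is not None:
    # Python 3: dict.items() replaces the Python 2 dict.iteritems()
    freqs = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[1][0])]
else:
    freqs = []  # nothing frequent: handle the empty case instead of crashing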
Problem 2
TypeError: list indices must be integers or slices, not frozenset
This usually comes from dictionary handling: a frozenset key of the dictionary ends up being used where a list index is expected, so work with the dictionary's keys as an explicit list instead.
For example:
support_data = {......}
for j in sorted(support_data):
should be changed to
for j in sorted(list(support_data)):
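A quick sketch of what the error message actually means (made-up counts): a frozenset is a perfectly good dictionary key, but it cannot index a list, so make sure the thing you index with it really is the dictionary and iterate over an explicit list of the keys:
support_data = {frozenset(["a"]): 3, frozenset(["a", "b"]): 2}  # frequent itemset -> count
for j in sorted(list(support_data)):  # j is a frozenset key
    print(j, support_data[j])         # looking up the dict by a frozenset key is fine
# some_list[j] would raise exactly this TypeError: a frozenset cannot be a list index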
Problem 3
RuntimeError: dictionary changed size during iteration
This happens when entries are added or deleted while you are still iterating over the dictionary. For example, change
for k in headerTable.keys():
to
for k in list(headerTable):
so that the loop runs over a snapshot of the keys rather than the live dictionary view.
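A quick sketch of why the copy helps (made-up counts): deleting entries while looping over the live key view raises the RuntimeError in Python 3, while looping over a snapshot of the keys does not:
headerTable = {"a": 5, "b": 1, "c": 3}
min_count = 2
for k in list(headerTable):        # snapshot of the keys; headerTable.keys() here would crash on del
    if headerTable[k] < min_count:
        del headerTable[k]
print(headerTable)                 # {'a': 5, 'c': 3}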
Problem 4
AttributeError: 'node' object has no attribute 'issubset'
I did not record the exact traceback in time, but it was roughly as above, complaining that no tree nodes had been produced. My fix was to adjust the minimum support: here the minimum support had not been written as a decimal fraction, so when the data set was transformed no frequent itemsets were produced, and therefore no tree nodes could be built.
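The support check in create_fptree divides each item count by the number of transactions, so min_support has to be a fraction between 0 and 1; a rough sketch with made-up numbers:
item_count = {"a": 40, "b": 5}
t_num = 100.0        # number of (compressed) transactions
min_support = 0.2    # as a fraction: "a" (0.4) passes, "b" (0.05) does not
# min_support = 20   # as an absolute count, 40/100 >= 20 can never hold,
#                    # so the header table stays empty and no tree nodes get built
headerTable = {k: v for k, v in item_count.items() if v / t_num >= min_support}
print(headerTable)   # {'a': 40}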
Problem 5
TypeError: unorderable types: treeNode() < treeNode()
The cause is that Python 3 no longer allows ordering comparisons between values that do not define them (for example, comparing integers with strings, or comparing two treeNode instances when they end up inside the sort key).
For example:
order_item = [v[0] for v in sorted(localD.items(), key=lambda x:x[1], reverse=True)]
needs to be changed in the function to
order_item = [v[0] for v in sorted(localD.items(), key=lambda x:str(x[1]), reverse=True)]
Adding str() converts the sort key to a string so that the entries can be ordered.
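A quick sketch of how the error can arise (TreeNode here is a simplified stand-in for the treeNode class): when the value being sorted still contains the node and two counts tie, Python 3 has to compare the node objects themselves, which it refuses to do; str() sidesteps the comparison:
class TreeNode:  # simplified stand-in, no ordering defined
    pass
localD = {"a": [3, TreeNode()], "b": [3, TreeNode()]}  # equal counts force the nodes to be compared
# sorted(localD.items(), key=lambda x: x[1], reverse=True)  # raises the TypeError in Python 3
order_item = [v[0] for v in sorted(localD.items(), key=lambda x: str(x[1]), reverse=True)]
print(order_item)  # order is now decided by the string form of [count, node]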
Problem 6
On the use of add versus append: add is a set method, while append is a list method (a short sketch of the distinction follows at the end of this problem).
However, the program often pre-creates the frequent-itemset container L with a fixed length and structure; if you then fill L with append, the pre-created slots are left as empty sets.
For example:
L=[set() for i in range(size)]#用于保存频繁项
flag=[True for _ in range(len(data_dic))]
C1=self.create_C1(data_dic)
L1=self.generate_lk_by_ck(data_dic, C1, min_support, support_data,flag)
L.append(L1)
Lksub=L1.copy() #初始时L1
k = 2
while (len(L[k-2]) > 0 and k < size): # 创造Ck
Ck = self.create_ck(L[k-2],k)
Lk = self.generate_lk_by_ck(data_dic, Ck, min_support, support_data,flag)
L[k-1]=Lk
k += 1
This produces something like [empty] [values] [values].
Change it to:
L=[set() for i in range(size)]#用于保存频繁项
flag=[True for _ in range(len(data_dic))]
C1=self.create_C1(data_dic)
L1=self.generate_lk_by_ck(data_dic, C1, min_support, support_data,flag)
L[0]=L1
Lksub=L1.copy() #初始时L1
k = 2
while (len(L[k-2]) > 0 and k < size): # 创造Ck
Ck = self.create_ck(L[k-2],k)
Lk = self.generate_lk_by_ck(data_dic, Ck, min_support, support_data,flag)
L[k-1]=Lk
k += 1
Assigning to the exact index instead of appending reliably avoids this situation.
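For reference, a quick sketch of the add/append distinction itself (made-up data):
s = set()
s.add(frozenset(["a"]))           # add is a set method
L = [set() for _ in range(3)]     # pre-sized list of placeholder sets
L.append({"x"})                   # append grows the list: [set(), set(), set(), {'x'}]
L[0] = {"x"}                      # index assignment fills the intended slot instead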
Problem 7
Path-loading problem:
current_path=os.getcwd()
If files cannot be found when loading them this way, try the following instead:
current_path = os.path.abspath(os.path.dirname(__file__))
You can search online for the details of this issue.
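A quick sketch of the difference, assuming the dataset folder sits next to the script as in the code below:
import os
cwd_path = os.path.join(os.getcwd(), "dataset", "药方.xls")  # getcwd() is where the interpreter was launched, not necessarily the script's folder
script_dir = os.path.abspath(os.path.dirname(__file__))      # always the directory containing this script
file_path = os.path.join(script_dir, "dataset", "药方.xls")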
Problem 8
An editor issue: code pasted from elsewhere raises errors for no apparent reason because different editors use different indentation characters (for example, tabs versus spaces).
Keeping the indentation style consistent within and between the functions resolves the problem.
Summary
Python version and syntax differences
Understanding the set() and frozenset() functions (see the sketch below)
Conversions and assignments between numbers, lists, and dictionaries
Choosing the minimum confidence and minimum support
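On the set()/frozenset() point: both implementations use frozenset because, unlike set, it is hashable and can therefore serve as a dictionary key (as in data_compress and support_data). A quick sketch:
transaction = ["b", "a", "a"]
key = frozenset(transaction)        # immutable and hashable; order and duplicates do not matter
counts = {key: 1}                   # set(transaction) could not be used as a key here
counts[frozenset(["a", "b"])] += 1  # the same items always map to the same key
print(counts)                       # {frozenset({'a', 'b'}): 2}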
FP_growth algorithm source code:
#-*- coding: utf-8 -*-
import os
import time
from tqdm import tqdm
def load_data(path):#根据路径加载数据集
ans=[]#将数据保存到该数组
if path.split(".")[-1]=="xls":#若路径为药方.xls
from xlrd import open_workbook
import xlwt
workbook=open_workbook(path)
sheet=workbook.sheet_by_index(0)#读取第一个sheet
for i in range(1,sheet.nrows):#忽视header,从第二行开始读数据,第一列为处方ID,第二列为药品清单
temp=sheet.row_values(i)[1].split(";")[:-1]#取该行数据的第二列并以“;”分割为数组
if len(temp)==0: continue
temp=[j.split(":")[0] for j in temp]#将药品后跟着的药品用量去掉
temp=list(set(temp))#去重,排序
temp.sort()
ans.append(temp)#将处理好的数据添加到数组
elif path.split(".")[-1]=="csv":
import csv
with open(path,"r") as f:
reader=csv.reader(f)
for row in reader:
row=list(set(row))#去重,排序
row.sort()
ans.append(row)#将添加好的数据添加到数组
return ans#返回处理好的数据集,为二维数组
def save_rule(rule,path):#保存结果到txt文件
with open(path,"w") as f:
f.write("index confidence"+" rules\n")
index=1
for item in rule:
s=" {:<4d} {:.3f} {}=>{}\n".format(index,item[2],str(list(item[0])),str(list(item[1])))
index+=1
f.write(s)
f.close()
print("result saved,path is:{}".format(path))
#保存频繁项集
def save_Frequent_itemsets(support_data,size,path):
with open(path,"w") as f:
f.write("频繁项集 项集总个数 items\n")
for i in range(size):
data={}
for j in sorted(list(support_data)):
if len(j) == i+1:
data[j]=support_data[j]
data=sorted(list(data.items()),key=lambda x:x[1],reverse=True)
f.write("Frequent item{}: {}\n {}\n".format(i+1,len(data),data))
f.close()
print("result saved,path is:{}".format(path))
#判断输入值是否正确
def panduan():
paduan=True
while(paduan):
n=input("请输入最大频繁项(正整数):")#conda config --set auto_activate_base false
if n.isdigit():#判断n中只有数字
return n
else:
print("输入格式错误,请正确输入最大频繁项集!\n")
panduan()
paduan=False
class Node:
def __init__(self, node_name,count,parentNode):
self.name = node_name
self.count = count
self.nodeLink = None#根据nideLink可以找到整棵树中所有nodename一样的节点
self.parent = parentNode#父亲节点
self.children = {}#子节点{节点名字:节点地址}
class Fp_growth_plus():
def data_compress(self,data_set):#数据处理,数据压缩
data_dic={}
for i in data_set:
if frozenset(i) not in data_dic:
data_dic[frozenset(i)]=1
else:
data_dic[frozenset(i)]+=1
return data_dic
def update_header(self,node, targetNode):#更新headertable中的node节点形成的链表
while node.nodeLink != None:
node = node.nodeLink
node.nodeLink = targetNode
def update_fptree(self,items, count,node, headerTable):#用于更新fptree
if items[0] in node.children:
# 判断items的第一个结点是否已作为子结点
node.children[items[0]].count+=count
else:
# 创建新的分支
node.children[items[0]] = Node(items[0],count,node)
# 更新相应频繁项集的链表,往后添加
if headerTable[items[0]][1] == None:
headerTable[items[0]][1] = node.children[items[0]]
else:
self.update_header(headerTable[items[0]][1], node.children[items[0]])
# 递归
if len(items) > 1:
self.update_fptree(items[1:],count, node.children[items[0]], headerTable)
def create_fptree(self,data_dic, min_support,flag=False):#建树主函数
'''
根据data_dic创建fp树
header_table结构为
{"nodename":[num,node],..} 根据node.nodelink可以找到整个树中的所有nodename
'''
item_count = {}#统计各项出现次数
for t in data_dic:#第一次遍历,得到频繁一项集
for item in t:
if item not in item_count:
item_count[item]=data_dic[t]
else:
item_count[item]+=data_dic[t]
headerTable={}
t_num=float(len(data_dic))
for k in item_count:#剔除不满足最小支持度的项
if (item_count[k]/t_num) >= min_support:
headerTable[k]=item_count[k]
freqItemSet = set(headerTable)#满足最小支持度的频繁项集
if len(freqItemSet) == 0:
return None, None
for k in headerTable:
headerTable[k] = [headerTable[k], None] # element: [count, node]
tree_header = Node('head node',1,None)
if flag:
ite=tqdm(data_dic)
else:
ite=data_dic
for t in ite:#第二次遍历,建树
localD = {}
for item in t:
if item in freqItemSet: # 过滤,只取该样本中满足最小支持度的频繁项
localD[item] = headerTable[item][0] # element : count
if len(localD) > 0:
# 根据全局频数从大到小对单样本排序
order_item = [v[0] for v in sorted(localD.items(), key=lambda x:x[1], reverse=True)]
# 用过滤且排序后的样本更新树
self.update_fptree(order_item,data_dic[t],tree_header, headerTable)
return tree_header, headerTable
def find_path(self,node, nodepath):
'''
递归将node的父节点添加到路径
'''
if node.parent != None:
nodepath.append(node.parent.name)
self.find_path(node.parent, nodepath)
def find_cond_pattern_base(self,node_name, headerTable):
'''
根据节点名字,找出所有条件模式基
'''
treeNode = headerTable[node_name][1]
cond_pat_base = {}#保存所有条件模式基
while treeNode != None:
nodepath = []
self.find_path(treeNode, nodepath)
if len(nodepath) > 1:
cond_pat_base[frozenset(nodepath[:-1])] = treeNode.count
treeNode = treeNode.nodeLink
return cond_pat_base
#挖掘频繁项集
def create_cond_fptree(self,headerTable, min_support, temp, freq_items,support_data):
# 最开始的频繁项集是headerTable中的各元素
freqs = [v[0] for v in sorted(headerTable.items(), key=lambda p:p[1][0])] # 根据频繁项的总频次排序
for freq in freqs: # 对每个频繁项
freq_set = temp.copy()
freq_set.add(freq)
freq_items.add(frozenset(freq_set))
if frozenset(freq_set) not in support_data:#检查该频繁项是否在support_data中
support_data[frozenset(freq_set)]=headerTable[freq][0]
else:
support_data[frozenset(freq_set)]+=headerTable[freq][0]
cond_pat_base = self.find_cond_pattern_base(freq, headerTable)#寻找到所有条件模式基
#创建条件模式树
cond_tree, cur_headtable = self.create_fptree(cond_pat_base, min_support)
if cur_headtable != None:
self.create_cond_fptree(cur_headtable, min_support, freq_set, freq_items,support_data) # 递归挖掘条件FP树
def generate_L(self,data_set,min_support,size,Frequent_itemsets_save_path):
data_dic=self.data_compress(data_set)
freqItemSet=set()
support_data={}
tree_header,headerTable=self.create_fptree(data_dic,min_support,flag=True)#创建数据集的fptree
#创建各频繁一项的fptree,并挖掘频繁项并保存支持度计数
#self.mineFPtree(tree_header, headerTable, n, set([]), freqItemSet)
self.create_cond_fptree(headerTable, min_support, set(), freqItemSet,support_data)
L=[set() for i in range(size)]#用于保存频繁项
# print("freqItems++++++++++++++++++++++++++++++")
# print(freqItemSet)
# print("freqItems++++++++++++++++++++++++++++++")
t_num = float(len(data_dic))
for k in list(support_data.keys()): #删除未达到最小频繁度的数据
if (support_data[k]/t_num) < min_support:
del (support_data[k])
# print("support_data++++++++++++++++++++++++++++++")
# print(support_data)
# print("support_data++++++++++++++++++++++++++++++")
# L=list(support_data.keys())
for i in range(0, len(L)):
data={}
for j in sorted(list(support_data)):
if len(j) == i+1:
data[j]=support_data[j]
L[i]=list(data.keys())
print("frequent item {}:{}".format(i+1,len(L[i])))
# print("L[i]++++++++++++++++++++++++++++++")
# print(L[i])
# print("L[i]++++++++++++++++++++++++++++++")
save_Frequent_itemsets(support_data,size,Frequent_itemsets_save_path)
#return L,support_data
return L,support_data
def generate_R(self,data_set, min_support, min_conf,size,Frequent_itemsets_save_path):
L,support_data=self.generate_L(data_set,min_support,size,Frequent_itemsets_save_path)
rule_list = []
sub_set_list = []
for i in range(0, len(L)):
for freq_set in L[i]:
for sub_set in sub_set_list:
if sub_set.issubset(freq_set) and freq_set-sub_set in support_data:#and freq_set-sub_set in support_data
conf = support_data[freq_set] / support_data[freq_set - sub_set]
big_rule = (freq_set - sub_set, sub_set, conf)
if conf >= min_conf and big_rule not in rule_list:
# print freq_set-sub_set, " => ", sub_set, "conf: ", conf
rule_list.append(big_rule)
sub_set_list.append(freq_set)
rule_list = sorted(rule_list,key=lambda x:(x[2]),reverse=True)
return rule_list
if __name__=="__main__":
#filename="groceries.csv"
filename="药方.xls"
min_support=0.2#最小支持度
min_conf=0.5#最小置信度
spend_time=[]
current_path = os.path.abspath(os.path.dirname(__file__))
if not os.path.exists(current_path+"\\log"):
os.mkdir("log")
path=current_path+"\\dataset\\"+filename
save_path=current_path+"\\log\\"+filename.split(".")[0]+"_fpgrowth_plus.txt"
#频繁项集保存路径
Frequent_itemsets_save_path=current_path+"\\log\\"+filename.split(".")[0]+"_FPFrequent_itemsets.txt"
data_set=load_data(path)
fp=Fp_growth_plus()
n=panduan()
#rule_list =fp.generate_L(data_set,min_support,int(n),Frequent_itemsets_save_path)
rule_list =fp.generate_R(data_set, min_support, min_conf,int(n),Frequent_itemsets_save_path)
save_rule(rule_list,save_path)
Apriori algorithm source code:
#-*- coding: utf-8 -*-
import os
import time
from tqdm import tqdm
def load_data(path):#根据路径加载数据集
ans=[]#将数据保存到该数组
if path.split(".")[-1]=="xls":#若路径为药方.xls
from xlrd import open_workbook
import xlwt
workbook=open_workbook(path)
sheet=workbook.sheet_by_index(0)#读取第一个sheet
for i in range(1,sheet.nrows):#忽视header,从第二行开始读数据,第一列为处方ID,第二列为药品清单
temp=sheet.row_values(i)[1].split(";")[:-1]#取该行数据的第二列并以“;”分割为数组
if len(temp)==0: continue
temp=[j.split(":")[0] for j in temp]#将药品后跟着的药品用量去掉
temp=list(set(temp))#去重,排序
temp.sort()
ans.append(temp)#将处理好的数据添加到数组
elif path.split(".")[-1]=="csv":
import csv
with open(path,"r") as f:
reader=csv.reader(f)
for row in reader:
row=list(set(row))#去重,排序
row.sort()
ans.append(row)#将添加好的数据添加到数组
return ans#返回处理好的数据集,为二维数组
def save_rule(rule,path):#保存结果到txt文件
with open(path,"w") as f:
f.write("index confidence"+" rules\n")
index=1
for item in rule:
s=" {:<4d} {:.3f} {}=>{}\n".format(index,item[2],str(list(item[0])),str(list(item[1])))
index+=1
f.write(s)
f.close()
print("result saved,path is:{}".format(path))
#保存频繁项集
def save_Frequent_itemsets(L,support_data,path):
with open(path,"w") as f:
f.write("频繁项集 项集总个数 items\n")
length=0
for i in range(len(L)):
f.write("Frequent item{}: {}\n {}\n".format(i+1,len(L[i]),sorted(list(support_data.items())[length:length+len(L[i])],key=lambda x:x[1],reverse=True)))
length+=len(L[i])
f.close()
print("result saved,path is:{}".format(path))
#判断输入值是否正确
def panduan():
paduan=True
while(paduan):
n=input("请输入最大频繁项(正整数):")#conda config --set auto_activate_base false
if n.isdigit():#判断n中只有数字
return n
else:
print("输入格式错误,请正确输入最大频繁项集!\n")
panduan()
paduan=False
class Apriori_plus():
#数据集压缩
def data_compress(self,data_set):
ans={}
for i in data_set:
if frozenset(i) not in ans:
ans[frozenset(i)]=1
else:
ans[frozenset(i)]+=1
return ans
##散列技术在此实现
##基于散列技术一次遍历数据,即可生成l1,l2,l3
##不生成l4是因为迭代生成候选项时导致可能性太多,数据量大时占用内存太大
def create_C1(self,data_dic):#基于散列技术一次遍历数据集生成L1,L2,L3
L = set()
for t in data_dic:
for item in t:
item_set = frozenset([item])
L.add(item_set)
return L
def increase_ck_item(self,count,item,temp,l,size,index,item_count):#递归生成候选项(dfs方法)
if len(temp)==size:
ck_item=frozenset(temp)
if ck_item not in item_count:
item_count[ck_item]=count
else:
item_count[ck_item]+=count
return
for i in range(index,l):
temp.append(item[i])
self.increase_ck_item(count,item,temp,l,size,i+1,item_count)
temp.pop()
def create_ck(self,Lk_1,size):#通过频繁项集Lk-1创建ck候选项集
Ck = set()
l = len(Lk_1)
lk_list = list(Lk_1)
for i in range(l):
for j in range(i+1, l):#两次遍历Lk-1,找出前n-1个元素相同的项
l1 = list(lk_list[i])
l2 = list(lk_list[j])
l1.sort()
l2.sort()
if l1[0:size-2] == l2[0:size-2]:#只有最后一项不同时,生成下一候选项
Ck_item = lk_list[i] | lk_list[j]
if self.has_infrequent_subset(Ck_item, Lk_1):#检查该候选项的子集是否都在Lk-1中
Ck.add(Ck_item)
return Ck
def has_infrequent_subset(self,Ck_item, Lk_1):#检查候选项Ck_item的子集是否都在Lk-1中
for item in Ck_item:
sub_Ck = Ck_item - frozenset([item])
if sub_Ck not in Lk_1:
return False
return True
def generate_lk_by_ck(self,data_dic,ck,min_support,support_data,flag):#通过候选项ck生成lk,并将各频繁项的支持度保存到support_data字典中
item_count={}#用于标记各候选项在数据集出现的次数
Lk = set()
index=-1
for t in tqdm(data_dic):
index+=1
temp_flag=False
if not flag[index]:continue
for item in ck:
if item.issubset(t):
temp_flag=True
if item not in item_count:
item_count[item] = data_dic[t]
else:
item_count[item] += data_dic[t]
flag[index]=temp_flag
t_num = float(len(data_dic))
for item in item_count:#将满足支持度的候选项添加到频繁项集中
if (item_count[item]/t_num) >= min_support:
Lk.add(item)
support_data[item] = item_count[item]
return Lk
def generate_L(self,data_set, min_support,size,Frequent_itemsets_save_path):#用于生成所有频繁项集的主函数,k为最大频繁项的大小
data_dic=self.data_compress(data_set)
support_data = {} #用于保存各频繁项的支持度
L=[set() for i in range(size)]#用于保存频繁项
flag=[True for _ in range(len(data_dic))]
C1=self.create_C1(data_dic)
L1=self.generate_lk_by_ck(data_dic, C1, min_support, support_data,flag)
L[0]=L1
Lksub=L1.copy() #初始时L1
k = 2
while (len(L[k-2]) > 0 and k < size): # 创造Ck
Ck = self.create_ck(L[k-2],k)
Lk = self.generate_lk_by_ck(data_dic, Ck, min_support, support_data,flag)
L[k-1]=Lk
k += 1
# for i in range(1,size):
# Ci = self.create_ck(Lksub, i) #根据Lk-1生成Ck
# Li = self.generate_lk_by_ck(data_dic, Ci, min_support, support_data,flag) #根据Ck生成Lk
# Lksub = Li.copy() #下次迭代时Lk-1=Lk
# L[i]=Lksub
for i in range(len(L)):
print("frequent item {}:{}".format(i+1,len(L[i])))
# print("L[i]++++++++++++++++++++++++++++++")
# print(L[i])
# print("L[i]++++++++++++++++++++++++++++++")
save_Frequent_itemsets(L,support_data,Frequent_itemsets_save_path)
return L, support_data
def generate_R(self,data_set, min_support, min_conf,size,Frequent_itemsets_save_path):
L,support_data=self.generate_L(data_set,min_support,size,Frequent_itemsets_save_path)#根据频繁项集和支持度生成关联规则
rule_list = []#保存满足置信度的规则
sub_set_list = []#该数组保存检查过的频繁项
for i in range(0, len(L)):
for freq_set in L[i]:#遍历Lk
for sub_set in sub_set_list:#sub_set_list中保存的是L1到Lk-1
if sub_set.issubset(freq_set):#检查sub_set是否是freq_set的子集
#检查置信度是否满足要求,是则添加到规则
conf = support_data[freq_set] / support_data[freq_set - sub_set]
big_rule = (freq_set - sub_set, sub_set, conf)
if conf >= min_conf and big_rule not in rule_list:
rule_list.append(big_rule)
sub_set_list.append(freq_set)
rule_list = sorted(rule_list,key=lambda x:(x[2]),reverse=True)
return rule_list
if __name__=="__main__":
##config
#filename="test.xls"
filename="药方.xls"
current_path = os.path.abspath(os.path.dirname(__file__))
if not os.path.exists(current_path+"\\log"):
os.mkdir("log")
path=current_path+"\\dataset\\"+filename
save_path=current_path+"\\log\\"+filename.split(".")[0]+"_apriori_plus.txt"
#频繁项集保存路径
Frequent_itemsets_save_path=current_path+"\\log\\"+filename.split(".")[0]+"_Frequent_itemsets.txt"
data=load_data(path)
apriori_plus=Apriori_plus()
n=panduan()
rule_list=apriori_plus.generate_R(data,min_support=0.2,min_conf=0.5,size=int(n),Frequent_itemsets_save_path=Frequent_itemsets_save_path)
#rule_list=apriori_plus.generate_R(data,min_support=500,min_conf=0.95,size=int(n),Frequent_itemsets_save_path=Frequent_itemsets_save_path)
save_rule(rule_list,save_path)