参考:http://blog.youkuaiyun.com/zjd950131/article/details/8071414
http://blog.sina.com.cn/s/blog_6fb7db430100vdcf.html
很多人都喜欢用“啤酒与尿布”这个例子来介绍机器学习,主要是想说明 Apriori 算法在挖掘对象之间关联关系时的重要作用。
这个算法很简单,不涉及复杂的数学知识,只需要一点逻辑推理。此外还有改进版的 Apriori 算法,有时间我也会实现一下。
下面简单实现了一下 Apriori,直接上 Python 代码和运行结果。
#-*- coding:utf-8 -*-
'''
Created on Aug 25, 2013
@author: blacklaw
@ref: http://blog.sina.com.cn/s/blog_6fb7db430100vdcf.html
'''
# Sample transactions: every inner list holds the goods ids that were
# bought together in a single trade.
DATA = [
    [1, 2, 5],
    [2, 4],
    [2, 3],
    [1, 2, 4],
    [1, 3],
    [2, 3],
    [1, 3],
    [1, 2, 3, 5],
    [1, 2, 3],
]
def get_init_cand_list(data):
    """Build the initial list of 1-item candidates from the transactions.

    Args:
        data: iterable of transactions; each transaction is an iterable of
            hashable, mutually comparable goods ids.

    Returns:
        A sorted list of 1-tuples, one for each distinct goods id that
        occurs anywhere in ``data``.
    """
    # A set deduplicates in O(1) per item instead of rescanning a list
    # for every good.
    unique_items = {(good,) for grade in data for good in grade}
    return sorted(unique_items)
def get_frequence_dict(data, cand_list):
    """Count how many transactions contain each candidate itemset.

    Args:
        data: iterable of transactions; each transaction is an iterable of
            hashable goods ids.
        cand_list: iterable of candidate itemsets (tuples of goods ids).

    Returns:
        A dict mapping every candidate in ``cand_list`` to the number of
        transactions in ``data`` that contain all of its goods.
    """
    # Build each transaction's set once up front rather than once per
    # (candidate, transaction) pair; this also makes a one-shot iterable
    # usable for every candidate, not just the first one.
    transactions = [set(grade) for grade in data]
    frequence_dict = {}
    for cand in cand_list:
        cand_set = set(cand)
        frequence_dict[cand] = sum(
            1 for goods in transactions if cand_set.issubset(goods)
        )
    return frequence_dict
def del_unfreque_item(freq_dict, min_count=2):
    """Remove infrequent itemsets from ``freq_dict`` in place.

    Args:
        freq_dict: dict mapping itemset -> occurrence count; it is mutated.
        min_count: minimum count an itemset needs in order to be kept.
            The default of 2 drops every itemset whose count is 0 or 1.

    Returns:
        None. The surviving entries remain in ``freq_dict``.
    """
    # Collect the keys to drop before deleting anything: removing entries
    # while iterating over the dict raises RuntimeError on Python 3.
    infrequent = [item for item, count in freq_dict.items() if count < min_count]
    for item in infrequent:
        del freq_dict[item]
def candiate_combine(freq_dict):
    """Join frequent k-itemsets pairwise into (k+1)-item candidates.

    For example ``(1, 2)`` and ``(2, 3)`` combine into ``(1, 2, 3)``.

    Args:
        freq_dict: dict whose keys are the frequent itemsets (tuples of
            goods ids); the keys are expected to all have the same
            length k.

    Returns:
        A sorted list of distinct candidate itemsets. Each candidate is a
        tuple with its goods in ascending order and exactly one more item
        than the itemsets it was built from. Returns ``[]`` when fewer
        than two itemsets are given or no pair can be joined.
    """
    # list() so that slicing also works with Python 3 dict views.
    candiate_list = list(freq_dict)
    merged = set()
    for i, candiate in enumerate(candiate_list):
        for other in candiate_list[i + 1:]:
            union = set(candiate) | set(other)
            # Two k-itemsets only form a valid (k+1)-candidate when they
            # share k-1 items; larger unions would skip a level.
            if len(union) == len(candiate) + 1:
                # Sort so that one itemset always maps to one tuple;
                # iteration order of a set is not guaranteed.
                merged.add(tuple(sorted(union)))
    return sorted(merged)
if __name__ == "__main__":
    # Level-wise search: start from single goods, then repeatedly count,
    # prune and combine until no larger candidate can be formed.
    cand_list = get_init_cand_list(DATA)
    time = 0  # current pass number, i.e. the itemset size being reported
    while True:
        time += 1
        freq_dict = get_frequence_dict(DATA, cand_list)
        del_unfreque_item(freq_dict)
        # Single-argument print(...) produces the same text on Python 2
        # and also runs on Python 3.
        print('%d good(s) frequently together: %s' % (time, freq_dict))
        cand_list = candiate_combine(freq_dict)
        if not cand_list:
            break
结果:
1 good(s) frequently together: {(2,): 7, (5,): 2, (3,): 6, (1,): 6, (4,): 2}
2 good(s) frequently together: {(1, 2): 4, (1, 3): 4, (1, 5): 2, (2, 3): 4, (2, 5): 2, (2, 4): 2}
3 good(s) frequently together: {(1, 2, 3): 2, (1, 2, 5): 2}
4 good(s) frequently together: {}