跟着《推荐系统实践》用MovieLens数据集做了一下基于用户的协同过滤算法。
#coding: utf-8
import random
import math
# Number of most-similar users whose ratings Recommend aggregates
# (used as the slice bound on the sorted neighbour list).
K = 3
def SplitData(file, M, k, seed):
    """Split a ``user::item::rating`` ratings file into test and train sets.

    For every well-formed record one ``random.randint(0, M)`` draw is made;
    the record goes to the test set when the draw equals ``k`` and to the
    train set otherwise.

    Args:
        file: Path of the ratings file. Fields are separated by ``"::"``;
            the first three are user, item and rating, any further fields
            are ignored.
        M: Inclusive upper bound of the random draw.
        k: Draw value that sends a record to the test set.
        seed: Seed passed to ``random.seed`` so the split is reproducible.

    Returns:
        A ``(test, train)`` tuple of dicts, each mapping
        ``user -> {item: rating}``. User, item and rating are kept as the
        strings read from the file.
    """
    test = {}
    train = {}
    random.seed(seed)
    with open(file, 'r') as rf:
        # Iterate the file object directly so the whole file is never
        # materialised in memory the way readlines() would.
        for line in rf:
            fields = line.rstrip("\r\n").split("::")
            if len(fields) < 3:
                # Blank or truncated line: skip it (without consuming a
                # random draw) instead of failing with IndexError.
                continue
            user, item, rating = fields[:3]
            # NOTE: randint is inclusive on both ends, so there are M + 1
            # possible draws and roughly 1 / (M + 1) of records land in test.
            bucket = test if random.randint(0, M) == k else train
            bucket.setdefault(user, {})[item] = rating
    return test, train
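"""
Build the item -> users inverted table, then derive user-user similarity from it.
W stores the similarity between user u and user v: key1 is user u, key2 is user v, value is the similarity.
"""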
def UserSimilarity(train):
    """Compute user-user similarity from the items users have in common.

    Args:
        train: Dict mapping ``user -> {item: rating}``. Only the item keys
            are used here; the rating values are ignored.

    Returns:
        Dict ``W`` where ``W[u][v]`` is
        ``(items rated by both u and v) / sqrt(items rated by u * items
        rated by v)`` for every ordered pair of distinct users sharing at
        least one item. A user who shares no item with anyone has no entry.

    Side effects:
        Prints ``len(W)`` and then up to the first 11 entries of ``W``.
    """
    # Inverted table: item -> set of users who rated it.
    item_users = {}
    for u, items in train.items():
        for i in items:
            item_users.setdefault(i, set()).add(u)

    # N[u]: number of items rated by u.
    # C[u][v]: number of items rated by both u and v (u != v); pairs with
    # no item in common are never recorded.
    C = {}
    N = {}
    for users in item_users.values():
        for u in users:
            N[u] = N.get(u, 0) + 1
            for v in users:
                if u == v:
                    continue
                co_counts = C.setdefault(u, {})
                co_counts[v] = co_counts.get(v, 0) + 1

    # Normalise each co-occurrence count by the geometric mean of the two
    # users' item counts. C never holds u == v, so no further filtering.
    W = {}
    for u, related_users in C.items():
        W[u] = {
            v: cuv / math.sqrt(N[u] * N[v])
            for v, cuv in related_users.items()
        }

    # Diagnostic dump: matrix size plus a small sample of rows.
    print(len(W))
    for shown, (u, related_users) in enumerate(W.items()):
        if shown > 10:
            break
        print(u)
        print(related_users)
    return W
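"""
rvi is the rating that similar user v gave to movie i.
Movies are ranked by the similarity-weighted ratings given by the users most similar to the target user.
"""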
def Recommend(user, train, W, k=None):
    """Score unseen items for ``user`` from the ratings of similar users.

    The ``k`` users with the highest similarity to ``user`` are selected;
    every item they rated that ``user`` has not rated receives the sum of
    ``similarity * rating`` over those users.

    Args:
        user: Key of the target user.
        train: Dict mapping ``user -> {item: rating}``. Ratings may be
            numbers or numeric strings; they are converted with ``float``.
        W: Dict mapping ``user -> {other_user: similarity}``.
        k: Number of most-similar users to consult. Defaults to the
            module-level ``K`` when omitted.

    Returns:
        Dict mapping ``item -> score``. Empty when ``user`` has no entry in
        ``W`` (for example an unknown user).

    Side effects:
        Prints the user's similarity row and the resulting scores.
    """
    if k is None:
        k = K
    # A user absent from train/W has nothing to filter and no neighbours,
    # which yields an empty recommendation instead of a KeyError.
    interacted_items = train.get(user, {})
    neighbours = W.get(user, {})
    print(neighbours)

    rank = {}
    most_similar = sorted(
        neighbours.items(), key=lambda pair: pair[1], reverse=True
    )[:k]
    for v, wuv in most_similar:
        for i, rvi in train[v].items():
            if i in interacted_items:
                # Skip items the user has already rated.
                continue
            # float() because ratings parsed from the data file are strings.
            rank[i] = rank.get(i, 0.0) + wuv * float(rvi)
    print(rank)
    return rank
def main():
    """Run the demo: split the ratings file, build similarities, recommend.

    Reads the ratings file at the hard-coded relative path, prints the
    number of users in the train split, computes the user similarity
    matrix and requests recommendations for one hard-coded user id.
    """
    rate_file = "GroupLens_MovieLens/ml-1m/ml-1m/ratings.dat"
    # SplitData(file, M, k, seed): fixed seed keeps the split reproducible.
    test, train = SplitData(rate_file, 8, 4, 60)
    print(len(train))
    W = UserSimilarity(train)
    # NOTE(review): confirm that user '59915' actually occurs in this
    # dataset; the recommendation is only meaningful for a user in train.
    Recommend('59915', train, W)
# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()