Reference paper: Advances in Collaborative Filtering
Reference: http://blog.youkuaiyun.com/dark_scope/article/details/17228643
Dataset: MovieLens 100k
Here I only use the two splits u1.base and u1.test, as the training set and the test set respectively.
Baseline Predictor
b_ui → the baseline prediction for an unknown rating r_ui.
b_u → the observed deviation of user u from the average.
b_i → the observed deviation of item i from the average.
μ → the overall average rating.

The baseline prediction is

b_ui = μ + b_u + b_i

We can estimate b_u and b_i by optimizing the following objective function:

min_{b*} Σ_{(u,i)∈K} (r_ui − μ − b_u − b_i)² + λ₁ ( Σ_u b_u² + Σ_i b_i² )

where K is the set of (u,i) pairs with known ratings and the λ₁ term regularizes the parameters.

There is a simpler, though somewhat less accurate, way to estimate b_u and b_i by decoupling them:

b_i = Σ_{u∈R(i)} (r_ui − μ) / (λ₂ + |R(i)|)
b_u = Σ_{i∈R(u)} (r_ui − μ − b_i) / (λ₃ + |R(u)|)

where R(i) is the set of users who rated item i and R(u) is the set of items rated by user u.
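The decoupled estimates translate directly into code. Below is a minimal sketch, not part of the original implementation; the function names and the regularization values lambda2/lambda3 are only illustrative. It computes μ, b_i and b_u from a list of (uid, mid, rating) triples:

import numpy as np

def baseline_estimates(train_X, lambda2=25, lambda3=10):
    """Decoupled baseline estimates: first b_i, then b_u (illustrative sketch)."""
    train_X = np.array(train_X)
    mu = np.mean(train_X[:, 2])              # overall average rating
    item_dev, user_dev = {}, {}              # residuals grouped per item / per user
    for uid, mid, rat in train_X:
        item_dev.setdefault(mid, []).append(rat - mu)
    b_i = {mid: sum(d) / (lambda2 + len(d)) for mid, d in item_dev.items()}
    for uid, mid, rat in train_X:
        user_dev.setdefault(uid, []).append(rat - mu - b_i[mid])
    b_u = {uid: sum(d) / (lambda3 + len(d)) for uid, d in user_dev.items()}
    return mu, b_u, b_i

def baseline_pred(mu, b_u, b_i, uid, mid):
    # b_ui = μ + b_u + b_i, falling back to 0 for unseen users/items
    return mu + b_u.get(uid, 0) + b_i.get(mid, 0)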
Evaluation
We use the RMSE (Root Mean Squared Error) to evaluate the quality of the predictions:

RMSE = √( Σ_{(u,i)∈TestSet} (r̂_ui − r_ui)² / |TestSet| )

|TestSet| is the number of ratings in the test set.
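For reference, a minimal sketch of this computation, assuming the predictions and the true ratings are two equal-length sequences:

import numpy as np

def rmse(preds, truths):
    # square root of the mean squared prediction error
    preds, truths = np.array(preds, dtype=float), np.array(truths, dtype=float)
    return np.sqrt(np.mean((preds - truths) ** 2))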
SVD
Concept summary
The inner product q_i^T p_u captures the interaction between user u and item i, so a rating is predicted as

r̂_ui = μ + b_i + b_u + q_i^T p_u

The objective function to optimize (over b_i, b_u, q_i, p_u):

min Σ_{(u,i)∈K} (r_ui − μ − b_i − b_u − q_i^T p_u)² + λ₄ ( b_i² + b_u² + ‖q_i‖² + ‖p_u‖² )

It is optimized with stochastic gradient descent. With the prediction error e_ui = r_ui − r̂_ui, the parameters are updated for each training rating as:

b_u ← b_u + γ (e_ui − λ₄ b_u)
b_i ← b_i + γ (e_ui − λ₄ b_i)
q_i ← q_i + γ (e_ui · p_u − λ₄ · q_i)
p_u ← p_u + γ (e_ui · q_i − λ₄ · p_u)
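To make the correspondence between these update rules and the code explicit, here is a minimal sketch of one SGD step for a single training triple (uid, mid, rat); the SVD_CF class below runs exactly this loop over the whole (shuffled) training set. The helper name sgd_step is only illustrative.

import numpy as np

def sgd_step(mu, bu, bi, pu, qi, uid, mid, rat, gamma=0.04, lda=0.15):
    # predict, compute the error e_ui, then apply the four update rules
    pred = mu + bi[mid] + bu[uid] + np.sum(qi[mid] * pu[uid])
    eui = rat - pred
    bu[uid] += gamma * (eui - lda * bu[uid])
    bi[mid] += gamma * (eui - lda * bi[mid])
    qi[mid] += gamma * (eui * pu[uid] - lda * qi[mid])
    pu[uid] += gamma * (eui * qi[mid] - lda * pu[uid])  # uses the already-updated q_i, as in the class below
    return eui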
Implementation
svd_mf.py implements the algorithm (this is the module imported by test.py below):
from __future__ import division
import numpy as np
import scipy as sp
from numpy.random import random
class SVD_CF:
    def __init__(self, train_X, k=20):
        self.train_X = np.array(train_X)
        self.k = k                                # size of the latent vectors
        self.num_train = self.train_X.shape[0]    # size of the training set
        self.bu = {}                              # deviations of user u
        self.bi = {}                              # deviations of item i
        self.mu = np.mean(self.train_X[:, 2])     # overall average rating
        self.qi = {}                              # latent vector of each item
        self.pu = {}                              # latent vector of each user
        self.movie_user = {}
        self.user_movie = {}
        for i in range(self.num_train):
            uid = self.train_X[i][0]
            mid = self.train_X[i][1]
            rat = self.train_X[i][2]
            self.movie_user.setdefault(mid, {})
            self.movie_user[mid][uid] = rat
            self.user_movie.setdefault(uid, {})
            self.user_movie[uid][mid] = rat
            # latent vectors are initialized with small random values
            self.qi.setdefault(mid, random((self.k, 1)) / 10 * (np.sqrt(self.k)))
            self.pu.setdefault(uid, random((self.k, 1)) / 10 * (np.sqrt(self.k)))
    def pred(self, uid, mid):
        self.bi.setdefault(mid, 0)
        self.bu.setdefault(uid, 0)
        self.qi.setdefault(mid, np.zeros((self.k, 1)))  # items unseen in training get a zero latent vector
        self.pu.setdefault(uid, np.zeros((self.k, 1)))  # users unseen in training get a zero latent vector
        ans = self.mu + self.bi[mid] + self.bu[uid] + np.sum(self.qi[mid] * self.pu[uid])
        if ans > 5:     # clip predictions above 5
            return 5
        elif ans < 1:   # clip predictions below 1
            return 1
        return ans
    def train(self, steps=20, gamma=0.04, lda=0.15):
        print("Train on SVD:")
        for step in range(steps):
            print('the ', step, '-th step is running')
            rmse_sum = 0.0
            kk = np.random.permutation(self.num_train)  # visit the training ratings in random order
            for j in range(self.num_train):
                i = kk[j]
                uid = self.train_X[i][0]
                mid = self.train_X[i][1]
                rat = self.train_X[i][2]
                rui = self.pred(uid, mid)
                eui = rat - rui                         # prediction error e_ui
                rmse_sum += eui ** 2
                self.bu[uid] += gamma * (eui - lda * self.bu[uid])
                self.bi[mid] += gamma * (eui - lda * self.bi[mid])
                self.qi[mid] += gamma * (eui * self.pu[uid] - lda * self.qi[mid])
                self.pu[uid] += gamma * (eui * self.qi[mid] - lda * self.pu[uid])
            gamma = gamma * 0.93                        # decay the learning rate after each epoch
            rmse = np.sqrt(rmse_sum / self.num_train)
            print("the rmse of this step on train data is ", rmse)
    def test(self, test_X):
        test_X = np.array(test_X)
        output = []
        sums = 0
        print("Test on SVD:")
        print("the test data size is ", test_X.shape)
        for i in range(test_X.shape[0]):
            pre = self.pred(test_X[i][0], test_X[i][1])
            output.append(pre)
            sums += (pre - test_X[i][2]) ** 2
        rmse = np.sqrt(sums / test_X.shape[0])
        print("the rmse on test data is ", rmse)
        return output
Note: while testing, I kept getting errors because I had forgotten to import scipy as sp.
Item-based
Concept summary
The similarity s_ij between item i and item j is measured with the Pearson correlation coefficient, computed over U(i,j), the set of users who rated both item i and item j (the paper centres the ratings r_ui on the baselines b_ui; the implementation below simply centres them on the means over U(i,j)):

s_ij = Σ_{u∈U(i,j)} (r_ui − r̄_i)(r_uj − r̄_j) / √( Σ_{u∈U(i,j)} (r_ui − r̄_i)² · Σ_{u∈U(i,j)} (r_uj − r̄_j)² )

Based on this similarity, the predicted rating r̂_ui for item i is the similarity-weighted average of the ratings that the same user u has already given (items with negative similarity are skipped):

r̂_ui = Σ_{j∈S(i;u)} s_ij · r_uj / Σ_{j∈S(i;u)} s_ij

where S(i;u) is the set of items rated by user u.
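As a toy numerical illustration of the weighted average (the numbers are made up, not taken from the dataset): if user u has rated items j1 and j2 with r_uj1 = 4 and r_uj2 = 2, and their similarities to the target item i are s_ij1 = 0.8 and s_ij2 = 0.2, then r̂_ui = (0.8·4 + 0.2·2) / (0.8 + 0.2) = 3.6.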
Implementation
item_based.py implements the algorithm:
from __future__ import division
import numpy as np
import scipy as sp
class Item_based_CF:
    def __init__(self, train_X):
        self.train_X = np.array(train_X)
        self.movie_user = {}   # used to look up U(i,j): the users who rated each movie
        self.user_movie = {}   # used to look up S(i;u): the movies rated by each user
        self.ave = np.mean(self.train_X[:, 2])   # global mean rating
        for i in range(self.train_X.shape[0]):
            uid = self.train_X[i][0]
            mid = self.train_X[i][1]
            rat = self.train_X[i][2]
            self.movie_user.setdefault(mid, {})  # dict.setdefault(key, default) inserts default when the key is missing
            self.movie_user[mid][uid] = rat
            self.user_movie.setdefault(uid, {})
            self.user_movie[uid][mid] = rat
        self.movie_sim = {}                      # cache of pairwise movie similarities
    def sim_cal(self, m1, m2):
        self.movie_sim.setdefault(m1, {})
        self.movie_sim.setdefault(m2, {})
        self.movie_user.setdefault(m1, {})     # a movie may never have been rated in the training set
        self.movie_user.setdefault(m2, {})
        self.movie_sim[m1].setdefault(m2, -1)  # -1 marks a pair whose similarity has not been computed yet
        self.movie_sim[m2].setdefault(m1, -1)
        if self.movie_sim[m1][m2] != -1:       # return the cached similarity directly
            return self.movie_sim[m1][m2]
        si = {}                                # si collects U(i,j): users who rated both movies
        for user in self.movie_user[m1]:
            if user in self.movie_user[m2]:
                si[user] = 1
        n = len(si)
        if (n == 0):                           # U(i,j) is empty
            self.movie_sim[m1][m2] = 1
            self.movie_sim[m2][m1] = 1
            return 1
        # Pearson correlation coefficient over U(i,j)
        s1 = np.array([self.movie_user[m1][u] for u in si])
        s2 = np.array([self.movie_user[m2][u] for u in si])
        sum1 = np.sum(s1)
        sum2 = np.sum(s2)
        sum1Sq = np.sum(s1 ** 2)
        sum2Sq = np.sum(s2 ** 2)
        pSum = np.sum(s1 * s2)
        num = pSum - (sum1 * sum2 / n)
        den = np.sqrt((sum1Sq - sum1 ** 2 / n) * (sum2Sq - sum2 ** 2 / n))
        if den == 0:                           # the correlation is only defined when den != 0
            self.movie_sim[m1][m2] = 0
            self.movie_sim[m2][m1] = 0
            return 0
        self.movie_sim[m1][m2] = num / den
        self.movie_sim[m2][m1] = num / den
        return num / den
    def pred(self, uid, mid):
        sim_accumulate = 0.0
        rat_acc = 0.0
        for item in self.user_movie.get(uid, {}):  # users unseen in training have no rated items
            sim = self.sim_cal(item, mid)
            if sim < 0: continue                   # skip negatively correlated items
            rat_acc += sim * self.user_movie[uid][item]
            sim_accumulate += sim
        if sim_accumulate == 0:  # no usable neighbours: fall back to the global average rating
            return self.ave
        return rat_acc / sim_accumulate
    def test(self, test_X):
        test_X = np.array(test_X)
        output = []
        sums = 0
        print("Test on Item-Based:")
        print("the test data size is ", test_X.shape)
        for i in range(test_X.shape[0]):
            pre = self.pred(test_X[i][0], test_X[i][1])
            output.append(pre)
            sums += (pre - test_X[i][2]) ** 2
        rmse = np.sqrt(sums / test_X.shape[0])
        print("the rmse on test data is ", rmse)
        return output
test.py reads the data and runs the tests:
import numpy as np
from item_based import Item_based_CF
from svd_mf import SVD_CF
def data_read():
    test = 'D:/MachineLearning/Notes/Recommendation/Projects/test01_CF/data/ml-100k/u1.test'
    train = 'D:/MachineLearning/Notes/Recommendation/Projects/test01_CF/data/ml-100k/u1.base'
    train_X = []
    test_X = []
    with open(test, 'r') as f:
        lines = f.readlines()
        for each in lines:
            p = each.split()
            new = [int(p[0]), int(p[1]), int(p[2])]
            test_X.append(new)
    with open(train, 'r') as f:
        lines = f.readlines()
        for each in lines:
            p = each.split()
            new = [int(p[0]), int(p[1]), int(p[2])]
            train_X.append(new)
    return train_X, test_X
#
train_X,test_X=data_read()
a=Item_based_CF(train_X)
a.test(test_X)
b=SVD_CF(train_X,30)
b.train()
b.test(test_X)
OUTPUT:
Test on Item-Based:
the test data size is (20000, 3)
the rmse on test data is 1.04164917355
Train on SVD:
the 0 -th step is running
the rmse of this step on train data is 1.06147863504
the 1 -th step is running
the rmse of this step on train data is 0.933068598689
the 2 -th step is running
the rmse of this step on train data is 0.917020530118
the 3 -th step is running
the rmse of this step on train data is 0.908024566318
the 4 -th step is running
the rmse of this step on train data is 0.899151009875
the 5 -th step is running
the rmse of this step on train data is 0.891401965185
the 6 -th step is running
the rmse of this step on train data is 0.884340062176
the 7 -th step is running
the rmse of this step on train data is 0.877877195046
the 8 -th step is running
the rmse of this step on train data is 0.872217832148
the 9 -th step is running
the rmse of this step on train data is 0.867243365719
the 10 -th step is running
the rmse of this step on train data is 0.862927174872
the 11 -th step is running
the rmse of this step on train data is 0.859048442347
the 12 -th step is running
the rmse of this step on train data is 0.85523094384
the 13 -th step is running
the rmse of this step on train data is 0.852608188393
the 14 -th step is running
the rmse of this step on train data is 0.849679360252
the 15 -th step is running
the rmse of this step on train data is 0.847087016118
the 16 -th step is running
the rmse of this step on train data is 0.84498687518
the 17 -th step is running
the rmse of this step on train data is 0.842603466917
the 18 -th step is running
the rmse of this step on train data is 0.840825090125
the 19 -th step is running
the rmse of this step on train data is 0.838784925391
Test on SVD:
the test data size is (20000, 3)
the rmse on test data is 0.929369246698