1. Why This Post
First, a small lament about how rich the subfields of recommender systems are: traditional retrieval models for the recall stage, CTR models for the ranking stage, sequential recommendation, conversational recommendation, and combinations of recommenders with graph neural networks, knowledge graphs, and NAS. After reading papers until my eyes glazed over, I looked back and realized I had never really gotten past the basics.
BPR is a foundational recommendation algorithm. When I was starting out, the algorithm itself was easy to understand, but implementing it end to end was another matter: how to load the data, how to do negative sampling, and how to write the evaluation metrics all gave me trouble.
Following Prof. Xiang Wang's NGCF implementations (the TensorFlow and PyTorch versions), I built a complete BPR recommendation framework of my own.
I made some improvements and wanted to wrap everything into a single BPR class, but a conflict between CUDA and the multiprocess evaluation leaves the class design slightly flawed; it still needs further work.
2. A Brief Introduction to the BPR Algorithm
Given the set of observed interactions $(u, i)$ in the dataset, we can also derive the set of pairs $(u, j)$ that each user has not interacted with, and combine the two into a set of triples $(u, i, j)$, where $u$ is a user, $i$ is an item $u$ has interacted with, and $j$ is an item $u$ has not:
$$D_S := \{(u, i, j) \mid i \in I_u^+ \wedge j \in I_u^-\}$$
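For intuition, here is a toy sketch (my own addition, not from the original post) of what $D_S$ looks like for a single hypothetical user with item universe $\{0, \dots, 5\}$:

```python
# Toy example of I_u^+ / I_u^- and the triples (u, i, j) for one user.
all_items = set(range(6))
I_pos = {1, 3}                 # I_u^+: items user u interacted with
I_neg = all_items - I_pos      # I_u^-: {0, 2, 4, 5}
triples = [(0, i, j) for i in I_pos for j in I_neg]  # u = 0
print(triples)  # 2 * 4 = 8 triples; this grows fast, hence the sampling later
```

Even this toy user yields $|I_u^+| \times |I_u^-|$ triples, which is why the Data class below samples triples on the fly instead of materializing all of $D_S$.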
For each triple $(u, i, j)$, we compute $x_{ui}$ (the predicted score of an interaction between $u$ and $i$) and $x_{uj}$ (the predicted score of an interaction between $u$ and $j$), which gives the final loss function:
$$BPR\text{-}Loss = -\sum_{(u,i,j) \in D_S} \ln \sigma(x_{ui} - x_{uj}) + \lambda \lVert \Theta \rVert^2$$
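As a quick numeric check (scores made up for illustration), the loss for a small batch of triples can be computed directly:

```python
import torch

# Hypothetical predicted scores for three triples (u, i, j).
x_ui = torch.tensor([2.1, 0.3, 1.5])     # scores for the positive items
x_uj = torch.tensor([0.4, 0.9, 1.5])     # scores for the negative items
theta = torch.tensor([0.1, -0.2, 0.05])  # stand-in for the model parameters
lam = 1e-5

loss = -torch.log(torch.sigmoid(x_ui - x_uj)).sum() + lam * theta.norm() ** 2
print(loss.item())  # smaller when positives score above negatives
```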
This post focuses on the code, so the algorithmic details are only sketched; for a thorough treatment, see Liu Jianping's blog post 贝叶斯个性化排序(BPR)算法小结 (A Summary of Bayesian Personalized Ranking).
3. Dataset and Environment
- Environment: Python 3.6, PyTorch 1.5, pandas, numpy
- Dataset: Amazon-book. The program uses data that has already been filtered and re-indexed.
Dataset format:

```
0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
1 53 54 49 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
2 372 373 114 13 374 375 376 377 378 379 380 381 382 383
3 415 416 49 417 32 418 23 419 27 211 420 421 422 24 115
4 551 552 553 554 555 556
5 594 595 54 596 367 597 598 599 600 601
```

Each line holds one user followed by the items that user has interacted with:

```
userid itemid1 itemid2 itemid3 itemid4 itemid5 itemid6 itemid7 itemid8
```
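A minimal sketch of parsing one such line (a hypothetical helper, not part of the original code):

```python
def parse_line(line: str):
    """Split 'userid item1 item2 ...' into (uid, [items])."""
    fields = [int(x) for x in line.strip().split(' ')]
    return fields[0], fields[1:]

uid, items = parse_line('5 594 595 54 596 367 597 598 599 600 601')
print(uid, items)  # 5 [594, 595, 54, 596, 367, 597, 598, 599, 600, 601]
```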
4. Implementation
The overall driver code:

- First set the random seed so experiments are reproducible, and choose the device to run on.
- Load the amazon-book dataset through the hand-written Data class.
- Initialize the BPR class, presetting embedding_size and l2_reg_embedding.
- Finally call model.fit() to train, printing evaluation metrics on the test set along the way.
```python
import random

import numpy as np
import torch


def set_seed(seed):
    torch.manual_seed(seed)           # seed the CPU RNG
    torch.cuda.manual_seed(seed)      # seed the current GPU
    torch.cuda.manual_seed_all(seed)  # seed all GPUs
    np.random.seed(seed)
    random.seed(seed)


def set_device():
    device = 'cpu'
    use_cuda = True
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = 'cuda:0'
    return device


if __name__ == '__main__':
    seed = 1024
    set_seed(seed)         # fix the random seed
    device = set_device()  # pick the device
    filepath = 'amazon-book'
    data_generator = Data(path=filepath)
    bpr.data_generator = data_generator  # hand the dataset to the bpr module (see the module-level variable below)
    model = BPR(data_generator.n_users, data_generator.n_items,
                embedding_size=10, l2_reg_embedding=0.00001, device=device)
    model.fit(learning_rate=0.001, batch_size=2000, epochs=50, verbose=5, early_stop=False)
```
The Data loading class:

```python
class Data(object):
    def __init__(self, path, batch_size=256):  # read in the dataset
        ...
    def sample(self, batch_size=256):  # randomly generate a batch of triples to feed the model
        ...
```

- The __init__() method loads the dataset. Its main job is to build the {user: item_list} maps for the training and test sets.
```python
def __init__(self, path, batch_size=256):
    self.path = path
    self.batch_size = batch_size
    train_file = path + '/train.txt'
    test_file = path + '/test.txt'
    # get the number of users and items
    self.n_users, self.n_items = 0, 0
    self.n_train, self.n_test = 0, 0
    self.neg_pools = {}
    self.exist_users = []
    self.train_items, self.test_set = {}, {}  # userid -> item list
    with open(train_file) as f:
        for l in f.readlines():
            if len(l) > 0:
                l = l.strip('\n').split(' ')
                items = [int(i) for i in l[1:]]
                uid = int(l[0])
                self.exist_users.append(uid)
                self.n_items = max(self.n_items, max(items))
                self.n_users = max(self.n_users, uid)
                self.n_train += len(items)
                self.train_items[uid] = items
    with open(test_file) as f:
        for l in f.readlines():
            if len(l) > 0:
                try:
                    l = l.strip('\n').split(' ')
                    uid = int(l[0])
                    items = [int(i) for i in l[1:]]
                except Exception:
                    continue
                self.n_items = max(self.n_items, max(items))
                self.n_test += len(items)
                self.test_set[uid] = items
    self.n_items += 1  # users and items are numbered from 0, hence the +1
    self.n_users += 1
```
- The sample() method randomly generates a batch of triples that is then fed to the model for training. Why sample a fresh batch each time instead of drawing from a pre-built pool of all triples? Because with this many users and items, materializing every triple (u, i, j) would be enormous and would not fit in memory.
```python
def sample(self, batch_size=256):
    '''
    :return: [user id], [positive item id], [negative item id]
    '''
    if batch_size <= self.n_users:
        users = rd.sample(self.exist_users, batch_size)  # rd is Python's random module
    else:
        users = [rd.choice(self.exist_users) for _ in range(batch_size)]

    def sample_pos_items_for_u(u, num):
        # sample num positive items for user u
        pos_items = self.train_items[u]
        n_pos_items = len(pos_items)
        pos_batch = []
        while True:
            if len(pos_batch) == num:
                break
            pos_id = np.random.randint(low=0, high=n_pos_items, size=1)[0]
            pos_i_id = pos_items[pos_id]
            if pos_i_id not in pos_batch:
                pos_batch.append(pos_i_id)
        return pos_batch

    def sample_neg_items_for_u(u, num):
        # sample num negative items for user u
        neg_items = []
        while True:
            if len(neg_items) == num:
                break
            neg_id = np.random.randint(low=0, high=self.n_items, size=1)[0]
            if neg_id not in self.train_items[u] and neg_id not in neg_items:
                neg_items.append(neg_id)
        return neg_items

    pos_items, neg_items = [], []
    for u in users:
        pos_items += sample_pos_items_for_u(u, 1)
        neg_items += sample_neg_items_for_u(u, 1)
    return users, pos_items, neg_items
```
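For intuition, a quick usage sketch of sample() (my own illustration; actual ids depend on the seed):

```python
users, pos_items, neg_items = data_generator.sample(batch_size=4)
# Three parallel lists of length 4: pos_items[k] is an item that
# users[k] interacted with in train.txt, and neg_items[k] is one they did not.
```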
The BPR model class:

```python
data_generator = 0  # assigned Data(path=filepath) from main.py; a module-level variable here makes the program run faster


class BPR(nn.Module):
    def __init__(self, n_user, n_item,
                 embedding_size=4, l2_reg_linear=0.00001, l2_reg_embedding=0.00001,
                 l2_reg_dnn=0, init_std=0.0001, seed=1024, device='cpu'):  # build the model from hyperparameters
        ...
    def forward(self, input_dict):  # the forward computation
        ...
    def rating(self, user_batch, all_item):  # build the score matrix; called by test()
        ...
    def fit(self, learning_rate=0.001, batch_size=500, epochs=15):  # train and report test metrics
        ...
    def test(self, batch_size=256):  # compute the evaluation metrics; called by fit()
        ...
    def create_embedding_matrix(self, vocabulary_size, embedding_size):  # embedding helper
        ...
```
- __init__():

```python
def __init__(self, n_user, n_item, embedding_size=4, l2_reg_linear=0.00001,
             l2_reg_embedding=0.00001, l2_reg_dnn=0, init_std=0.0001,
             seed=1024, device='cpu'):
    super(BPR, self).__init__()
    self.n_user = n_user
    self.n_item = n_item
    self.embedding_size = embedding_size
    self.device = device
    self.l2_reg_embedding = l2_reg_embedding
    self.embedding_dict = nn.ModuleDict({
        'user_emb': self.create_embedding_matrix(n_user, embedding_size),
        'item_emb': self.create_embedding_matrix(n_item, embedding_size)
    })
    self.to(device)

def create_embedding_matrix(self, vocabulary_size, embedding_size, init_std=0.0001, sparse=False):
    embedding = nn.Embedding(vocabulary_size, embedding_size, sparse=sparse)
    nn.init.normal_(embedding.weight, mean=0, std=init_std)
    return embedding
```
- fit(): split into two main parts:
  - Training: randomly sample a batch of triples (u, i, j), compute the loss on them via **forward()**, and update the parameters by gradient descent.
  - Evaluation: call test() to obtain precision, recall, ndcg, and the other metrics on the test set.
```python
def fit(self, learning_rate=0.001, batch_size=500, epochs=15, verbose=5, early_stop=False):
    print(self.device, end="\n")
    self.data_generator = data_generator
    model = self.train()
    loss_func = nn.LogSigmoid()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.0)
    # report how many steps one epoch takes
    sample_num = data_generator.n_train
    n_batch = (sample_num - 1) // batch_size + 1
    print("Train on {0} samples, {1} steps per epoch".format(sample_num, n_batch))
    logloss_list = []
    auc_score_list = []
    for epoch in range(epochs):
        loss_epoch = 0
        total_loss, total_mf_loss, total_emb_loss = 0.0, 0.0, 0.0
        train_result = {}
        pred_ans = []
        true_ans = []
        with torch.autograd.set_detect_anomaly(True):
            start_time = time.time()
            for index in range(n_batch):
                users, pos_items, neg_items = data_generator.sample(batch_size)
                users = torch.from_numpy(np.array(users)).to(self.device).long()
                pos_items = torch.from_numpy(np.array(pos_items)).to(self.device).long()
                neg_items = torch.from_numpy(np.array(neg_items)).to(self.device).long()
                input_dict = {'users': users, 'pos_items': pos_items, 'neg_items': neg_items}
                rui, ruj, emb_loss = model(input_dict)
                optimizer.zero_grad()
                mf_loss = -loss_func(rui - ruj).mean()
                reg_emb_loss = self.l2_reg_embedding * emb_loss / batch_size
                loss = mf_loss + reg_emb_loss
                loss.backward(retain_graph=True)
                optimizer.step()
                total_mf_loss = total_mf_loss + mf_loss.item()
                total_emb_loss = total_emb_loss + reg_emb_loss.item()
                total_loss = total_loss + loss.item()
        epoch_time = time.time() - start_time
        print('epoch %d %.2fs train loss is [%.4f = %.4f + %.4f] ' % (
            epoch, epoch_time, total_loss / n_batch, total_mf_loss / n_batch, total_emb_loss / n_batch))
        start_time = time.time()
        result = self.test(batch_size=batch_size)
        eval_time = time.time() - start_time
        print('epoch %d %.2fs test precision is [%.4f %.4f] recall is [%.4f %.4f] '
              'ndcg is [%.4f %.4f] hit_ratio is [%.4f %.4f] MAP is [%.4f %.4f] auc is %.4f ' % (
            epoch, eval_time,
            result['precision'][0], result['precision'][-1],
            result['recall'][0], result['recall'][-1],
            result['ndcg'][0], result['ndcg'][-1],
            result['hit_ratio'][0], result['hit_ratio'][-1],
            result['MAP'][0], result['MAP'][-1], result['auc']))
        print(" ")
```
- forward(): for each triple $(u, i, j)$, compute $x_{ui}$ (the predicted score of an interaction between $u$ and $i$) and $x_{uj}$ (the predicted score of an interaction between $u$ and $j$).
```python
def forward(self, input_dict):
    '''
    :return: rui, ruj, emb_loss
    '''
    users, pos_items, neg_items = input_dict['users'], input_dict['pos_items'], input_dict['neg_items']
    user_vector = self.embedding_dict['user_emb'](users)
    pos_items_vector = self.embedding_dict['item_emb'](pos_items)
    neg_items_vector = self.embedding_dict['item_emb'](neg_items)
    rui = torch.sum(torch.mul(user_vector, pos_items_vector), dim=-1, keepdim=True)
    ruj = torch.sum(torch.mul(user_vector, neg_items_vector), dim=-1, keepdim=True)
    emb_loss = torch.norm(user_vector) ** 2 + torch.norm(pos_items_vector) ** 2 + torch.norm(neg_items_vector) ** 2
    return rui, ruj, emb_loss
```
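To see what forward() computes, here is a tiny tensor-level check (my own illustration, with made-up embeddings):

```python
import torch

u = torch.tensor([[1.0, 0.0]])   # one user embedding
i = torch.tensor([[0.5, 0.5]])   # positive item embedding
j = torch.tensor([[-0.5, 0.2]])  # negative item embedding

rui = torch.sum(u * i, dim=-1, keepdim=True)  # tensor([[0.5]])
ruj = torch.sum(u * j, dim=-1, keepdim=True)  # tensor([[-0.5]])
# rui > ruj, so -logsigmoid(rui - ruj) is small for this triple
```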
- test():
  - Call rating() to produce a score (interaction probability) for every <user, item> pair.
  - For each user u, take the set of items u has not interacted with, $I_u^-$, together with the items u actually interacted with in the test set, $I_u^{test}$, and compute the metrics from them. (Because the full item set is very large, some implementations instead sample from the un-interacted items to keep the positive-to-negative ratio fixed, e.g. at 1:100.) The per-user metric computation lives in **test_one_user()**, which this post does not include; it can be found in the code files, and a rough sketch follows the test() code below.
  - To speed up evaluation, we use multiprocessing to compute several users' metrics at once.
```python
def test(self, batch_size=256):
    model = self.eval()
    cores = multiprocessing.cpu_count() // 2
    pool = multiprocessing.Pool(cores)
    Ks = [20, 40, 60, 80, 100]
    # data_generator = self.data_generator
    ITEM_NUM = data_generator.n_items
    result = {'precision': np.zeros(len(Ks)), 'recall': np.zeros(len(Ks)), 'ndcg': np.zeros(len(Ks)),
              'hit_ratio': np.zeros(len(Ks)), 'MAP': np.zeros(len(Ks)), 'auc': 0.}
    u_batch_size = batch_size
    i_batch_size = batch_size
    test_users = list(data_generator.test_set.keys())
    n_test_users = len(test_users)
    n_user_batchs = (n_test_users - 1) // u_batch_size + 1
    count = 0
    # with torch.no_grad():
    for u_batch_id in range(n_user_batchs):
        start = u_batch_id * u_batch_size
        end = (u_batch_id + 1) * u_batch_size
        user_batch = test_users[start: end]  # a slice of users
        all_item = range(ITEM_NUM)
        user_batch = torch.from_numpy(np.array(user_batch)).to(self.device).long()
        all_item = torch.from_numpy(np.array(all_item)).to(self.device).long()
        rate_batch = model.rating(user_batch, all_item).detach().cpu()  # shape [len(user_batch), ITEM_NUM], the predicted score matrix
        user_batch_rating_uid = zip(rate_batch.numpy(), user_batch.detach().cpu().numpy())  # one row of scores per user
        batch_result = pool.map(test_one_user, user_batch_rating_uid)
        count += len(batch_result)
        for re in batch_result:
            result['precision'] += re['precision'] / n_test_users
            result['recall'] += re['recall'] / n_test_users
            result['ndcg'] += re['ndcg'] / n_test_users
            result['hit_ratio'] += re['hit_ratio'] / n_test_users
            result['MAP'] += re['MAP'] / n_test_users
            # result['auc'] += re['auc'] / n_test_users
    assert count == n_test_users
    pool.close()
    return result
```
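Since test_one_user() is not shown here, the following is a rough sketch of what such a function might compute. It is my own simplification (recall@K only, whereas the real function also returns precision, ndcg, hit_ratio, MAP, and auc) and assumes the module-level data_generator and numpy import:

```python
def test_one_user(x):
    # x is one (rating_row, uid) pair produced by test()
    rating, u = x[0], x[1]
    Ks = [20, 40, 60, 80, 100]
    training_items = data_generator.train_items.get(u, [])
    user_pos_test = data_generator.test_set[u]
    rating = rating.copy()
    rating[training_items] = -np.inf  # never recommend items seen in training
    ranked = np.argsort(-rating)      # item ids sorted by score, best first
    recall = []
    for K in Ks:
        hits = len(set(ranked[:K]) & set(user_pos_test))
        recall.append(hits / len(user_pos_test))
    return {'recall': np.array(recall)}
```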
- rating(): produce a score (interaction probability) for every <user, item> pair.
```python
def rating(self, user_batch, all_item):
    user_vector = self.embedding_dict['user_emb'](user_batch)
    pos_items_vector = self.embedding_dict['item_emb'](all_item)
    return torch.mm(user_vector, pos_items_vector.t())
```
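The shapes work out as follows (illustrative sizes, with plain tensors standing in for the embedding lookups):

```python
import torch

B, N, d = 3, 5, 10                 # users in the batch, items, embedding size
user_vector = torch.randn(B, d)
item_vector = torch.randn(N, d)
scores = torch.mm(user_vector, item_vector.t())
print(scores.shape)                # torch.Size([3, 5]): one row of item scores per user
```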
This post walked through getting started with recommender systems by implementing the BPR algorithm, with the CUDA/multiprocessing conflict noted as remaining work. It covered the BPR loss, the Amazon-book dataset, the step-by-step implementation, and a detailed look at the Data loading class.