推荐之召回策略

这篇博客介绍了召回策略中item的协同过滤算法的实现,包括物品相似度计算、用户对item的推荐分数以及如何考虑用户活跃度和时间衰减。首先,通过计算用户重合度确定物品相似度,然后利用物品相似度和用户行为得分计算推荐分数。在升级版的算法中,引入了用户活跃度的贡献度降低和时间衰减惩罚。博客提供了Python代码实现,展示了从用户行为数据和物品信息中提取信息,并进行推荐结果计算的过程。

召回策略-item的协同过滤

给用户推荐他之前喜欢的物品的相似物品

1)计算物品的相似度:喜欢两个物品的用户重合度越高,那么两个物品就越相似

2)计算用户对item_j的推荐分数

https://img4.mukewang.com/5c3dcf9d000159fa03410099.jpg


N(u) : user行为过的item 的总数
rui : user对物品i的行为得分(比如电影评分系统中用户对电影的评分-->归一化0~1间的一个值)
Sij  : 物品 i 和 j 的相似得分
item i 是用户行为过的物品且是与item j 最相似的top k个 item (一般实战中选取50个)

公式升级

1.理论意义:活跃用户应该被降低在相似度中的贡献度

https://img2.mukewang.com/5c3dd0ca0001325f03170115.jpg

由于部分用户非常活跃,几乎对所有物品都有过行为,他们的共现对物品相似度缺乏区分度,所以要降低这类用户的贡献度

2.理论意义:用户在不同时间对item的操作应给予时间衰减惩罚

随着时间的增长,用户对于item的操作应该进行衰减

https://img4.mukewang.com/5c3dd0f30001017203220248.jpg

代码实现:抽取相关信息

# -*-coding:utf-8-*-
import os
import sys

# Build user -> clicked-items data: {userid: [item1, item2], ...}
def get_user_click(rating_file):
    """Load the items each user rated at least 3.0 from a ratings CSV.

    The first line is treated as a header and skipped. Every other line is
    split on commas and must yield exactly four fields
    (userid, itemid, rating, timestamp). Lines with a different field count
    or a rating that is not a number are skipped, so one malformed row does
    not abort the whole load.

    Args:
        rating_file: path to the ratings CSV file.

    Returns:
        dict mapping userid (str) to the list of itemids (str) that user
        rated >= 3.0, in file order. An empty dict if the file is missing.
    """
    if not os.path.exists(rating_file):
        return {}

    user_click = {}
    # The context manager closes the file even when parsing raises.
    with open(rating_file) as fp:
        next(fp, None)  # skip the header line (no-op on an empty file)
        for line in fp:
            fields = line.strip().split(',')
            # Exactly four fields are required; the unpacking below would
            # raise on longer rows.
            if len(fields) != 4:
                continue
            userid, itemid, rating, _timestamp = fields
            try:
                score = float(rating)
            except ValueError:
                # Treated like any other malformed row.
                continue
            if score < 3.0:
                continue
            user_click.setdefault(userid, []).append(itemid)
    return user_click
# Build item -> features data: {itemid: [title, genres], ...}
def get_item_info(item_file):
    """Load item titles and genres from an items CSV.

    The first line is treated as a header and skipped. Every other line is
    split on commas; the first field is the itemid, the last field is the
    genres string, and everything in between is re-joined with commas to form
    the title (so titles containing commas survive the split). Lines with
    fewer than three fields are skipped. Any quote characters in the file are
    kept as-is because the line is split as plain text.

    Args:
        item_file: path to the items CSV file.

    Returns:
        dict mapping itemid (str) to [title, genres]. When an itemid appears
        more than once the first occurrence wins. An empty dict if the file
        is missing.
    """
    if not os.path.exists(item_file):
        return {}

    item_info = {}
    # The context manager closes the file even when parsing raises.
    with open(item_file) as fp:
        next(fp, None)  # skip the header line (no-op on an empty file)
        for line in fp:
            fields = line.strip().split(',')
            if len(fields) < 3:
                continue
            itemid = fields[0]
            genres = fields[-1]
            # For a 3-field row this is just fields[1]; for longer rows it
            # restores the commas that belonged to the title.
            title = ",".join(fields[1:-1])
            # setdefault keeps the first entry seen for a repeated itemid.
            item_info.setdefault(itemid, [title, genres])
    return item_info


if __name__ == "__main__":
    # Manual smoke test: load both data files, then show the [title, genres]
    # entry stored for item "1".
    user_click = get_user_click("./data/ratings.csv")
    item_info = get_item_info("./data/movies.csv")
    print(item_info["1"])

itemcf的实现,根据公式1进行的编码

# -*-coding:utf-8-*-
import os
import sys
import reader as reader
import math
import operator

def base_contribute_score():
    """Return the weight of a single co-occurrence: the constant 1."""
    contribution = 1
    return contribution


def cal_item_sim(user_click):
    """Compute item-to-item similarity from co-occurrence in click lists.

    For every pair of items that appear in the same user's list, the pair's
    co-occurrence weight grows by base_contribute_score(). The similarity is
    that weight divided by the square root of the product of the two items'
    click counts.

    Args:
        user_click: dict, userid -> list of itemids.

    Returns:
        dict, itemid_i -> list of (itemid_j, sim_score) tuples ordered by
        sim_score, highest first. Items that never co-occur with another item
        have no entry.
    """
    pair_weight = {}  # itemid_i -> {itemid_j: accumulated co-occurrence}
    click_count = {}  # itemid -> number of click-list entries for the item
    for clicked in user_click.values():
        for pos, left in enumerate(clicked):
            click_count[left] = click_count.get(left, 0) + 1
            # Pair the item with every item after it, recording both
            # directions so the table is symmetric.
            for right in clicked[pos + 1:]:
                forward = pair_weight.setdefault(left, {})
                forward[right] = forward.get(right, 0) + base_contribute_score()

                backward = pair_weight.setdefault(right, {})
                backward[left] = backward.get(left, 0) + base_contribute_score()

    ranked = {}
    for left, neighbours in pair_weight.items():
        scored = {}
        for right, weight in neighbours.items():
            norm = math.sqrt(click_count[left] * click_count[right])
            scored[right] = weight / norm
        ranked[left] = sorted(
            scored.items(), key=operator.itemgetter(1), reverse=True
        )
    return ranked


def cal_recom_result(sim_info, user_click, recent_click_num=3, topk=5):
    """Recall candidate items for every user from item similarity.

    For each user, the first `recent_click_num` entries of the click list are
    used as seeds; for each seed present in `sim_info`, its first `topk`
    similar items are added to that user's result.

    Args:
        sim_info: dict, itemid -> list of (similar_itemid, sim_score) pairs;
            only the first `topk` pairs of each list are read.
        user_click: dict, userid -> list of itemids.
        recent_click_num: number of leading click-list entries used as seeds.
        topk: number of similar items taken per seed.

    Returns:
        dict, userid -> {recalled_itemid: sim_score}. Every user in
        `user_click` has an entry, possibly empty.
    """
    recom_info = {}
    for user, click_list in user_click.items():
        user_recom = recom_info.setdefault(user, {})
        for itemid in click_list[:recent_click_num]:
            # Seeds without similarity data contribute nothing.
            for itemsimzuhe in sim_info.get(itemid, [])[:topk]:
                itemsimid = itemsimzuhe[0]
                itemsimscore = itemsimzuhe[1]
                # NOTE(review): a candidate reached from several seeds keeps
                # the score of the last seed processed; scores are not summed.
                user_recom[itemsimid] = itemsimscore
    return recom_info


def main_flow():
    """Run item-CF on ./data/ratings.csv and print the recall result of user "3"."""
    clicks = reader.get_user_click("./data/ratings.csv")
    similarity = cal_item_sim(clicks)
    recall = cal_recom_result(similarity, clicks)
    print(recall["3"])


if __name__ == "__main__":
    main_flow()

修改后的升级函数

# -*-coding:utf-8-*-
import os
import sys
import reader as reader
import math
import operator

def base_contribute_score():
    """Return the undamped weight of a single co-occurrence: the constant 1."""
    contribution = 1
    return contribution

def update_one_contribute_score(user_total_click_num):
    """Return one user's co-occurrence weight, damped by how active they are.

    The weight is 1 / log10(1 + user_total_click_num), so it shrinks as the
    user's click count grows.

    Args:
        user_total_click_num: number of items the user clicked. Must be
            greater than 0; with 0 the logarithm is 0 and the division
            raises ZeroDivisionError.
    """
    return 1/math.log10(1+user_total_click_num)


def cal_item_sim(user_click):
    """Compute item-to-item similarity with active users down-weighted.

    For every pair of items in the same user's list, the pair's co-occurrence
    weight grows by update_one_contribute_score(len(list)), so users with
    long click lists contribute less per pair. The similarity is that weight
    divided by the square root of the product of the two items' click counts.

    Args:
        user_click: dict, key userid, value [itemid1, itemid2, ...]

    Returns:
        dict, key itemid_i, value list of (itemid_j, sim_score) tuples
        ordered by sim_score, highest first. Items that never co-occur with
        another item have no entry.
    """
    co_appear = {}
    item_user_click_time = {}
    for itemlist in user_click.values():
        for itemid in itemlist:
            item_user_click_time[itemid] = item_user_click_time.get(itemid, 0) + 1

        # Lists with fewer than two items form no pairs; skipping them also
        # keeps an empty list away from the log10(1) == 0 division.
        if len(itemlist) < 2:
            continue

        # The damped weight depends only on the user, so compute it once.
        contribution = update_one_contribute_score(len(itemlist))
        for index_i, itemid_i in enumerate(itemlist):
            for itemid_j in itemlist[index_i+1:]:
                # Record both directions so the table is symmetric.
                forward = co_appear.setdefault(itemid_i, {})
                forward[itemid_j] = forward.get(itemid_j, 0) + contribution

                backward = co_appear.setdefault(itemid_j, {})
                backward[itemid_i] = backward.get(itemid_i, 0) + contribution

    item_sim_score_sorted = {}
    for itemid_i, relate_item in co_appear.items():
        item_sim_score = {}
        for itemid_j, co_time in relate_item.items():
            item_sim_score[itemid_j] = co_time/math.sqrt(
                item_user_click_time[itemid_i] * item_user_click_time[itemid_j])
        item_sim_score_sorted[itemid_i] = sorted(
            item_sim_score.items(), key=operator.itemgetter(1), reverse=True)
    return item_sim_score_sorted


def cal_recom_result(sim_info, user_click, recent_click_num=3, topk=5):
    """Recall candidate items for every user from item similarity.

    For each user, the first `recent_click_num` entries of the click list are
    used as seeds; for each seed present in `sim_info`, its first `topk`
    similar items are added to that user's result.

    Args:
        sim_info: dict, itemid -> list of (similar_itemid, sim_score) pairs;
            only the first `topk` pairs of each list are read.
        user_click: dict, userid -> list of itemids.
        recent_click_num: number of leading click-list entries used as seeds.
        topk: number of similar items taken per seed.

    Returns:
        dict, userid -> {recalled_itemid: sim_score}. Every user in
        `user_click` has an entry, possibly empty.
    """
    recom_info = {}
    for user, click_list in user_click.items():
        user_recom = recom_info.setdefault(user, {})
        for itemid in click_list[:recent_click_num]:
            # Seeds without similarity data contribute nothing.
            for itemsimzuhe in sim_info.get(itemid, [])[:topk]:
                itemsimid = itemsimzuhe[0]
                itemsimscore = itemsimzuhe[1]
                # NOTE(review): a candidate reached from several seeds keeps
                # the score of the last seed processed; scores are not summed.
                user_recom[itemsimid] = itemsimscore
    return recom_info


# Debug helper: print one item's most similar items so the similarity quality
# can be checked by eye.
def debug_getsim(item_info, sim_info, fixed_itemid="3", topk=5):
    """Print the first `topk` similar items of `fixed_itemid` with their metadata.

    Each output line holds the reference item's title and genres, the similar
    item's title and genres, and the similarity score, separated by tabs.
    Similar items missing from `item_info` are skipped.

    Args:
        item_info: dict, itemid -> [title, genres].
        sim_info: dict, itemid -> list of (similar_itemid, sim_score) pairs.
        fixed_itemid: id of the item to inspect.
        topk: number of leading entries of the similarity list to print.

    Returns:
        None. Prints "invalid itemid" when the item is absent from
        `item_info`, and "no sim info for itemid" when it has no similarity
        list.
    """
    if fixed_itemid not in item_info:
        print("invalid itemid")
        return
    if fixed_itemid not in sim_info:
        # An item can have metadata but no similarity list; indexing sim_info
        # directly would raise KeyError.
        print("no sim info for itemid")
        return
    [title_fix, generes_fix] = item_info[fixed_itemid]

    for zuhe in sim_info[fixed_itemid][:topk]:
        itemid_sim = zuhe[0]
        sim_score = zuhe[1]
        if itemid_sim not in item_info:
            continue
        [title, genres] = item_info[itemid_sim]
        print(title_fix+"\t"+generes_fix+"\tsim:"+title+'\t'+genres+"\t"+str(sim_score))



def main_flow():
    """Build item similarity from ./data/ratings.csv and print the debug view."""
    clicks = reader.get_user_click("./data/ratings.csv")
    items = reader.get_item_info("./data/movies.csv")

    similarity = cal_item_sim(clicks)
    debug_getsim(items, similarity)


if __name__ == "__main__":
    main_flow()

基于用户的协同过滤usercf

# -*-coding:utf-8-*-
import os
import sys
import reader as reader
import math
import operator


def base_contribution_score():
    """Return the weight of a single co-occurrence: the constant 1."""
    contribution = 1
    return contribution

def transfer_user_click(user_click):
    """Invert a userid -> itemids mapping into itemid -> userids.

    Args:
        user_click: dict, userid -> list of itemids.

    Returns:
        dict, itemid -> list of the userids whose list contains that item,
        one entry per occurrence, in iteration order of `user_click`.
    """
    inverted = {}
    for userid, clicked in user_click.items():
        for itemid in clicked:
            if itemid not in inverted:
                inverted[itemid] = []
            inverted[itemid].append(userid)
    return inverted


def cal_user_sim(item_click_by_user):
    """Compute user-to-user similarity from co-occurrence on items.

    For every pair of users that appear in the same item's list, the pair's
    co-occurrence weight grows by base_contribution_score(). The similarity
    is that weight divided by the square root of the product of the two
    users' click counts.

    Args:
        item_click_by_user: dict, itemid -> list of userids.

    Returns:
        dict, userid_i -> list of (userid_j, sim_score) tuples ordered by
        sim_score, highest first. Users that never share an item with another
        user have no entry.
    """
    pair_weight = {}  # userid_i -> {userid_j: accumulated co-occurrence}
    click_count = {}  # userid -> number of item-list entries for the user
    for clickers in item_click_by_user.values():
        for pos, left in enumerate(clickers):
            click_count[left] = click_count.get(left, 0) + 1
            # Pair the user with every user after it, recording both
            # directions so the table is symmetric.
            for right in clickers[pos + 1:]:
                forward = pair_weight.setdefault(left, {})
                forward[right] = forward.get(right, 0) + base_contribution_score()

                backward = pair_weight.setdefault(right, {})
                backward[left] = backward.get(left, 0) + base_contribution_score()

    ranked = {}
    for left, neighbours in pair_weight.items():
        scored = {}
        for right, weight in neighbours.items():
            norm = math.sqrt(click_count[left] * click_count[right])
            scored[right] = weight / norm
        ranked[left] = sorted(
            scored.items(), key=operator.itemgetter(1), reverse=True
        )
    return ranked


def cal_recom_result(user_click, user_sim, topk_user=3, item_num=5):
    """Recall candidate items for every user from their most similar users.

    For each user, the first `topk_user` entries of the user's similarity
    list are taken; for each of those neighbours, the first `item_num` items
    of the neighbour's click list are added to the user's result with the
    neighbour's similarity score.

    Args:
        user_click: dict, userid -> list of itemids.
        user_sim: dict, userid -> list of (other_userid, sim_score) pairs.
        topk_user: number of neighbours used per user.
        item_num: number of items taken from each neighbour.

    Returns:
        dict, userid -> {recalled_itemid: sim_score}. Every user in
        `user_click` has an entry; it is empty for users with no similarity
        list.
    """
    recom_result = {}
    for user in user_click:
        user_recom = recom_result.setdefault(user, {})
        # Users who share no item with anyone are absent from user_sim;
        # indexing it directly would raise KeyError.
        for zuhe in user_sim.get(user, [])[:topk_user]:
            userid_j, sim_score = zuhe
            if userid_j not in user_click:
                continue
            for itemid_j in user_click[userid_j][:item_num]:
                # setdefault keeps the score of the first neighbour that
                # supplied the item.
                # NOTE(review): items the user already clicked are not
                # filtered out; the original built a lookup of them but
                # never used it. Confirm whether filtering is wanted.
                user_recom.setdefault(itemid_j, sim_score)
    return recom_result

def main_flow():
    """Run user-CF on ./data/ratings.csv and print the recall result of user "1"."""
    clicks = reader.get_user_click("./data/ratings.csv")
    users_by_item = transfer_user_click(clicks)
    similarity = cal_user_sim(users_by_item)
    recall = cal_recom_result(clicks, similarity)
    print(recall["1"])


if __name__ == "__main__":
    main_flow()

 

评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值