基于协同过滤的电影推荐研究:已经用协同过滤算法得到了推荐结果,但评估结果是怎样得出的?准确率、召回率和 F1 值这些数据是如何计算得到的?
import pandas as pd
import numpy as np
import warnings
# Suppress every warning category for the rest of the script.
warnings.filterwarnings('ignore')

# Ratings file: tab-separated with no header row, so column names are supplied here.
# NOTE(review): 'titmestamp' looks like a misspelling of 'timestamp'; it is kept
# unchanged because the column name is runtime data other code may reference.
# (An earlier variant read a relative 'u.data' path with Chinese column names.)
df = pd.read_csv(
    'D:/Datamovies/ml-100k/u.data',
    sep='\t',
    names=['user_id', 'item_id', 'rating', 'titmestamp'],
)
df.head()  # notebook-style preview; the result is discarded when run as a script

# Movie metadata: pipe-separated with no header row. Only the first two columns
# are named, because only the item id and the title are needed below.
movie_titles = pd.read_csv(
    'D:/Datamovies/ml-100k/u.item',
    sep='|',
    header=None,
    encoding='ISO-8859-1',
).rename(columns={0: "item_id", 1: "title"})
movie_titles.head()

# Attach the title to each rating row by joining on item_id.
df1 = df.merge(movie_titles[['item_id', 'title']], on='item_id')
df1.head()
# Helpers that turn the ratings DataFrame (df) into user-item matrices
def MinMaxScale(data):
    """Min-max normalize *data* so its values fall in [0, 1].

    Args:
        data: An object exposing ``min()``/``max()`` and elementwise
            arithmetic (e.g. a pandas Series/DataFrame or a NumPy array).

    Returns:
        ``(data - min) / (max - min)``, in the same container type as *data*.
        When every value is identical (max == min) the result is all zeros
        instead of the NaNs that a 0/0 division would produce.
    """
    lowest = data.min()
    span = data.max() - lowest
    # Adding the boolean (span == 0) turns a zero span into 1 and leaves every
    # other span untouched; this works for scalars and per-column Series alike.
    safe_span = span + (span == 0)
    return (data - lowest) / safe_span
def df_to_userma(df):
    """Pivot long-format ratings into a user-by-title table.

    Args:
        df: DataFrame with ``user_id``, ``title`` and ``rating`` columns.

    Returns:
        DataFrame indexed by ``user_id`` with one column per ``title``.
        Cells hold the rating (pivot_table's default aggregation, the mean,
        is applied to repeated user/title pairs); missing pairs are NaN.
    """
    return pd.pivot_table(
        df,
        values='rating',
        index='user_id',
        columns='title',
    )
def userma_to_num(df):
    """Build a dense user-by-item rating matrix from long-format ratings.

    Ids are treated as 1-based: the rating that user ``u`` gave item ``i`` is
    stored at ``[u - 1, i - 1]``. Every cell without a rating stays 0.

    Args:
        df: DataFrame with ``user_id``, ``item_id`` and ``rating`` columns.
            Columns are read by name, so their order does not matter, and
            the frame is not modified.

    Returns:
        Float ndarray of shape ``(max user_id, max item_id)``; a ``(0, 0)``
        array when *df* has no rows.
    """
    if df.empty:
        # max() of an empty column is NaN, which cannot size an array.
        return np.zeros((0, 0))

    users = df['user_id'].to_numpy(dtype=int)
    items = df['item_id'].to_numpy(dtype=int)
    user_matrix_num = np.zeros((users.max(), items.max()))
    # One vectorized scatter replaces the per-row Python loop. For a repeated
    # (user, item) pair the last row in *df* wins.
    user_matrix_num[users - 1, items - 1] = df['rating'].to_numpy(dtype=float)
    return user_matrix_num
def userid_num(user_matrix):
    """Return the row labels (``.index``) of *user_matrix*."""
    return user_matrix.index
def title_num(user_matrix):
    """Return the column labels (``.columns``) of *user_matrix*."""
    return user_matrix.columns
def record_num(user_matrix_num):
    """Return an integer indicator matrix marking which cells hold a rating.

    A cell is 1 where the corresponding value of *user_matrix_num* is
    strictly greater than 0, and 0 everywhere else.
    """
    has_rating = user_matrix_num > 0  # boolean mask
    return np.asarray(has_rating).astype(int)
def data_handling(user_matrix):
    """Step 1: compute the row-to-row Pearson correlation matrix.

    Args:
        user_matrix: DataFrame whose rows are the entities to compare.

    Returns:
        Square ndarray where entry ``[a, b]`` is the Pearson correlation
        between rows ``a`` and ``b`` of *user_matrix*. Undefined
        correlations (NaN) are replaced by 0.
    """
    # corr() correlates columns, so transpose first to compare rows.
    return (
        user_matrix.transpose()
        .corr(method='pearson')
        .fillna(0)
        .to_numpy()
    )
def predict(ratings, similarity):
    """Step 2: predict a score for every (row, column) pair of *ratings*.

    For row ``u`` and column ``i``::

        pred[u, i] = mean[u]
                     + sum_v(similarity[u, v] * (ratings[v, i] - mean[v]))
                       / sum_v(|similarity[u, v]|)

    where ``mean[u]`` is the average of the strictly positive entries of
    row ``u`` (entries <= 0 count as "no rating").

    Args:
        ratings: 2-D array, one row per user and one column per item, as
            implied by how ``similarity.dot(...)`` is applied.
        similarity: Square 2-D array with one row/column per row of
            *ratings*.

    Returns:
        Float ndarray with the same shape as *ratings*. Rows with no positive
        rating get a mean of 0, and rows whose similarity weights sum to 0
        get no neighbour adjustment, so the result contains no NaN/inf from
        division by zero.
    """
    ratings = np.asarray(ratings, dtype=float)
    similarity = np.asarray(similarity, dtype=float)

    rated = ratings > 0
    rated_count = rated.sum(axis=1, keepdims=True)
    rated_total = (ratings * rated).sum(axis=1, keepdims=True)
    # Per-row mean over rated cells only; rows without ratings keep 0 instead
    # of the NaN that averaging an empty slice would give.
    mean_user_rating = np.divide(
        rated_total,
        rated_count,
        out=np.zeros_like(rated_total),
        where=rated_count > 0,
    )

    # NOTE(review): unrated cells (0) are also centred here, so they contribute
    # ``-mean`` to the weighted sum, exactly as in the original formula.
    ratings_diff = ratings - mean_user_rating

    # Row u of the dot product is weighted by similarity[u, :], so the
    # normalizer must be the row sum (axis=1). Column sums only coincide with
    # it when the similarity matrix is symmetric.
    weight_total = np.abs(similarity).sum(axis=1, keepdims=True)
    weighted_diff = similarity.dot(ratings_diff)
    adjustment = np.divide(
        weighted_diff,
        weight_total,
        out=np.zeros_like(weighted_diff),
        where=weight_total > 0,
    )
    return mean_user_rating + adjustment
import heapq
def main():
    """Interactively print the ten highest-predicted movies for a user.

    Scales the ``rating`` column of the module-level ``df1`` to [0, 1],
    builds the numeric user-by-item matrix, computes the user-user Pearson
    similarities and the prediction matrix, then repeatedly prompts for a
    user id. Entering 0 ends the loop.

    Movies the user has already rated are not filtered out of the list.
    """
    # NOTE(review): min-max scaling maps the lowest rating to 0, which
    # record_num/predict treat as "no rating"; confirm this is intended.
    df1['rating'] = MinMaxScale(df1['rating'])
    user_matrix_num = userma_to_num(df1)
    user_pearson_num = data_handling(pd.DataFrame(user_matrix_num))
    user_prediction = predict(user_matrix_num, user_pearson_num)

    # Column j of the numeric matrix is item_id j + 1, so titles are looked up
    # by item id. The alphabetically sorted pivot-table columns do not line up
    # with those positions.
    title_by_item = df1.drop_duplicates('item_id').set_index('item_id')['title']
    user_count, item_count = user_prediction.shape
    n = 10

    while True:
        userid = _read_user_id(user_count)
        if userid == 0:
            print("感谢使用")
            break
        # Row userid - 1 holds this user's predicted score for every item.
        scores = user_prediction[userid - 1]
        top_items = heapq.nlargest(n, range(item_count), scores.take)
        print('已为用户id为{}用户推荐以下十部电影:'.format(userid))
        for rank, item_index in enumerate(top_items, start=1):
            item_id = item_index + 1
            # Fall back to the raw item id for items that have no title row.
            movie = title_by_item.get(item_id, item_id)
            print('{}.{},推荐值为:{}'.format(rank, movie, scores[item_index]))
        print()


def _read_user_id(max_user_id):
    """Prompt until an integer in ``[0, max_user_id]`` is entered; return it."""
    prompt = "请输入用户id(如:3):"
    while True:
        try:
            userid = int(input(prompt))
        except ValueError:
            # Non-numeric input is handled like an unknown id: ask again.
            userid = -1
        if 0 <= userid <= max_user_id:
            return userid
        prompt = "抱歉您输入的用户id不存在:"
# Script entry point: run the interactive recommendation loop.
main()
# print(f"a={a}")  # Pearson correlation measures the similarity between vectors