import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
# ================== 1. 数据加载与预处理 ==================
def load_data(train_path, test_path=None, columns=None):
    """Load the rating files and build the dense user x movie training matrix.

    Both files are read as tab-separated text without a header row.

    Args:
        train_path: Path to the training ratings file.
        test_path: Optional path to a test ratings file in the same format.
        columns: Names given to the first three file columns, in the order
            (user id, movie id, rating). Defaults to
            ``['userId', 'movieId', 'rating']``.

    Returns:
        A tuple ``(train_matrix, test_data)``. ``train_matrix`` is a float
        DataFrame indexed by user id with one column per movie id, both in
        order of first appearance in the training file; cells without a
        rating hold 0.0. ``test_data`` is the raw test DataFrame, or ``None``
        when ``test_path`` is not given.
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    if columns is None:
        columns = ['userId', 'movieId', 'rating']
    columns = list(columns)
    user_col, movie_col, rating_col = columns[:3]
    dtypes = {user_col: int, movie_col: int, rating_col: float}

    def read_ratings(path):
        return pd.read_csv(path, sep='\t', header=None, names=columns,
                           dtype=dtypes)

    train_data = read_ratings(train_path)

    # factorize() yields, per row, the position of its id among the unique
    # ids (in order of first appearance), so the whole matrix can be filled
    # with a single vectorized assignment instead of a per-row Python loop.
    user_codes, user_ids = pd.factorize(train_data[user_col])
    movie_codes, movie_ids = pd.factorize(train_data[movie_col])
    # Allocate as float so fractional ratings are stored without any upcast.
    values = np.zeros((len(user_ids), len(movie_ids)), dtype=float)
    # For a repeated (user, movie) pair the last rating in the file is kept.
    values[user_codes, movie_codes] = train_data[rating_col].to_numpy()
    train_matrix = pd.DataFrame(values,
                                index=np.asarray(user_ids),
                                columns=np.asarray(movie_ids))

    test_data = read_ratings(test_path) if test_path else None
    return train_matrix, test_data
# ================== 2. 相似度计算 ==================
def calculate_similarity(matrix, mode='user', method='cosine'):
    """Compute a square user-user or item-item similarity matrix.

    Similarities are computed over every entry of ``matrix`` exactly as
    given; no entries are masked out.

    Args:
        matrix: Rating DataFrame with users as rows and movies as columns.
        mode: ``'user'`` to compare rows, ``'item'`` to compare columns.
        method: ``'cosine'`` or ``'pearson'``.

    Returns:
        A DataFrame whose index and columns are both the user ids
        (``mode='user'``) or the movie ids (``mode='item'``).

    Raises:
        ValueError: If ``mode`` or ``method`` is not one of the supported
            values.
    """
    # Validate up front: unsupported values previously fell through every
    # branch and failed on an unbound local variable at the return statement.
    if mode not in ('user', 'item'):
        raise ValueError(f"mode must be 'user' or 'item', got {mode!r}")
    if method not in ('cosine', 'pearson'):
        raise ValueError(
            f"method must be 'cosine' or 'pearson', got {method!r}")

    if method == 'pearson':
        # DataFrame.corr() correlates columns, so users must become columns.
        source = matrix.T if mode == 'user' else matrix
        return source.corr(method='pearson')

    # cosine_similarity() compares rows, so items must become rows.
    vectors = matrix if mode == 'user' else matrix.T
    return pd.DataFrame(
        cosine_similarity(vectors),
        index=vectors.index, columns=vectors.index
    )
# ================== 3. 评分预测 ==================
def _rated_mean(ratings):
    """Return the mean of the non-zero entries of ``ratings`` (0.0 if none)."""
    rated = ratings[ratings != 0]
    return rated.mean() if len(rated) else 0.0


def user_based_predict(target_user, target_movie, train_matrix, user_sim, k=50):
    """Predict a rating with mean-centred user-based collaborative filtering.

    The ``k`` users most similar to ``target_user`` are selected; those among
    them who rated ``target_movie`` contribute their deviation from their own
    mean rating, weighted by similarity.

    Args:
        target_user: Label of the user in ``train_matrix.index``.
        target_movie: Label of the movie in ``train_matrix.columns``.
        train_matrix: User x movie ratings, with 0 meaning "not rated".
        user_sim: Square user-user similarity DataFrame.
        k: Number of most-similar users to consider.

    Returns:
        The predicted rating, or 0 when the user already rated the movie or
        when none of the selected neighbours rated it.
    """
    if train_matrix.loc[target_user, target_movie] != 0:
        return 0  # already rated: nothing to predict

    # Exclude the target user by label rather than by position: with tied
    # similarities or a self-similarity below 1 the user is not guaranteed to
    # sort first, and slicing off element 0 would discard a real neighbour.
    # NaN similarities (e.g. Pearson on a constant row) are dropped so they
    # cannot turn the sums into NaN.
    neighbours = (
        user_sim[target_user]
        .drop(labels=target_user, errors='ignore')
        .dropna()
        .sort_values(ascending=False)
        .head(k)
    )

    weighted_sum, sim_sum = 0.0, 0.0
    for user, similarity in neighbours.items():
        rating = train_matrix.loc[user, target_movie]
        if rating == 0:
            continue
        # Centre on the mean of the movies this neighbour actually rated;
        # averaging in the 0 placeholders would drag the mean towards zero.
        weighted_sum += similarity * (rating - _rated_mean(train_matrix.loc[user]))
        sim_sum += abs(similarity)

    if sim_sum == 0:
        return 0
    return _rated_mean(train_matrix.loc[target_user]) + weighted_sum / sim_sum
def item_based_predict(target_user, target_movie, train_matrix, item_sim, k=50):
    """Predict a rating with item-based collaborative filtering.

    The ``k`` movies most similar to ``target_movie`` are selected; those
    among them that ``target_user`` has rated are averaged, weighted by
    similarity.

    Args:
        target_user: Label of the user in ``train_matrix.index``.
        target_movie: Label of the movie in ``train_matrix.columns``.
        train_matrix: User x movie ratings, with 0 meaning "not rated".
        item_sim: Square movie-movie similarity DataFrame.
        k: Number of most-similar movies to consider.

    Returns:
        The predicted rating, or 0 when the user already rated the movie or
        has rated none of the selected neighbour movies.
    """
    if train_matrix.loc[target_user, target_movie] != 0:
        return 0  # already rated: nothing to predict

    # Exclude the target movie by label rather than by position: movies with
    # tied similarity are not guaranteed to sort after the target, so slicing
    # off element 0 could discard a real neighbour. NaN similarities are
    # dropped so they cannot turn the sums into NaN.
    neighbours = (
        item_sim[target_movie]
        .drop(labels=target_movie, errors='ignore')
        .dropna()
        .sort_values(ascending=False)
        .head(k)
    )

    weighted_sum, sim_sum = 0.0, 0.0
    for movie, similarity in neighbours.items():
        rating = train_matrix.loc[target_user, movie]
        if rating == 0:
            continue
        weighted_sum += similarity * rating
        sim_sum += abs(similarity)

    return weighted_sum / sim_sum if sim_sum != 0 else 0
# ================== 4. 推荐与评估 ==================
def generate_recommendations(target_user, train_matrix, sim_matrix,
                             predict_func, top_n=10):
    """Return the ``top_n`` highest-scoring unrated movies for a user.

    Every movie whose entry in the user's row of ``train_matrix`` equals 0 is
    scored with ``predict_func(target_user, movie, train_matrix, sim_matrix)``.

    Returns:
        A list of ``(movie, predicted_score)`` tuples sorted by descending
        score, truncated to ``top_n`` entries.
    """
    user_row = train_matrix.loc[target_user]
    candidates = train_matrix.columns[user_row == 0]
    scored = [
        (movie, predict_func(target_user, movie, train_matrix, sim_matrix))
        for movie in candidates
    ]
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]
def evaluate_model(test_data, train_matrix, sim_matrix, predict_func):
    """Compute the RMSE of ``predict_func`` over the test ratings.

    Test rows whose user or movie does not appear in ``train_matrix`` are
    skipped. Every value returned by ``predict_func`` is scored as-is.

    Args:
        test_data: DataFrame with ``userId``, ``movieId`` and ``rating``
            columns.
        train_matrix: User x movie training ratings.
        sim_matrix: Similarity matrix handed through to ``predict_func``.
        predict_func: Callable
            ``(user, movie, train_matrix, sim_matrix) -> predicted rating``.

    Returns:
        The root-mean-square error between true and predicted ratings.

    Raises:
        ValueError: If no test row refers to a user and movie that are both
            present in ``train_matrix``.
    """
    squared_errors = []
    # Iterate the columns directly: DataFrame.iterrows() would upcast the
    # integer ids to float before they are used as index labels.
    rows = zip(test_data['userId'], test_data['movieId'], test_data['rating'])
    for user, movie, true_rating in rows:
        if user not in train_matrix.index or movie not in train_matrix.columns:
            continue  # user or movie unseen during training
        predicted = predict_func(user, movie, train_matrix, sim_matrix)
        squared_errors.append((true_rating - predicted) ** 2)

    if not squared_errors:
        raise ValueError(
            "no test rating refers to a user and movie present in train_matrix")
    return np.sqrt(np.mean(squared_errors))
# ================== 主程序 ==================
if __name__ == "__main__":
    # Script entry point: load the data, print top-10 recommendations for one
    # example user with both CF variants, then report RMSE on the test set.

    # Data paths
    train_path = './train_ratings.csv'  # training set
    test_path = './test_ratings.csv'  # test set (optional)
    # Load the data
    train_matrix, test_data = load_data(train_path, test_path, columns=['userId', 'movieId', 'rating'])
    # Example target user
    target_user = 1

    # ------------------ User-based collaborative filtering ------------------
    print("计算用户相似度...")
    user_sim = calculate_similarity(train_matrix, mode='user', method='cosine')
    user_recommendations = generate_recommendations(
        target_user, train_matrix, user_sim, user_based_predict, top_n=10
    )
    print(f"\n用户 {target_user} 的基于用户推荐:")
    for movie, score in user_recommendations:
        print(f"电影 {movie} \t预测评分:{score:.3f}")

    # ------------------ Item-based collaborative filtering ------------------
    print("\n计算电影相似度...")
    item_sim = calculate_similarity(train_matrix, mode='item', method='cosine')
    item_recommendations = generate_recommendations(
        target_user, train_matrix, item_sim, item_based_predict, top_n=10
    )
    print(f"\n用户 {target_user} 的基于电影推荐:")
    for movie, score in item_recommendations:
        print(f"电影 {movie} \t预测评分:{score:.3f}")

    # ------------------ Test-set evaluation (only if a test set was loaded) ------------------
    if test_data is not None:
        print("\n评估基于用户的协同过滤模型...")
        user_rmse = evaluate_model(test_data, train_matrix, user_sim, user_based_predict)
        print(f"用户协同过滤的RMSE:{user_rmse:.4f}")
        print("\n评估基于物品的协同过滤模型...")
        item_rmse = evaluate_model(test_data, train_matrix, item_sim, item_based_predict)
        # Stray prose that had been pasted onto the end of the next line (a
        # syntax error for the whole file) has been removed.
        print(f"物品协同过滤的RMSE:{item_rmse:.4f}")