from pyspark import SparkContext
import pyspark.mllib.recommendation as rd
# Initialize the Spark context: master is "local" (run on this machine), application name is "movie1"
sc = SparkContext("local", "movie1")
rawData = sc.textFile("file:///home/hadoop/pyNote/train3.dat")
# Load the MovieLens ratings file from the local filesystem (paths starting with file:// are local);
# the data could also be uploaded to HDFS, e.g. hdfs://192.168.1.101:9000/ml-100k/u.user
rawRatings = rawData.map(lambda line: line.split('::')[0:3])
# ALS needs an RDD of Rating records as input, so wrap each (user, movie, rating) triple with rd.Rating
ratings = rawRatings.map(lambda (user, movie, rating): rd.Rating(int(user), int(movie), float(rating)))
ratings.first()
# Train the ALS model: rank=50 (size of the latent factor vectors), iterations=10,
# and lambda_=0.01, the regularization parameter (alpha, the confidence constant, applies only to trainImplicit)
model = rd.ALS.train(ratings, 50, 10, 0.01)
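# Equivalent call with keyword arguments (a sketch following the
# pyspark.mllib.recommendation.ALS.train signature), making each value explicit:
# model = rd.ALS.train(ratings, rank=50, iterations=10, lambda_=0.01)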
model.userFeatures()  # RDD of (userId, latent factor array) pairs
#predictedRating = model.predict(789, 123)
#print predictedRating
#topKRecs = model.recommendProducts(789, 10)
#topKRecs
usersProducts = ratings.map(lambda r: (r.user, r.product))
# predictAll takes an RDD of (int, int) pairs as its argument; this differs from Scala, where predict is used directly
predictions = model.predictAll(usersProducts).map(lambda r: ((r.user, r.product), r.rating))
#predictions.first()
# Build an RDD keyed by (user, movie) with (actual rating, predicted rating) as the value
ratingsAndPredictions = ratings.map(lambda r: ((r.user, r.product), r.rating)).join(predictions)
ratingsAndPredictions.first()
MSE = ratingsAndPredictions.map(lambda ((user, product), (actual, predicted)): (actual - predicted) ** 2).sum() \
/ ratingsAndPredictions.count()
#print "Mean Squared Error =", MSE
import math
RMSE = math.sqrt(MSE)
print "Root Mean Squared Error =", RMSE
# Average precision at K (APK) for one user: actual is the list of items the
# user actually rated, predicted is the ranked recommendation list
def avgPrecisionK(actual, predicted, k):
    predK = predicted[0:k]
    score = 0.0
    numHits = 0.0
    for i, p in enumerate(predK):
        if p in actual:
            numHits += 1
            score += numHits / float(i + 1)
    if len(actual) == 0:
        return 1.0
    else:
        return score / float(min(len(actual), k))
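# Quick sanity check with made-up IDs: both predictions in the top 2 are hits,
# so APK = (1/1 + 2/2) / min(2, 10) = 1.0
print avgPrecisionK([1, 2], [1, 2, 5], 10)  # expected: 1.0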
import numpy as np
itemFactors = model.productFeatures().map(lambda (id, factor): factor).collect()
itemMatrix = np.array(itemFactors)
#print itemMatrix.shape
imBroadcast = sc.broadcast(itemMatrix)  # broadcast the item-factor matrix to every executor
# Score every item for each user: dot product of the item matrix with the user's factor vector
scoresForUser = model.userFeatures().map(lambda (userId, array): (userId, np.dot(imBroadcast.value, array)))
# Rank all items for each user by score; item IDs are taken to be the matrix row
# index + 1 (arange must run to scores.size + 1, otherwise zip drops the last item)
allRecs = scoresForUser.map(lambda (userId, scores):
    (userId, sorted(zip(np.arange(1, scores.size + 1), scores), key=lambda x: x[1], reverse=True))
).map(lambda (userId, sortedScores): (userId, np.array(sortedScores, dtype=int)[:, 0]))
#print allRecs.first()[0]
#print allRecs.first()[1]
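# Optional sanity check (a sketch; user 789 is an arbitrary example ID): the
# manually computed ranking should broadly agree with MLlib's own recommendations.
# print allRecs.lookup(789)[0][:10]
# print [r.product for r in model.recommendProducts(789, 10)]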
# groupByKey returns (int, ResultIterable) pairs; ResultIterable.data holds the actual values
userMovies = ratings.map(lambda r: (r.user, r.product)).groupByKey()
print userMovies.first()[0]
print userMovies.first()[1].data
# Evaluate MAP@K for several cutoffs; join once and reuse the result
predictedAndActual = allRecs.join(userMovies).cache()
numUsers = allRecs.count()
for k in [10, 30, 50, 70, 90]:
    MAPK = predictedAndActual.map(
        lambda (userId, (predicted, actual)): avgPrecisionK(actual.data, predicted, k)
    ).sum() / numUsers
    print "Mean Average Precision at K=%d:" % k, MAPK