Scala version of the algorithm:
package com.bbw5.dataalgorithms.spark
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
 * Input file usermovieratings.txt, one rating per line:
 * <user> <movie> <rating>
 * User1 Movie1 1
 * User1 Movie2 2
 * User1 Movie3 3
 * User2 Movie1 2
 * User2 Movie2 3
 * User2 Movie3 3
 * User2 Movie5 5
 *
 * @author baibaiw5
 */
object SparkContentBasedRecommend {

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SparkContentBasedRecommend")
    val sc = new SparkContext(sparkConf)

    // each input line is "<user> <movie> <rating>", e.g. "User1 Movie1 1"
    val texts = sc.textFile("G:/temp/data/usermovieratings.txt")

    // key each rating by user and attach the number of raters per movie:
    // (userId, (movieId, rating, numberOfRaters)), e.g. (User1,(Movie1,1.0,2))
    val data = texts.map { x => x.split(" ") }
      .filter { x => x.length == 3 }
      .map { x => x(1) -> (x(0) -> x(2).toDouble) } // movieId -> (userId, rating)
      .groupByKey()                                 // movieId -> all (userId, rating) pairs
      .flatMap { case (movieId, userRatings) =>
        userRatings.map { case (userId, rating) => userId -> (movieId, rating, userRatings.size) }
      }
    data.collect().foreach(println)
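    // Expected contents of `data` for the sample file above (a sanity check; ordering may vary):
    //   (User1,(Movie1,1.0,2))   (User2,(Movie1,2.0,2))
    //   (User1,(Movie2,2.0,2))   (User2,(Movie2,3.0,2))
    //   (User1,(Movie3,3.0,2))   (User2,(Movie3,3.0,2))
    //   (User2,(Movie5,5.0,1))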
    // for every user, pair up all movies that user rated and emit per-pair statistics;
    // only users who rated both movies of a pair contribute to that pair
    val data2 = data.groupByKey().flatMap {
      case (_, movieRatings) =>
        movieRatings.toList.sorted.combinations(2).map(x => (x(0), x(1))).map {
          case (m1, m2) =>
            // key: (m1.movieId, m2.movieId)
            val key = (m1._1, m2._1)
            // value: (rating1, numOfRaters1, rating2, numOfRaters2,
            //         ratingProduct, rating1Squared, rating2Squared)
            val value = (m1._2, m1._3, m2._2, m2._3, m1._2 * m2._2, m1._2 * m1._2, m2._2 * m2._2)
            (key, value)
        }
    }

    // an alternative using a self-join: keep only ordered movie pairs per user
    // val data4Join = data.join(data).map { case (a, b) => b }.filter(c => c._1._1 < c._2._1)
    data2.collect().foreach(println)
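    // A few expected entries of `data2` for the sample data (ordering may vary):
    //   ((Movie1,Movie2),(1.0,2,2.0,2,2.0,1.0,4.0))   // from User1: ratings 1 and 2
    //   ((Movie1,Movie2),(2.0,2,3.0,2,6.0,4.0,9.0))   // from User2: ratings 2 and 3
    //   ((Movie1,Movie5),(2.0,2,5.0,1,10.0,4.0,25.0)) // only User2 rated both
    //   ...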
    def calculatePearsonCorrelation(groupSize: Double,
                                    rating1Sum: Double,
                                    rating2Sum: Double,
                                    dotProduct: Double,
                                    rating1NormSq: Double,
                                    rating2NormSq: Double): Double = {
      val numerator = groupSize * dotProduct - rating1Sum * rating2Sum
      val denominator = Math.sqrt(groupSize * rating1NormSq - rating1Sum * rating1Sum) *
        Math.sqrt(groupSize * rating2NormSq - rating2Sum * rating2Sum)
      if (denominator > 0) numerator / denominator else 0.0
    }
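    // The function above is the usual sum-form of Pearson correlation over the n co-raters:
    //   r = (n*sum(x*y) - sum(x)*sum(y)) /
    //       (sqrt(n*sum(x^2) - sum(x)^2) * sqrt(n*sum(y^2) - sum(y)^2))
    // with n = groupSize, x = ratings of the first movie, y = ratings of the second movie;
    // a zero denominator (e.g. a single co-rater or constant ratings) is mapped to 0.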
    def calculateCosineCorrelation(dotProduct: Double,
                                   rating1Norm: Double,
                                   rating2Norm: Double): Double = {
      dotProduct / (rating1Norm * rating2Norm)
    }

    def calculateJaccardCorrelation(groupSize: Double,
                                    maxNumOfRaters1: Double,
                                    maxNumOfRaters2: Double): Double = {
      val union = maxNumOfRaters1 + maxNumOfRaters2 - groupSize
      groupSize / union
    }
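    // Worked example (a sanity-check sketch, values taken from the sample data above):
    // for the pair (Movie1,Movie2), two users rated both movies, with ratings (1,2) and (2,3):
    //   pearson = (2*8 - 3*5) / (sqrt(2*5 - 3*3) * sqrt(2*13 - 5*5)) = 1.0
    //   cosine  = 8 / (sqrt(5) * sqrt(13)) ≈ 0.992
    //   jaccard = 2 / (2 + 2 - 2)          = 1.0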
    data2.groupByKey().collect().foreach(println)

    // aggregate the per-user pair statistics into one similarity triple per movie pair
    val data3 = data2.groupByKey().mapValues { values =>
      val groupSize = values.size // number of users who rated both movies
      val rating1Sum = values.map(_._1).sum // sum of rating1
      val maxNumOfRaters1 = values.map(_._2).max // number of raters of movie1
      val rating2Sum = values.map(_._3).sum // sum of rating2
      val maxNumOfRaters2 = values.map(_._4).max // number of raters of movie2
      val dotProduct = values.map(_._5).sum // sum of ratingProduct
      val rating1NormSq = values.map(_._6).sum // sum of rating1Squared
      val rating2NormSq = values.map(_._7).sum // sum of rating2Squared
      val pearson = calculatePearsonCorrelation(groupSize, rating1Sum, rating2Sum, dotProduct, rating1NormSq, rating2NormSq)
      val cosine = calculateCosineCorrelation(dotProduct, Math.sqrt(rating1NormSq), Math.sqrt(rating2NormSq))
      val jaccard = calculateJaccardCorrelation(groupSize, maxNumOfRaters1, maxNumOfRaters2)
      (pearson, cosine, jaccard)
    }
    data3.collect().foreach(println)
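    // Expected `data3` output for the sample data (rounded, ordering may vary), e.g.:
    //   ((Movie1,Movie2),(1.0, 0.992, 1.0))
    //   ((Movie1,Movie5),(0.0, 1.0, 0.5))   // a single co-rater makes Pearson degenerate (0)
    //   ...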
  }
}
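// Note: the input path above is hardcoded to a local Windows path. One common way to try the
// job locally (an assumption, not part of the original post) is to add .setMaster("local[*]")
// to the SparkConf and point textFile(...) at a local copy of usermovieratings.txt, e.g.:
//   val sparkConf = new SparkConf().setAppName("SparkContentBasedRecommend").setMaster("local[*]")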