【Data Algorithms: Recipes for Scaling Up with Hadoop and Spark】Chapter 10 Content-Based Recommendation

本文介绍了一个基于 Apache Spark 的内容推荐系统实现:通过 SparkContext 加载用户电影评分数据并进行预处理,对用户的电影评分数据进行分析。文章详细展示了如何计算不同电影之间的相似度,包括皮尔逊相关系数、余弦相似度和杰卡德相似度;这些相似度可用于为用户推荐可能感兴趣的电影。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

Scala 版本算法:

package com.bbw5.dataalgorithms.spark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Computes pairwise movie similarities from a user/movie/rating text file with Spark.
 *
 * Expected input (usermovieratings.txt), one rating per line, whitespace separated:
 * {{{
 * <user> <movie> <rating>
 * User1 Movie1 1
 * User1 Movie2 2
 * User1 Movie3 3
 * User2 Movie1 2
 * User2 Movie2 3
 * User2 Movie3 3
 * User2 Movie5 5
 * }}}
 *
 * For every pair of movies rated by the same user the job prints
 * `((movie1, movie2), (pearson, cosine, jaccard))`, preceded by the intermediate
 * data sets it was derived from.
 *
 * author:baibaiw5
 */
object SparkContentBasedRecommend {

  /** Input file used when no path is passed on the command line. */
  private val DefaultInputPath = "G:/temp/data/usermovieratings.txt"

  /**
   * Parses one input line into `(movieId, (userId, rating))`.
   *
   * @param line raw text line, expected to hold exactly three whitespace-separated fields
   * @return the parsed record, or `None` when the line does not have three fields or the
   *         rating is not a number (such lines are skipped instead of failing the job)
   */
  def parseRating(line: String): Option[(String, (String, Double))] =
    line.trim.split("\\s+") match {
      case Array(userId, movieId, rating) =>
        scala.util.Try(rating.toDouble).toOption.map(value => (movieId, (userId, value)))
      case _ => None
    }

  /**
   * Pearson correlation of two rating vectors, computed from their running sums.
   *
   * @param groupSize     number of users who rated both movies
   * @param rating1Sum    sum of the ratings of the first movie
   * @param rating2Sum    sum of the ratings of the second movie
   * @param dotProduct    sum of the per-user rating products
   * @param rating1NormSq sum of the squared ratings of the first movie
   * @param rating2NormSq sum of the squared ratings of the second movie
   * @return the correlation, or 0 when the denominator is not strictly positive
   */
  def calculatePearsonCorrelation(groupSize: Double,
                                  rating1Sum: Double,
                                  rating2Sum: Double,
                                  dotProduct: Double,
                                  rating1NormSq: Double,
                                  rating2NormSq: Double): Double = {
    val numerator = groupSize * dotProduct - rating1Sum * rating2Sum
    val denominator = Math.sqrt(groupSize * rating1NormSq - rating1Sum * rating1Sum) *
      Math.sqrt(groupSize * rating2NormSq - rating2Sum * rating2Sum)
    // A NaN denominator also fails this comparison and yields 0.
    if (denominator > 0) numerator / denominator else 0.0
  }

  /**
   * Cosine similarity of two rating vectors.
   *
   * @param dotProduct  sum of the per-user rating products
   * @param rating1Norm Euclidean norm of the first rating vector
   * @param rating2Norm Euclidean norm of the second rating vector
   * @return the similarity, or 0 when either norm is zero (same convention as the
   *         Pearson helper, instead of returning NaN)
   */
  def calculateCosineCorrelation(dotProduct: Double,
                                 rating1Norm: Double,
                                 rating2Norm: Double): Double = {
    val denominator = rating1Norm * rating2Norm
    if (denominator > 0) dotProduct / denominator else 0.0
  }

  /**
   * Jaccard similarity of the two movies' rater sets.
   *
   * @param groupSize         number of users who rated both movies (intersection size)
   * @param maxNumOfumRaters1 number of raters of the first movie
   * @param maxNumOfumRaters2 number of raters of the second movie
   * @return intersection size divided by union size, or 0 when the union is empty
   */
  def calculateJaccardCorrelation(groupSize: Double, maxNumOfumRaters1: Double, maxNumOfumRaters2: Double): Double = {
    val union = maxNumOfumRaters1 + maxNumOfumRaters2 - groupSize
    if (union > 0) groupSize / union else 0.0
  }

  /**
   * Runs the similarity job and prints every intermediate and final data set.
   *
   * @param args optional; `args(0)` is the input file path. When absent the
   *             historical default path is used.
   */
  def main(args: Array[String]): Unit = {
    val inputPath = args.headOption.getOrElse(DefaultInputPath)
    val sparkConf = new SparkConf().setAppName("SparkContentBasedRecommend")
    val sc = new SparkContext(sparkConf)

    try {
      // User Movie Rating -> "User1 Movie1 1"
      val texts = sc.textFile(inputPath)

      // Attach the number of raters of each movie to every rating:
      // (userId, (movieId, rating, numOfRaters)), e.g. (User1,(Movie1,1.0,2))
      val data = texts.flatMap(line => parseRating(line).iterator).groupByKey().flatMap {
        case (movieId, userRatings) =>
          val numOfRaters = userRatings.size
          userRatings.map { case (userId, rating) => (userId, (movieId, rating, numOfRaters)) }
      }

      data.collect().foreach(println)

      // One record per (movie1, movie2) pair and user; only users who rated both
      // movies contribute. Sorting first makes the pair key order deterministic.
      val data2 = data.groupByKey().flatMap {
        case (_, movieRatings) =>
          movieRatings.toList.sorted.combinations(2).collect {
            case Seq(m1, m2) =>
              // (m1.movieId, m2.movieId)
              val key = (m1._1, m2._1)
              // (m1.rating, m1.numOfRaters, m2.rating, m2.numOfRaters,
              //  ratingProduct, rating1Squared, rating2Squared)
              val value = (m1._2, m1._3, m2._2, m2._3, m1._2 * m2._2, m1._2 * m1._2, m2._2 * m2._2)
              (key, value)
          }
      }

      // another way using join
      //val data4Join = data.join(data).map { case (a, b) => b }.filter(c => c._1._1 < c._2._1)

      data2.collect().foreach(println)

      // Built once and reused below so the pair shuffle is not defined twice.
      val grouped = data2.groupByKey()

      grouped.collect().foreach(println)

      val data3 = grouped.mapValues { values =>
        val groupSize = values.size // length of each vector
        val rating1Sum = values.map(_._1).sum // sum of rating1
        val maxNumOfumRaters1 = values.map(_._2).max // max of numOfRaters1
        val rating2Sum = values.map(_._3).sum // sum of rating2
        val maxNumOfumRaters2 = values.map(_._4).max // max of numOfRaters2
        val dotProduct = values.map(_._5).sum // sum of ratingProd
        val rating1NormSq = values.map(_._6).sum // sum of rating1Squared
        val rating2NormSq = values.map(_._7).sum // sum of rating2Squared

        val pearson = calculatePearsonCorrelation(groupSize, rating1Sum, rating2Sum, dotProduct, rating1NormSq, rating2NormSq)
        val cosin = calculateCosineCorrelation(dotProduct, Math.sqrt(rating1NormSq), Math.sqrt(rating2NormSq))
        val jaccard = calculateJaccardCorrelation(groupSize, maxNumOfumRaters1, maxNumOfumRaters2)
        (pearson, cosin, jaccard)
      }

      data3.collect().foreach(println)
    } finally {
      // Release the driver's resources even when a stage fails.
      sc.stop()
    }
  }
}


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值