Spark实现集体智慧编程第二章电影推荐

最新推荐文章于 2025-04-28 22:28:35 发布

古巴与八股

最新推荐文章于 2025-04-28 22:28:35 发布

阅读量538

点赞数

CC 4.0 BY-SA版权

分类专栏： Spark 文章标签： spark

本文链接：https://blog.youkuaiyun.com/xuedingkai/article/details/78586067

Spark 专栏收录该内容

4 篇文章

订阅专栏

集体智慧编程中的电影推荐算法主要分两步：

1. 通过影评者对共同看过的电影的评分，计算影评者两两之间的兴趣相似度（皮尔逊相关系数）

2. 根据影评者之间的相似度和其他影评者对电影的评分，为每位影评者没看过的电影计算推荐指数；推荐指数是以相似度为权重的评分加权平均，即 Σ(相似度×电影评分) / Σ相似度（只使用相似度大于 0 的影评者）

具体细节还是从代码中体会比较好：

package PCI2

import org.apache.spark.sql.SparkSession

/**
 * User-based collaborative filtering over a tab-separated ratings file
 * (`person<TAB>movie<TAB>rating` per line).
 *
 * Step 1 computes the Pearson correlation between every pair of reviewers from
 * the movies both have rated. Step 2 scores, for each reviewer, the movies they
 * have not rated as the similarity-weighted average of the ratings given by
 * positively-correlated reviewers.
 */
object Recomment {

  /** Ratings file read when no path is passed on the command line. */
  private val DefaultRatingsPath = "file:///home/xdk/file/movie.txt"

  /**
   * Runs the recommendation pipeline and prints one line per reviewer with the
   * recommended `(movie, score)` pairs.
   *
   * @param args optional; `args(0)` overrides the input path, otherwise
   *             [[DefaultRatingsPath]] is used
   */
  def main(args: Array[String]): Unit = {
    val ratingsPath = args.headOption.getOrElse(DefaultRatingsPath)
    val spark = SparkSession.builder().appName("recomment").master("local[2]").getOrCreate()
    try {
      // (person, (movie, rating)); blank or incomplete lines are skipped instead of
      // failing the job with an ArrayIndexOutOfBoundsException.
      val keyPeople = spark.read.textFile(ratingsPath).rdd
        .map(_.split("\t"))
        .collect { case Array(person, movie, rating, _*) => (person, (movie, rating.trim)) }

      // (movie, (person, rating))
      val keyMovie = keyPeople.map { case (person, (movie, rating)) => (movie, (person, rating)) }

      // Self-join on movie: every ordered pair of distinct reviewers who rated the same movie.
      val filtered = keyMovie.join(keyMovie)
        .filter { case (_, ((p1, _), (p2, _))) => p1 != p2 }

      // Normalise each pair so the lexicographically smaller name comes first; both
      // orderings produced by the self-join then collapse into one record.
      val peoplePair = filtered.map { case (movie, ((p1, r1), (p2, r2))) =>
        if (p1 < p2) ((p1, p2), (movie, r1, r2)) else ((p2, p1), (movie, r2, r1))
      }

      // Pairwise similarity between reviewers.
      val sim = peoplePair.distinct().groupByKey()
        .map { case (pair, shared) => (pair, pearson(shared)) }

      // Only positively-correlated reviewers contribute to recommendations.
      val simulator = sim.filter { case (_, score) => score > 0 }

      // ((target, other), (movie, other's rating)) for movies BOTH have rated.
      val common = filtered.map { case (movie, ((p1, _), (p2, r2))) => ((p1, p2), (movie, r2)) }

      // ((target, other), (movie, other's rating)) for EVERY movie `other` has rated.
      // Pairing distinct reviewers with the rating records yields the same set as a
      // records x records cartesian product followed by distinct, with far fewer rows.
      val people = keyPeople.keys.distinct()
      val hisMovie = people.cartesian(keyPeople)
        .filter { case (target, (other, _)) => target != other }
        .map { case (target, (other, movieRating)) => ((target, other), movieRating) }
        .distinct()

      // What remains are the movies `other` rated that `target` has not.
      val recomm = hisMovie.subtract(common)

      // Similarities are keyed by the ordered pair; emit both directions for the join.
      val simulator1 = simulator.flatMap { case ((a, b), score) =>
        Seq(((a, b), score), ((b, a), score))
      }

      // ((target, movie), (similarity * rating, similarity))
      val recommScore = recomm.join(simulator1).map {
        case ((target, _), ((movie, rating), score)) =>
          ((target, movie), (rating.toDouble * score, score))
      }

      // Weighted average per (target, movie): sum(sim * rating) / sum(sim).
      // The denominator is positive because only similarities > 0 were kept.
      val recommMovies = recommScore
        .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2))
        .map { case ((target, movie), (weighted, simSum)) => (target, (movie, weighted / simSum)) }
        .groupByKey()

      println("======================")
      // NOTE: foreach runs inside the tasks; with the local master configured above
      // the output is expected on this process's console.
      recommMovies.foreach(println)
    } finally {
      // Release the session even when a stage fails.
      spark.stop()
    }
  }

  /**
   * Pearson correlation coefficient between two reviewers' ratings.
   *
   * @param arg one `(movie, ratingOfFirst, ratingOfSecond)` triple per movie rated
   *            by both reviewers; the ratings are decimal strings
   * @return the correlation, or 0 when it is undefined (no shared movies, or
   *         either reviewer's ratings have zero variance)
   * @throws NumberFormatException if a rating string is not a valid number
   */
  def pearson(arg: Iterable[(String, String, String)]): Double = {
    // Single pass: count plus the five running sums the closed-form formula needs.
    val (n, sumX, sumY, sumXX, sumYY, sumXY) =
      arg.foldLeft((0, 0.0, 0.0, 0.0, 0.0, 0.0)) {
        case ((count, sx, sy, sxx, syy, sxy), (_, first, second)) =>
          val x = first.toDouble
          val y = second.toDouble
          (count + 1, sx + x, sy + y, sxx + x * x, syy + y * y, sxy + x * y)
      }

    val num = sumXY * n - sumX * sumY
    // Rounding can push a mathematically-zero variance term slightly below zero;
    // clamp it so sqrt yields 0 rather than NaN.
    val varX = math.max(0.0, n * sumXX - sumX * sumX)
    val varY = math.max(0.0, n * sumYY - sumY * sumY)
    val den = math.sqrt(varX) * math.sqrt(varY)

    if (den == 0) 0.0 else num / den
  }

}

测试数据：

Lisa Rose	The Night Listener	 3.0
Lisa Rose	You, Me and Dupree	 2.5
Lisa Rose	Lady in the Water	 2.5
Lisa Rose	Superman Returns	 3.5
Lisa Rose	Just My Luck	 3.0
Lisa Rose	Snakes on a Plane	 3.5
Michael Phillips	Lady in the Water	 2.5
Michael Phillips	Superman Returns	 3.5
Michael Phillips	The Night Listener	 4.0
Michael Phillips	Snakes on a Plane	 3.0
Mick LaSalle	The Night Listener	 3.0
Mick LaSalle	You, Me and Dupree	 2.0
Mick LaSalle	Lady in the Water	 3.0
Mick LaSalle	Superman Returns	 3.0
Mick LaSalle	Just My Luck	 2.0
Mick LaSalle	Snakes on a Plane	 4.0
Toby	Superman Returns	 4.0
Toby	You, Me and Dupree	 1.0
Toby	Snakes on a Plane	 4.5
Gene Seymour	The Night Listener	 3.0
Gene Seymour	You, Me and Dupree	 3.5
Gene Seymour	Lady in the Water	 3.0
Gene Seymour	Superman Returns	 5.0
Gene Seymour	Just My Luck	 1.5
Gene Seymour	Snakes on a Plane	 3.5
Jack Matthews	Lady in the Water	 3.0
Jack Matthews	Superman Returns	 5.0
Jack Matthews	The Night Listener	 3.0
Jack Matthews	You, Me and Dupree	 3.5
Jack Matthews	Snakes on a Plane	 4.0
Claudia Puig	Superman Returns	 4.0
Claudia Puig	Just My Luck	 3.0
Claudia Puig	The Night Listener	 4.5
Claudia Puig	You, Me and Dupree	 2.5
Claudia Puig	Snakes on a Plane	 3.5