使用 Spark GraphX 将具有共同关系(共享同一标识)的数据合并并聚合

/**
  * Description: merges records that share at least one identifier and aggregates their
  * tags, using Spark GraphX connected components.
  *
  * Every input line is a space-separated list of tokens. A token without ':' is an
  * identifier; a token of the form `key:count` is a tag. Lines that (transitively) share
  * an identifier are merged into one output line, and counts of equal tag keys are summed.
  *
  * Example input:
  * {{{
  * uuid1 uuid2 uuid3 App爱奇艺:9 CD励志:8
  * uuid3 uuid4 K天行九歌:9 App爱奇艺:100
  * uuid5 uuid6 uuid7 App爱奇艺:10
  * uuid6 K琅琊榜:5
  * }}}
  * Example output (the order of lines and of identifiers within a line is not guaranteed):
  * {{{
  * uuid4 uuid1 uuid2 uuid3 K天行九歌:9 App爱奇艺:109 CD励志:8
  * uuid5 uuid6 uuid7 App爱奇艺:10 K琅琊榜:5
  * }}}
  *
  * Author: Administrator
  * Date: 2020/4/14 0014
  */
object TempObj {

  /**
    * Derives the GraphX vertex id of an identifier.
    *
    * NOTE(review): `String.hashCode` is not collision free, so two distinct identifiers can
    * map to the same vertex and be merged by mistake. Kept as in the original design.
    */
  private def vertexId(name: String): VertexId = name.hashCode.toLong

  /** A token is a tag when it contains a ':'; every other token is an identifier. */
  private def isTag(token: String): Boolean = token.indexOf(":") != -1

  /**
    * Parses a `key:count` token into a pair.
    *
    * Only the first two ':'-separated parts are used. A non-numeric count makes `toInt`
    * throw a NumberFormatException, which fails the job (unchanged from the original).
    */
  private def parseTag(token: String): (String, Int) = {
    val parts = token.split(":", -1)
    (parts(0), parts(1).toInt)
  }

  /**
    * Sums the counts of equal tag keys.
    *
    * Keys are returned in the order of their first occurrence. Calling this on an already
    * merged list returns the same list, so it is safe to apply more than once.
    */
  private def mergeTags(tags: List[(String, Int)]): List[(String, Int)] = {
    val totals: Map[String, Int] =
      tags.groupBy(_._1).map { case (key, hits) => key -> hits.map(_._2).sum }
    tags.map(_._1).distinct.map(key => key -> totals(key))
  }

  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder
      .appName(s"${this.getClass.getSimpleName}")
      .master("local[*]")
      .getOrCreate()
    try {
      val sc: SparkContext = sparkSession.sparkContext

      // Tokenize every line. Empty tokens (blank lines, trailing or repeated spaces) are
      // dropped; otherwise "" would be treated as an identifier shared by all such lines
      // and would wrongly connect unrelated records.
      val baseData: RDD[Array[String]] = sc.textFile(
        "xxx.txt", 1)
        .map(line => line.split(" ", -1).filter(_.nonEmpty))
      // Reused for both the vertex and the edge RDD.
      baseData.cache()

      // Vertices: (vertexId, (identifier, tags)).
      // Only the first identifier of a line carries that line's tags, so the tags are
      // counted once per line; the remaining identifiers carry an empty list.
      // A line without any identifier produces no vertex, so its tags are ignored.
      val uv: RDD[(VertexId, (String, List[(String, Int)]))] = baseData.flatMap { tokens =>
        val names = tokens.filterNot(isTag)
        val rawTags = tokens.filter(isTag)
        names.zipWithIndex.map { case (name, position) =>
          // Decide by position, not by value: a line that repeats its first identifier
          // must not attach the tags twice.
          val carried =
            if (position == 0) rawTags.toList.map(parseTag)
            else List.empty[(String, Int)]
          (vertexId(name), (name, carried))
        }
      }
      // Reused by the graph construction and by the join below.
      uv.cache()

      // Edges: the first identifier of a line is linked to every other identifier of
      // that line, e.g. "uuid1 uuid2 uuid3" gives uuid1->uuid2 and uuid1->uuid3.
      val ue: RDD[Edge[Int]] = baseData.flatMap { tokens =>
        tokens.filterNot(isTag).toList match {
          case first :: rest => rest.map(other => Edge(vertexId(first), vertexId(other), 0))
          case Nil => Nil
        }
      }
      ue.cache()

      // The vertex attributes stored in the graph are not used afterwards (uv may hold
      // several records per id); the graph only provides the connectivity.
      val gh: Graph[(String, List[(String, Int)]), Int] = Graph(uv, ue)

      // (vertexId, componentId): componentId is the smallest vertex id of the component.
      val vertices: VertexRDD[VertexId] = gh.connectedComponents().vertices

      // Attach the component id to every original vertex record.
      val joinRdd: RDD[(VertexId, (VertexId, (String, List[(String, Int)])))] = vertices
        .join(uv)

      // Re-key by component id; the identifier is wrapped in a Set so that merging
      // de-duplicates identifiers.
      val cmIdNamesTags: RDD[(VertexId, (Set[String], List[(String, Int)]))] = joinRdd
        .map {
          case (_, (cmId, (name, tags))) => (cmId, (Set[String](name), tags))
        }

      cmIdNamesTags
        .reduceByKey((left, right) => (left._1 ++ right._1, mergeTags(left._2 ++ right._2)))
        .map { case (_, (names, tags)) =>
          // reduceByKey never invokes its function for a key with a single record, so
          // merge once more here to aggregate repeated tag keys in that case as well.
          val rendered: List[String] = mergeTags(tags).map {
            case (key, count) => key + ":" + count
          }
          names.mkString(" ") + " " + rendered.mkString(" ")
        }
        // Prints on the executors; with master "local[*]" that is the driver JVM.
        .foreach(println(_))
    } finally {
      // Release the local Spark resources even when the job fails.
      sparkSession.stop()
    }
  }
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值