spark-TestAggregateMessages-聚合消息

最新推荐文章于 2020-09-03 17:39:10 发布

原创 · 最新推荐文章于 2020-09-03 17:39:10 发布 · 302 阅读

CC 4.0 BY-SA版权

本文深入探讨了使用Apache Spark处理大规模图数据的技巧，通过具体示例展示了如何利用Spark GraphX库实现图算法，如计算顶点间的最远距离。文章详细解释了RDD的工作原理，包括Transformation和Action的区别，以及它们在图算法中的应用。

import java.io.PrintWriter

import grizzled.slf4j.Logger
import org.apache.spark.graphx._
import org.apache.spark.graphx.util.GraphGenerators
import org.apache.spark.sql.SparkSession
import org.redblue.demo.TestGraph.toGexf

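// Algorithm: for each vertex, compute the distance to its farthest root vertex.
// Key points:
// RDDs are lazily evaluated: calling an RDD operation only looks like it runs immediately; it actually does not.
// The RDD API has two kinds of functions: transformations and actions.
// Transformations are lazy: calling one only records it in the queue of pending operations; nothing is executed yet.
// When an action is called, Spark traces back from that action through the queued transformations
// up to the original source (usually reading a data source), then actually executes the transformations in order,
// all the way down to the action, and produces the result.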
/*
Transformation的官方文档方法集合如下：
map
filter
flatMap
mapPartitions
mapPartitionsWithIndex
sample
union
intersection
distinct
groupByKey
reduceByKey
aggregateByKey
sortByKey
join
cogroup
cartesian
pipe
coalesce
repartition
repartitionAndSortWithinPartitions
Action的官方文档方法集合如下：
reduce
collect
count
first
take
takeSample
takeOrdered
saveAsTextFile
saveAsSequenceFile
saveAsObjectFile
countByKey
foreach
*/

/**
 * Demo of GraphX `aggregateMessages`.
 *
 * Builds a small sample graph and, by repeatedly sending messages along the
 * edges, computes for every vertex the length of the longest path that leads
 * to it from a vertex that receives no messages (a "root").
 */
object TestAggregateMessages {

  val log = Logger(this.getClass)

  /**
   * Runs the demo on a local Spark session and prints the trace and the result
   * to standard output.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder
      .master("local[*]")
      .appName("TestGraph")
      .config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
      .config("spark.hadoop.validateOutputSpecs", false)
      .getOrCreate()

    // Always stop the session, even when the job fails half-way.
    try {
      val sc = sparkSession.sparkContext

      val vertices = sc.makeRDD(Array((1L, "Ann"), (2L, "Bill"), (3L, "Charles"), (4L, "Diane")
        , (5L, "Went to gym this morning")))
      val edges = sc.makeRDD(Array(Edge(1L, 2L, "is-friends-with"), Edge(2L, 3L, "is-friends-with")
        , Edge(3L, 4L, "is-friends-with"), Edge(4L, 5L, "Likes-status")
        , Edge(3L, 5L, "Worse-status")))
      val graph = Graph(vertices, edges)

      // Sends, along every edge, the source vertex's current distance + 1 to the destination.
      def sendMsg(ec: EdgeContext[Int, String, Int]): Unit = {
        ec.sendToDst(ec.srcAttr + 1)
        // Explicit tuple: same output as before, without relying on argument auto-tupling.
        println(("sendMsg: ", /*ec.srcId, ec.srcAttr,*/ ec.dstId, ec.dstAttr, ec.srcAttr + 1))
      }

      // Merges two messages addressed to the same vertex, keeping the larger distance.
      def mergeMsg(a: Int, b: Int): Int = {
        println(("mergeMsg: ", a, b))
        math.max(a, b)
      }

      // One round: every edge sends a message to its destination vertex, and each
      // destination merges what it received, keeping the largest distance.
      // The recursion stops when a round no longer changes the graph.
      //   vertex 1 receives no message, so its distance stays 0
      //   vertex 2 always receives 1 (vertex 1's distance 0, plus 1)
      //   vertex 3 first receives 1 (vertex 2's distance was 0 at that time, plus 1),
      //            then receives 2 (vertex 2's distance is now 1, plus 1)
      //   vertices 4 and 5 follow the same pattern.
      // Final result for the sample graph: (1,0) (2,1) (3,2) (4,3) (5,4)
      //
      // NOTE(review): on a graph with a cycle the distances keep growing and the
      // check below never reaches 0, so this assumes an acyclic graph.
      @scala.annotation.tailrec
      def propagateEdge(g: Graph[Int, String]): Graph[Int, String] = {
        val received = g.aggregateMessages[Int](sendMsg, mergeMsg)
        println("g.vertices && g.edges")
        g.vertices.collect().foreach(println)
        g.edges.collect().foreach(println)
        // Cache the new graph: it is read by the check below and again by the next
        // round; without caching every round would recompute all previous rounds.
        val g2 = Graph(received, g.edges).cache()
        //g2.vertices.collect().foreach(println)
        //g2.edges.collect().foreach(println)
        // Sum of (new distance - old distance) over all vertices present in both graphs.
        // fold(0) instead of reduce so that an empty join (e.g. a graph without
        // edges) yields 0 rather than throwing on an empty collection.
        val check = g2.vertices.join(g.vertices)
          .map { case (_, (updated, previous)) => updated - previous }
          .fold(0)(_ + _)
        //g2.vertices.join(g.vertices).collect().foreach(println)
        if (check > 0) {
          println("propagateEdge(g2)")
          propagateEdge(g2)
        } else {
          g
        }
      }

      // Start with every vertex attribute set to 0.
      println("init")
      val initialGraph = graph.mapVertices((_, _) => 0)
      //initialGraph.vertices.collect().foreach(println)
      //initialGraph.edges.collect().foreach(println)
      println("propagateEdge")
      val g = propagateEdge(initialGraph)
      println("println")
      g.vertices.collect().foreach(println)
    } finally {
      sparkSession.stop()
    }
  }
}