Spark Custom Accumulators

package sparkCore.accumulator

import org.apache.spark.sql.SparkSession

object AccumulatorV1 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName(this.getClass.getName)
      .master("local[*]")
      .getOrCreate()

    val sc = spark.sparkContext

    val rdd01 = sc.makeRDD(List(1, 2, 3, 4, 5))

    var sum = 0 // defined on the Driver side

    // The function passed to the foreach operator runs on the Executor side
    rdd01.foreach {
      x => {
        println(s"Executor... x=$x sum=$sum")
        sum += x
      }
    }

    println(s"Driver... sum=$sum")
    sc.stop()

    /**
      * Output:
      * Executor... x=1 sum=0
      * Executor... x=4 sum=0
      * Executor... x=2 sum=0
      * Executor... x=3 sum=0
      * Executor... x=5 sum=0
      * Driver... sum=0
      **/

  }
}
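
The output shows that sum stays 0 on the Driver: the closure passed to foreach is serialized and shipped to every task, so each Executor increments its own local copy and nothing is ever sent back. If all you want is the total, a regular action already returns its result to the Driver. A minimal sketch, reusing the rdd01 above (the name total is just for illustration):

// reduce merges per-partition partial sums and returns the result to the Driver
val total = rdd01.reduce(_ + _)
println(s"Driver... total=$total") // 15

Accumulators, shown next, cover the more general case of collecting side statistics without changing the job's main result.
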
package sparkCore.accumulator

import org.apache.spark.sql.SparkSession

object AccumulatorV2 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName(this.getClass.getName)
      .master("local[*]")
      .getOrCreate()

    val sc = spark.sparkContext

    val rdd01 = sc.makeRDD(List(1, 2, 3, 4, 5))

    // Accumulator: a distributed, shared, write-only variable
    val sumAcc = sc.longAccumulator("sumAcc")

    rdd01.foreach(sumAcc.add(_)) // updates run on the Executors; Spark merges them back to the Driver

    println(sumAcc.value) //15

    sc.stop()
  }
}
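
Besides longAccumulator, SparkContext also provides doubleAccumulator and collectionAccumulator. A small sketch, reusing the sc and rdd01 above (the names dAcc and listAcc are made up for illustration):

// Built-in accumulator variants
val dAcc = sc.doubleAccumulator("dAcc")                   // sums Double values
val listAcc = sc.collectionAccumulator[String]("listAcc") // collects elements into a list

rdd01.foreach { x =>
  dAcc.add(x * 0.5)
  listAcc.add(s"item-$x")
}

println(dAcc.value)    // 7.5
println(listAcc.value) // element order is not guaranteed
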
package sparkCore.accumulator

import org.apache.spark.sql.SparkSession

object AccumulatorV3 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName(this.getClass.getName)
      .master("local[*]")
      .getOrCreate()

    val sc = spark.sparkContext

    val rdd01 = sc.makeRDD(List(1, 2, 3, 4, 5))

    // Accumulator: a distributed, shared, write-only variable
    val sumAcc = sc.longAccumulator("sumAcc")

    val rdd02 = rdd01.map {
      x => {
        sumAcc.add(x)
        x
      }
    }

    // Accumulators should generally be updated inside action operators;
    // updating them in a transformation can lead to duplicated accumulation
    rdd02.collect
    rdd02.collect
    println(sumAcc.value) // each collect triggers a job that re-runs the map, so the accumulator is updated twice and prints 30

    sc.stop()
  }
}
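
If the accumulator update has to stay inside a transformation, one common way to avoid the double count is to cache the RDD so the map only runs once. A sketch (valid as long as the cached partitions are not evicted):

// With rdd02 cached, the second collect reads the cached data and does not
// re-run the map, so the accumulator is updated only once
rdd02.cache()
rdd02.collect
rdd02.collect
println(sumAcc.value) // 15

The safer habit is still the one in AccumulatorV2: do the accumulation inside an action such as foreach.
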
package sparkCore.accumulator

import org.apache.spark.sql.SparkSession
import org.apache.spark.util.AccumulatorV2
import scala.collection.mutable

object AccumulatorV4 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName(this.getClass.getName)
      .master("local[*]")
      .getOrCreate()

    val sc = spark.sparkContext

    val rdd01 = sc.makeRDD(List("hello", "java", "java", "scala", "scala", "scala"))

    // Custom accumulator for word count -- see the implementation of longAccumulator for reference
    val myAcc = new MyAccumulator
    sc.register(myAcc, "myAcc")

    rdd01.foreach {
      word => {
        myAcc.add(word)
      }
    }

    val wordcount = myAcc.value
    println(wordcount) //Map(scala -> 3, java -> 2, hello -> 1)

    sc.stop()
  }
}

/**
  * Custom accumulator implementing wordcount.
  * The accumulator approach involves no shuffle stage.
  **/

class MyAccumulator extends AccumulatorV2[String, mutable.Map[String, Long]] {
  private val map = mutable.Map[String, Long]()

  // Whether the accumulator is in its zero (initial) state
  override def isZero: Boolean = map.isEmpty

  // Copy the accumulator, carrying the current value over to the new instance
  override def copy(): AccumulatorV2[String, mutable.Map[String, Long]] = {
    val newAcc = new MyAccumulator
    newAcc.map ++= map
    newAcc
  }

  // Reset the accumulator to its zero state
  override def reset(): Unit = map.clear()

  // Add one word on the Executor side
  override def add(word: String): Unit = {
    val count = map.getOrElse(word, 0L) + 1
    map.update(word, count)
  }

  // Merge the partial results from task-side accumulators on the Driver
  override def merge(other: AccumulatorV2[String, mutable.Map[String, Long]]): Unit = {
    val map2: mutable.Map[String, Long] = other.value
    map2.foreach {
      case (word, cnt) => {
        val count = map.getOrElse(word, 0L) + cnt
        map.update(word, count)
      }
    }
  }

  override def value: mutable.Map[String, Long] = map
}
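
For contrast, the classic wordcount goes through reduceByKey, which introduces a shuffle stage; the custom accumulator above merges all partial maps on the Driver without one. A sketch, using the same rdd01 of words as in AccumulatorV4:

// reduceByKey repartitions data by key, i.e. a shuffle
val wc = rdd01.map((_, 1L)).reduceByKey(_ + _).collect()
// Array((hello,1), (java,2), (scala,3)) -- ordering may vary

This works well when the number of distinct words is small; for very large key spaces the accumulator approach does not scale, since the whole map ends up in Driver memory.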