package com.lyzx.day16
import org.apache.spark.{SparkContext, SparkConf}
class T2 {
  /**
   * reduceByKey = groupByKey + reduce
   * Functionally it is equivalent to doing groupByKey first and then a reduce,
   * but it is more efficient than groupByKey + reduce.
   * For a key-value RDD, it applies the reduce function to all values that share the same key.
   *
   * Because reduceByKey conceptually groups by key before reducing,
   * it involves a shuffle (just as groupByKey does).
   *
   * As noted above:
   * Functionality: reduceByKey = groupByKey + reduce
   * Efficiency:    reduceByKey > groupByKey + reduce
   * The reason is that reduceByKey runs a combiner on the map side.
   * For example, in a word count job the map side might hold
   * [("java",(1,1,1,1)),("c",(1,1,1,1))]
   * After the map-side combine this becomes [("java",4),("c",4)],
   * so the fetch on the reduce side is much cheaper.
   * (f2 below sketches the groupByKey + sum version for comparison.)
   */
  def f1(sc: SparkContext): Unit = {
    val arr = List(1, 2, 3, 4, 5, 5, 4, 3, 2, 1)
    // rdd contains: [1, 2, 3, 4, 5, 5, 4, 3, 2, 1]
    val rdd = sc.parallelize(arr)
    // mapRdd contains: [(1,10), (2,20), (3,30), ..., (5,50), ..., (1,10)]
    val mapRdd = rdd.map(item => (item, item * 10))
    // reduceRdd contains one pair per distinct key: (1,20), (2,40), (3,60), (4,80), (5,100)
    val reduceRdd = mapRdd.reduceByKey(_ + _)
    reduceRdd.foreach(println)
  }
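
  /**
   * A minimal comparison sketch (not part of the original example): the same
   * aggregation written as groupByKey followed by a per-key sum. It produces
   * the same (key, total) pairs as reduceByKey(_ + _) above, but every value
   * is shuffled to the reduce side before summing, so there is no map-side
   * combine.
   */
  def f2(sc: SparkContext): Unit = {
    val rdd = sc.parallelize(List(1, 2, 3, 4, 5, 5, 4, 3, 2, 1))
    val mapRdd = rdd.map(item => (item, item * 10))
    // groupByKey yields (key, Iterable[value]); mapValues(_.sum) then performs the "reduce" step
    val groupRdd = mapRdd.groupByKey().mapValues(_.sum)
    groupRdd.foreach(println)
  }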
}
object T2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("myTest").setMaster("local")
    val sc = new SparkContext(conf)
    val t = new T2()
    t.f1(sc)
    sc.stop()
  }
}