Spark 的 reduceByKey 使用时注意:如果某个 key 在整个 RDD 中只出现一次,它对应的 value 会原样输出,传入 reduceByKey 的聚合函数不会被调用。
原因:reduceByKey 基于 combineByKey 实现,其中 createCombiner 相当于恒等函数;当 key 唯一时,不会触发 mergeValue / mergeCombiners 这两个合并步骤,因此 reduceByKey 的方法体不会执行。
代码实例:
/**
 * Demonstrates `combineByKey` semantics: for the key that appears three
 * times the combiner functions run and tag the intermediate results with
 * "-", "@" and "$"; for the key that appears once only `createCombiner`
 * runs, so its value passes through essentially untouched.
 */
object Test {
  def main(args: Array[String]): Unit = {
    // NOTE(review): a real job would set app name / master on the conf
    // (e.g. setAppName/setMaster); kept bare to match the original demo.
    val conf = new SparkConf()
    val sc = new SparkContext(conf)
    try {
      // Three records share key ("10001085", "51"); key ("10001085", "01")
      // is unique, so its combiner output never goes through a merge step.
      val rdd = sc.parallelize(
        List(
          (("10001085", "51"), List(6, 5, 4)),
          (("10001085", "51"), List(5, 4, 3)),
          (("10001085", "51"), List(4, 3, 2)),
          (("10001085", "01"), List(3, 2, 1))))
      val ret = rdd.combineByKey(
        // createCombiner: first value for a key in a partition -> "sum-"
        (v: List[Int]) => s"${v.sum}-",
        // mergeValue: later value for the same key in the same partition
        // -> "sum@previousCombiner"
        (c: String, v: List[Int]) => s"${v.sum}@$c",
        // mergeCombiners: combine per-partition results -> "c1$c2"
        (c1: String, c2: String) => c1 + "$" + c2
      )
      // Print each (key, value) pair. The original used
      // `println(a._1, a._2)`, which relies on deprecated argument
      // auto-tupling; this interpolation produces the same "(key,value)"
      // output explicitly.
      ret.collect().foreach(a => println(s"(${a._1},${a._2})"))
    } finally {
      // Always release cluster resources, even if the job throws.
      sc.stop()
    }
  }
}