spark:aggregate
/**
* Aggregate the elements of each partition, and then the results for all the partitions, using
* given combine functions and a neutral "zero value". This function can return a different result
* type, U, than the type of this RDD, T. Thus, we need one operation for merging a T into an U
* and one operation for merging two U's, as in scala.TraversableOnce. Both of these functions are
* allowed to modify and return their first argument instead of creating a new U to avoid memory
* allocation.
*
* @param zeroValue the initial value for the accumulated result of each partition for the
* `seqOp` operator, and also the initial value for the combine results from
* different partitions for the `combOp` operator - this will typically be the
* neutral element (e.g. `Nil` for list concatenation or `0` for summation)
* @param seqOp an operator used to accumulate results within a partition
* @param combOp an associative operator used to combine results from different partitions
*/
def aggregate[U: ClassTag](zeroValue: U)(seqOp: (U, T) => U, combOp: (U, U) => U): U = withScope {
// Clone the zero value since we will also be serializing it as part of tasks
var jobResult = Utils.clone(zeroValue, sc.env.serializer.newInstance())
val cleanSeqOp = sc.clean(seqOp)
val cleanCombOp = sc.clean(combOp)
val aggregatePartition = (it: Iterator[T]) => it.aggregate(zeroValue)(cleanSeqOp, cleanCombOp)
val mergeResult = (index: Int, taskResult: U) => jobResult = combOp(jobResult, taskResult)
sc.runJob(this, aggregatePartition, mergeResult)
jobResult
}
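Because aggregate's result type U can differ from the RDD's element type T, it is handy whenever a single pass has to produce a compound result. Below is a minimal sketch of that idea (separate from the example that follows; the object name AggregateSketch and the data are made up): it computes a sum and a count in one pass over a parallelized collection and then derives the average.

import org.apache.spark.{SparkConf, SparkContext}

object AggregateSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("aggregate-sketch"))
    val nums = sc.parallelize(1 to 10, numSlices = 4)   // T = Int, spread over 4 partitions

    // U = (Int, Int): a running (sum, count) pair; zeroValue = (0, 0)
    val sumCount = nums.aggregate((0, 0))(
      (acc, n) => (acc._1 + n, acc._2 + 1),      // seqOp: fold one element into the partition's accumulator
      (a, b) => (a._1 + b._1, a._2 + b._2)       // combOp: merge two partition accumulators
    )
    println(sumCount)                            // (55, 10)
    println(sumCount._1.toDouble / sumCount._2)  // 5.5
    sc.stop()
  }
}

Here U = (Int, Int) while T = Int: seqOp folds each element into its partition's (sum, count) pair, and combOp adds two such pairs together.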
example:
package test

import org.apache.spark.{SparkConf, SparkContext}

class testRdd {
  def anylizeRdd(): Unit = {
    val spark = new SparkContext(new SparkConf().setMaster("local").setAppName("test"))
    val rdd2 = spark.textFile("file:///Users/Downloads/spark-2.1.1-bin-hadoop2.7/conf/log4j.properties.template")
    val number = rdd2.repartition(10).aggregate(0)((x: Int, y: String) => {
      // x is the running count within the partition (starts at the zero value 0),
      // y is one line of the file from that partition
      if (y.length > 50) {
        println("x>50==" + (x + 1))
        x + 1
      } else {
        println("x<50==" + x)
        x
      }
    }, (a: Int, b: Int) => {
      // b is the final value computed for one partition,
      // a is the running total across partitions (starts at 0)
      println("a:" + a + "||b:" + b)
      a + b
    })
    println(number)
  }
}

object testRdd {
  def main(args: Array[String]): Unit = {
    new testRdd().anylizeRdd()
  }
}
result:
x<50==0
x>50==1
x>50==2
x>50==3
a:0||b:3
x<50==0
x>50==1
x<50==1
x<50==1
a:3||b:1
x>50==1
x>50==2
x>50==3
x>50==4
a:4||b:4
x>50==1
x>50==2
x>50==3
x>50==4
a:8||b:4
x>50==1
x>50==2
x<50==2
x>50==3
a:12||b:3
x>50==1
x<50==1
x>50==2
x<50==2
a:15||b:2
x>50==1
x<50==1
x>50==2
x<50==2
a:17||b:2
x>50==1
x<50==1
x>50==2
x<50==2
a:19||b:2
x<50==0
x<50==0
x<50==0
x>50==1
a:21||b:1
x<50==0
x<50==0
x<50==0
x>50==1
a:22||b:1
23
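Since combOp simply sums the per-partition counts, the final result should not depend on the partitioning at all. A quick single-threaded sanity check on the same file (assuming the path above is readable locally) would be:

import scala.io.Source

val expected = Source
  .fromFile("/Users/Downloads/spark-2.1.1-bin-hadoop2.7/conf/log4j.properties.template")
  .getLines()
  .count(_.length > 50)
println(expected)   // should match the 23 printed by the Spark job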
Source analysis:
var jobResult = Utils.clone(zeroValue, sc.env.serializer.newInstance())
val aggregatePartition = (it: Iterator[T]) => it.aggregate(zeroValue)(cleanSeqOp, cleanCombOp)
// aggregates each partition (an Iterator) with seqOp, starting from zeroValue
val mergeResult = (index: Int, taskResult: U) => jobResult = combOp(jobResult, taskResult)
// merges each partition's result into the job result with combOp
Summary: seqOp aggregates the elements within each individual partition; combOp then aggregates the per-partition results into the final value.
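To make that summary concrete, the same data flow can be simulated without a cluster: fold each partition's elements with seqOp starting from zeroValue (what aggregatePartition does with the iterator), then fold the per-partition results with combOp (what mergeResult does on the driver). A plain Scala sketch, with a made-up partitioning of a few strings:

object AggregateFlowSketch {
  def main(args: Array[String]): Unit = {
    // Pretend these are the lines held by three partitions
    val partitions: Seq[Seq[String]] = Seq(
      Seq("short", "a fairly long line that is certainly more than fifty characters"),
      Seq("another quite long line that also exceeds the fifty character limit"),
      Seq("tiny", "small")
    )

    val zeroValue = 0
    val seqOp = (x: Int, y: String) => if (y.length > 50) x + 1 else x
    val combOp = (a: Int, b: Int) => a + b

    // What each task does: fold its partition's elements with seqOp, starting from zeroValue
    val perPartition = partitions.map(p => p.foldLeft(zeroValue)(seqOp))

    // What the driver does in mergeResult: fold the task results with combOp
    val jobResult = perPartition.foldLeft(zeroValue)(combOp)
    println(perPartition)   // List(1, 1, 0)
    println(jobResult)      // 2
  }
}

Note that zeroValue is used once per partition by seqOp and once more by combOp on the driver, which is why it should be a neutral element (e.g. 0 for summation).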