How does a SparkPlan actually execute, that is, how is it turned into an RDD[Row]? Start with a piece of user code:
SQLContext sqlContext = new SQLContext(jsc); // jsc is an existing JavaSparkContext
DataFrame dataFrame = sqlContext.parquetFile(parquetPath);
dataFrame.registerTempTable(source); // source holds the temp table name, "test" in this example
String sql = " select SUM(id) from test group by dev_chnid ";
DataFrame result = sqlContext.sql(sql);
log.info("Result:" + result.collect()); // collect triggers the action
DataFrame.collect delegates to the executedPlan:
override def collect(): Array[Row] = {
  val ret = queryExecution.executedPlan.executeCollect() // run the executedPlan's executeCollect
  ret
}
executeCollect calls execute() to produce the RDD, converts each internal Catalyst row back to a Scala Row, and then collects:
def executeCollect(): Array[Row] = {
  execute().mapPartitions { iter =>
    val converter = CatalystTypeConverters.createToScalaConverter(schema)
    iter.map(converter(_).asInstanceOf[Row])
  }.collect() // what ultimately runs is the executedPlan's execute, i.e. SparkPlan's execute
}
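Note the mapPartitions pattern used here: the converter is created once per partition and then applied row by row, rather than being rebuilt for every row. A generic, self-contained sketch of the same pattern (the names are illustrative, not from Spark's source):
import org.apache.spark.rdd.RDD

// Hypothetical example: build a (possibly expensive) converter once per
// partition, then apply it to every element of that partition.
def convertPerPartition(rdd: RDD[Int]): RDD[String] =
  rdd.mapPartitions { iter =>
    val converter: Int => String = i => "value-" + i // created once per partition
    iter.map(converter)
  }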
RDD.collect then submits the job, gathering one array per partition and concatenating them:
def collect(): Array[T] = withScope {
  val results = sc.runJob(this, (iter: Iterator[T]) => iter.toArray)
  Array.concat(results: _*)
}
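For reference, sc.runJob applies the given function to every partition and returns one result per partition; Array.concat then flattens them into the final array. A minimal standalone illustration (local mode, hypothetical app name):
import org.apache.spark.{SparkConf, SparkContext}

object CollectDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("collect-demo").setMaster("local[2]"))
    val rdd = sc.parallelize(1 to 10, 2)
    // One Array[Int] per partition, the same shape RDD.collect builds internally.
    val perPartition: Array[Array[Int]] = sc.runJob(rdd, (iter: Iterator[Int]) => iter.toArray)
    println(Array.concat(perPartition: _*).mkString(",")) // 1,2,...,10
    sc.stop()
  }
}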
Now look at SparkPlan's execute function:
abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializable {
  ……
  final def execute(): RDD[Row] = {
    RDDOperationScope.withScope(sparkContext, nodeName, false, true) {
      doExecute() // delegates to the concrete SparkPlan's doExecute
    }
  }
  ……
}
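What a concrete doExecute does varies by operator. The simplest case is the PhysicalRDD leaf node that appears at the bottom of the plan below; in Spark 1.x it looks roughly like this, simply handing back the RDD it wraps:
// Sketch of Spark 1.x's PhysicalRDD: the simplest possible doExecute,
// a leaf node that returns the RDD it already holds.
private[sql] case class PhysicalRDD(output: Seq[Attribute], rdd: RDD[Row]) extends LeafNode {
  protected override def doExecute(): RDD[Row] = rdd
}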
As you can see, every concrete SparkPlan implements a doExecute function whose output is an RDD[Row]. Take the statement select SUM(id) from test group by dev_chnid as an example; its executedPlan is:
Aggregate false, [dev_chnid#0], [CombineSum(PartialSum#45L) AS c0#43L]
 Exchange (HashPartitioning 200)
  Aggregate true, [dev_chnid#0], [dev_chnid#0,SUM(id#17L) AS PartialSum#45L]
   PhysicalRDD [dev_chnid#0,id#17L], MapPartitionsRDD
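This tree can be printed straight from the DataFrame; both calls below are standard Spark 1.x API:
val result = sqlContext.sql("select SUM(id) from test group by dev_chnid")
result.explain()                            // prints the physical plan
println(result.queryExecution.executedPlan) // the executedPlan tree shown above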
First look at the doExecute function of Aggregate false, [dev_chnid#0], [CombineSum(PartialSum#45L) AS c0#43L]. The leading Boolean is Aggregate's partial flag: Aggregate true computes per-partition partial sums before the Exchange, and Aggregate false merges those partial results after the shuffle:
protected override def doExecute(): RDD[Row] = attachTree(this, "execute") {
  if (groupingExpressions.isEmpty) { // no GROUP BY: compute a single global aggregate
    child.execute().mapPartitions { iter => // first run the child plan's execute
      val buffer = newAggregateBuffer()
      var currentRow: Row = null
      while (iter.hasNext) {
        currentRow = iter.next()
        var i = 0
        while (i < buffer.length) { // feed every row into each aggregate function
          buffer(i).update(currentRow)
          i += 1
        }
      }
      val resultProjection = new InterpretedProjection(resultExpressions, computedSchema)
      val aggregateResults = new GenericMutableRow(computedAggregates.length)
      var i = 0
      while (i < buffer.length) {
        aggregateResults(i) = buffer(i).eval(EmptyRow)
        i += 1
      }
      Iterator(resultProjection(aggregateResults))
    }
  } else {
    child.execute().mapPartitions { iter => // first run the child plan's execute
      val hashTable = new HashMap[Row, Array[AggregateFunction]]
      // groupingExpressions = [dev_chnid#0]
      // child.output = [dev_chnid#0,id#17L]
      val groupingProjection = new InterpretedMutableProjection(groupingExpressions, child.output)