package com.sf.gis.scala.base.spark

import org.apache.log4j.Logger
import org.apache.spark.rdd.RDD

import scala.collection.mutable.ArrayBuffer
import scala.util.Random

/**
  * Created by 01374443 on 2020/7/27.
  * Helpers for some of the more complex join scenarios.
  */
object SparkJoin {
  @transient lazy val logger: Logger = Logger.getLogger(this.getClass)

  /**
    * Left outer join for the case where a small number of keys in the left
    * table are heavily skewed. The `topLean` most frequent left keys are
    * joined separately using salting: each left record of a hot key gets a
    * random salt in `[0, hashNum)`, and each matching right record is
    * replicated once per salt value. All remaining keys go through a plain
    * `leftOuterJoin`. The two results are unioned.
    *
    * `left` is scanned more than once (key counting plus two filters), so
    * callers may want to persist it beforehand.
    *
    * @param left    left table
    * @param right   right table
    * @param hashNum salting factor, i.e. how many times right-side records of
    *                hot keys are replicated; must be positive whenever at
    *                least one hot key is selected
    * @param topLean number of most frequent left keys to handle separately;
    *                values `<= 0` disable the special handling
    * @return the left outer join of `left` and `right`
    * @throws IllegalArgumentException if hot keys were selected and
    *                                  `hashNum <= 0`
    */
  def leftOuterJoinOfLeftLeanElem(left: RDD[(String, Object)], right: RDD[(String, Object)], hashNum: Int, topLean: Int = 10): RDD[(String, (Object, Option[Object]))] = {
    // Count per key on the cluster and fetch only the top `topLean` entries.
    // countByKey() would ship every distinct key to the driver first.
    // top() rejects negative sizes, so non-positive values are handled here.
    // Ties between equally frequent keys may be broken in any order.
    val keyCounts: Array[(String, Long)] =
      if (topLean <= 0) {
        Array.empty[(String, Long)]
      } else {
        left
          .mapValues(_ => 1L)
          .reduceByKey(_ + _)
          .top(topLean)(Ordering.by[(String, Long), Long](_._2))
      }
    // A Set gives constant-time membership checks inside the filters below.
    val keys: Set[String] = keyCounts.map(_._1).toSet
    val counts = keyCounts.map(_._2).sum
    // NOTE(review): these are informational messages logged at ERROR level;
    // the level is kept as-is in case log configuration relies on it.
    logger.error("单独处理的keys:" + keyCounts.mkString(","))
    logger.error("单独处理的总数量:" + counts)

    if (keys.isEmpty) {
      // No hot keys (empty left table or topLean <= 0): a plain join suffices.
      left.leftOuterJoin(right)
    } else {
      // Fail fast on the driver; otherwise Random.nextInt would throw inside
      // the executor tasks once the salted join runs.
      require(hashNum > 0, s"hashNum must be positive, got $hashNum")

      // Split both sides into separately handled (hot) keys and all other keys.
      val leftHashKeyData = left.filter { case (key, _) => keys.contains(key) }
      val leftOtherData = left.filter { case (key, _) => !keys.contains(key) }
      val rightHashKeyData = right.filter { case (key, _) => keys.contains(key) }
      val rightOtherData = right.filter { case (key, _) => !keys.contains(key) }

      // Join the non-hot keys first.
      val otherJoin = leftOtherData.leftOuterJoin(rightOtherData)

      // Salt the hot-key left records; one Random per partition, not per record.
      val leftHashKeyDataExpand = leftHashKeyData.mapPartitions { records =>
        val random = new Random()
        records.map { case (key, value) => ((random.nextInt(hashNum), key), value) }
      }
      // Replicate every hot-key right record once for each possible salt.
      val rightHashKeyDataExpand = rightHashKeyData.flatMap { case (key, value) =>
        (0 until hashNum).iterator.map(salt => ((salt, key), value))
      }

      // Join on (salt, key), then drop the salt to restore the original key.
      val hashKeyJoin = leftHashKeyDataExpand
        .leftOuterJoin(rightHashKeyDataExpand)
        .map { case ((_, key), joined) => (key, joined) }

      hashKeyJoin.union(otherJoin)
    }
  }

  /**
    * Small manual check: counts the occurrences of each key in a sample list
    * and prints the (key, count) pairs in descending order of count.
    */
  def main(args: Array[String]): Unit = {
    val spark = Spark.getSparkSession(this.getClass.getSimpleName.replace("$", ""), null, true, 5)
    val list = Array("3333", "dd", "dd", "11", "222", "ddd1", "3333", "11", "dd", "dd", "11", "3333", "3333", "3333", "333")
    val left = spark.sparkContext.parallelize(list).map(obj => (obj, obj))
    val keys = left.countByKey().toArray.sortBy(-_._2)
    // Printing the Array directly would only show its JVM reference.
    println(keys.mkString(","))
  }
}