package com.spark.core
import org.apache.spark.sql.SparkSession
import org.apache.spark.{Partitioner, SparkConf}
/**
 * Secondary sort in Spark: order records by a composite key
 * (first field ascending, then second field ascending).
 **/
object SparkSecondarySort {
  // Required on Windows so Hadoop can locate winutils.exe
  System.setProperty("hadoop.home.dir", "d://soft/hadoop/hadoop-2.7.3")

  def main(args: Array[String]): Unit = {
    // Number of partitions
    // val partitions: Int = args(0).toInt
    // Input file path
    val inputPath: String = args(0)
    // Output file path
    // val outputPath: String = args(1)
    val conf = new SparkConf()
    conf.setMaster("local[*]").setAppName("SparkSecondarySort")
    // Create the Spark context
    val sc = SparkSession.builder().config(conf).getOrCreate().sparkContext
    // Read the input file
    val input = sc.textFile(inputPath)
    // Build a composite key from the two tab-separated fields,
    // keeping the original line as the value
    val valueToKey = input.map(x => {
      val line = x.split("\t")
      (SecondSort(line(0), line(1).toInt), x)
    })
    // val sorted = valueToKey.repartitionAndSortWithinPartitions(new SortPartitioner(input.getNumPartitions))
    // Note: sortByKey re-shuffles with a RangePartitioner, so the custom
    // partitioner only takes effect in the commented-out variant above
    val sorted = valueToKey.partitionBy(new SortPartitioner(input.getNumPartitions)).sortByKey()
    // coalesce(1) funnels all output through one task so a single file is written
    val result = sorted.map(x => x._2).coalesce(1)
    result.saveAsTextFile(args(1))
    // result.foreach(println)
    sc.stop()
  }
}
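To make the behavior concrete, suppose the input file holds tab-separated pairs like the made-up sample below (the data is purely illustrative). The job sorts by the first field and, within equal first fields, by the second field:

Sample input (tab-separated):

b	3
a	2
b	1
a	1

Output after the job runs (coalesced into a single file):

a	1
a	2
b	1
b	3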
// Composite key: ordered by `first`, then by `second` when the first fields tie
case class SecondSort(first: String, second: Int) extends Ordered[SecondSort] with Serializable {
  override def compare(that: SecondSort): Int = {
    val byFirst = this.first.compareTo(that.first)
    if (byFirst != 0) byFirst // first fields differ: they decide the order
    else this.second.compareTo(that.second) // first fields tie: compare the second fields
  }
}
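As a side note, the same ordering can be derived from Scala's built-in tuple ordering instead of implementing Ordered by hand. This is only an alternative sketch, not part of the original listing:

// Alternative sketch: Ordering.by with a tuple compares element-wise
// (first, then second), which matches compare() above. sortByKey would
// pick this implicit up if SecondSort did not extend Ordered.
implicit val secondSortOrdering: Ordering[SecondSort] =
  Ordering.by((s: SecondSort) => (s.first, s.second))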
/**
 * Custom partitioner for the secondary sort
 **/
class SortPartitioner(partitions: Int) extends Partitioner {
  def numPartitions: Int = partitions

  // Hash on the first field only, so all records that share a first field
  // land in the same partition regardless of their second field
  def getPartition(key: Any): Int = key match {
    case null => 0
    case SecondSort(k, _) => math.abs(k.hashCode % numPartitions)
    case _ => math.abs(key.hashCode % numPartitions)
  }

  override def equals(other: Any): Boolean = other match {
    case o: SortPartitioner => o.numPartitions == numPartitions
    case _ => false
  }

  override def hashCode: Int = numPartitions
}
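For reference, the commented-out repartitionAndSortWithinPartitions line in main is the variant that actually exercises this partitioner: it shuffles once by the first field and sorts each partition by the full composite key, instead of the global sort that sortByKey performs. A minimal sketch, reusing the valueToKey and input values from above:

// One shuffle: partition by the first field (SortPartitioner), sort each
// partition by the full SecondSort key. Output is sorted per partition,
// not globally, so the coalesce(1) total-order trick does not apply here.
val sortedWithinPartitions = valueToKey.repartitionAndSortWithinPartitions(
  new SortPartitioner(input.getNumPartitions))
sortedWithinPartitions.map(_._2).saveAsTextFile(args(1))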