Spark Partitioning and Sorting in Practice

// Imports assumed by all of the examples below
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import org.apache.spark.rdd.{RDD, ShuffledRDD}
import scala.collection.mutable

Grouped top-N by subject with a custom partitioner and mapPartitions

object GroupTopNByPartition { // hypothetical object name; the original header is not shown
def main(args: Array[String]): Unit = {
// decide whether to run locally
val isLocal = args(0).toBoolean
val conf = new SparkConf().setAppName(this.getClass.getCanonicalName)
if (isLocal) {
conf.setMaster("local[*]")
}
val sc = new SparkContext(conf)
// specify where the data will be read from and create the RDD
val lines = sc.textFile(args(1))
val topN = args(2).toInt
// shape the data into ((subject, teacher), 1) and aggregate the counts
val reduced: RDD[((String, String), Int)] = lines.map(line => {
val fields = line.split("/")
val url = fields(2)
val teacher = fields(3)
val subject = url.split("[.]")(0)
((subject, teacher), 1)
}).reduceByKey(_ + _)
// work out the distinct subjects: after distinct() there is one entry per subject; collect() brings them back to the driver
val subjects: Array[String] = reduced.map(_._1._1).distinct().collect()
/*println(subjects.toBuffer) //ArrayBuffer(javaee, php, bigdata)
sc.stop()*/
// build the partitioner, one partition per subject
val partitioner: SubjectPartitioner = new SubjectPartitioner(subjects)
// with the partitioner ready, repartition the reduced RDD by subject
val partitioned: RDD[((String, String), Int)] = reduced.partitionBy(partitioner)
// process each partition separately and keep only the top N records per partition
val result1: RDD[((String, String), Int)] = partitioned.mapPartitions(it => {
it.toList.sortBy(t => -t._2).take(topN).iterator // takes an iterator in and hands an iterator back
})
println(result1.collect().toBuffer)
sc.stop()
}
}
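To make the field indices above concrete, here is a small parsing sketch. The input line is a hypothetical example (the post does not show the real data), chosen only so that split("/") yields the subject domain at index 2 and the teacher at index 3:

// hypothetical input line; the real log format is not shown in the post
val line = "http://bigdata.edu360.cn/laozhang"
val fields = line.split("/")       // Array("http:", "", "bigdata.edu360.cn", "laozhang")
val url = fields(2)                // "bigdata.edu360.cn"
val teacher = fields(3)            // "laozhang"
val subject = url.split("[.]")(0)  // "bigdata"
// the record becomes ((bigdata, laozhang), 1)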
// Custom partitioner: pass the subjects in as a constructor parameter so they are available later
class SubjectPartitioner(val subjects: Array[String]) extends Partitioner {
// the number of partitions must equal the number of subjects
// the partitioning rule is set up in the primary constructor: the HashMap maps subject name -> partition id
// the primary constructor runs exactly once, when the partitioner is instantiated with new
private val nameTonum: mutable.HashMap[String, Int] = new mutable.HashMap[String, Int]()
var i = 0
for (sub <- subjects) {
// in Scala this puts sub as the key and i as the value into the HashMap
nameTonum(sub) = i // record each subject as soon as we see it
i += 1
}
// at this point the map looks like (javaee, 0), (php, 1), (bigdata, 2)
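// Aside (optional alternative): the same subject -> partition-id map can be built without a
// mutable counter, e.g. mutable.HashMap(subjects.zipWithIndex: _*); the explicit loop above is
// kept to stay close to the original code.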
override def numPartitions: Int = subjects.length // 3 in this example
// called on the executor side, inside the task, before the shuffle write
override def getPartition(key: Any): Int = {
// asInstanceOf downcasts the reference to the concrete tuple type
val tuple: (String, String) = key.asInstanceOf[(String, String)]
val subject: String = tuple._1
nameTonum(subject) // look up the partition id for this subject
}
}

Taking the top N per partition with a bounded TreeSet

object GroupTopNWithTreeSet { // hypothetical object name; the original header is not shown
def main(args: Array[String]): Unit = {
// decide whether to run locally
val isLocal: Boolean = args(0).toBoolean
// build the configuration
val conf: SparkConf = new SparkConf().setAppName(this.getClass.getCanonicalName)
// run locally if requested
if (isLocal) {
conf.setMaster("local[*]")
}
// create the SparkContext
val sc: SparkContext = new SparkContext(conf)
// point at the input file and create the RDD
val lines: RDD[String] = sc.textFile(args(1))
// the top N to keep per subject
val topN: Int = args(2).toInt
// shape the data and aggregate the counts
val reduced: RDD[((String, String), Int)] = lines.map(it => {
val fields: Array[String] = it.split("/")
val teacher: String = fields(3)
val subjects: String = fields(2).split("[.]")(0)
((subjects, teacher), 1)
}).reduceByKey(_ + _)
/* val result: mutable.Buffer[((String, String), Int)] = reduced.collect().toBuffer
println(result)*/
// next, partition by subject
// first find the distinct subjects
val subject: Array[String] = reduced.map(_._1._1).distinct().collect()
// instantiate the custom partitioner
val partitioner: SubjectPartitioner2 = new SubjectPartitioner2(subject)
// repartition the reduced RDD with it
val partitioned: RDD[((String, String), Int)] = reduced.partitionBy(partitioner)
// use a sorted collection to keep a running top N per partition
// a TreeSet keeps its elements ordered according to an Ordering
// the Ordering can be supplied through an implicit, so the sort rule can be whatever we need
// iterate over each partition's records and feed them into the TreeSet
val result2: RDD[((String, String), Int)] = partitioned.mapPartitions(it => {
// define an implicit Ordering that sorts by count, descending
implicit val rules: Ordering[((String, String), Int)] = Ordering[Int].on[((String, String), Int)](t => -t._2)
// the TreeSet keeps itself sorted and picks up the implicit Ordering defined above
// (records with equal counts compare as equal under this rule, so the set keeps only one of them)
val sorted: mutable.TreeSet[((String, String), Int)] = new mutable.TreeSet[((String, String), Int)]()
it.foreach(record => {
// add the record to the TreeSet (equivalent to add)
sorted += record
// once the set grows beyond topN, evict the lowest-ranked element (the last one under this Ordering)
if (sorted.size > topN) {
sorted -= sorted.last
}
})
sorted.iterator // takes an iterator in and hands an iterator back
})
println(result2.collect().toBuffer)
}
}
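The bounded-TreeSet trick above also works outside Spark. A minimal self-contained sketch (plain Scala, with made-up teacher names and counts) shows the idea of keeping at most topN elements while streaming over the data:

import scala.collection.mutable

object TreeSetTopNDemo {
  def main(args: Array[String]): Unit = {
    val topN = 2
    // sort descending by count; note that elements this Ordering considers equal are
    // deduplicated by the TreeSet, so real keys should break ties as well
    val byCountDesc: Ordering[(String, Int)] = Ordering[Int].on[(String, Int)](t => -t._2)
    val sorted = new mutable.TreeSet[(String, Int)]()(byCountDesc)
    Seq(("laozhang", 9), ("laoli", 15), ("laowang", 6), ("laozhao", 21)).foreach { record =>
      sorted += record
      if (sorted.size > topN) sorted -= sorted.last // evict the current minimum
    }
    println(sorted.toList) // List((laozhao,21), (laoli,15))
  }
}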
// Custom partitioner: partitioning is by subject, so the subjects are passed in
class SubjectPartitioner2(val subject: Array[String]) extends Partitioner {
// the primary constructor runs once each time the partitioner is instantiated with new
private val nameToNum: mutable.HashMap[String, Int] = new mutable.HashMap[String, Int]()
// walk over the subjects and record each subject together with its partition id in the HashMap
var i = 0
for (sub <- subject) {
nameToNum(sub) = i // in Scala this puts (subject -> 0, 1, 2, ...) into the map
i += 1
}
// e.g. Array(("javaee", 0), ("php", 1), ("bigdata", 2))
override def numPartitions: Int = subject.length // one partition per subject
override def getPartition(key: Any): Int = {
// the key comes in as Any, but we know it is a (String, String) tuple,
// so it has to be cast
val tuple: (String, String) = key.asInstanceOf[(String, String)]
// extract the subject
val subject: String = tuple._1
// look up the partition id for this subject
nameToNum(subject)
}
}

Passing in a new comparison rule: repartitionAndSortWithinPartitions

object GroupTopNWithRepartitionAndSort { // hypothetical object name; the original header is not shown
def main(args: Array[String]): Unit = {
// decide whether to run locally
val isLocal: Boolean = args(0).toBoolean
// build the configuration
val conf: SparkConf = new SparkConf().setAppName(this.getClass.getCanonicalName)
// run locally if requested
if (isLocal) {
conf.setMaster("local[*]")
}
// create the SparkContext
val sc: SparkContext = new SparkContext(conf)
// point at the input file and create the RDD
val lines: RDD[String] = sc.textFile(args(1))
// the top N parameter (read here, though this variant does not apply it further)
val topN: Int = args(2).toInt
// shape the data and aggregate the counts
val reduced: RDD[((String, String), Int)] = lines.map(it => {
val fields: Array[String] = it.split("/")
val teacher: String = fields(3)
val subject: String = fields(2).split("[.]")(0)
((subject, teacher), 1)
}).reduceByKey(_ + _)
// collect the distinct subjects on the driver
val subjects: Array[String] = reduced.map(_._1._1).distinct().collect()
// instantiate the custom partitioner
val subPartitioner: SubjectPartitioner6 = new SubjectPartitioner6(subjects)
// repartitionAndSortWithinPartitions repartitions and sorts inside each partition in one shuffle
// the fields that take part in the comparison must live in the key, so move the count into the key
val keyed: RDD[((String, String, Int), Null)] = reduced.map(t => ((t._1._1, t._1._2, t._2), null))
// define a comparison rule on the new key: by count, descending
implicit val orderingRules: Ordering[(String, String, Int)] = new Ordering[(String, String, Int)] {
override def compare(x: (String, String, Int), y: (String, String, Int)): Int = {
-(x._3 - y._3) // descending by count
}
}
// note that tuples compare field by field by default: the first field is compared first, and only
// if it is equal does the comparison move on to the next field, and so on until an unequal field
// is found; the explicit Ordering above replaces that default behaviour
// for simple keys the same rule can be written with on(), which also ends up calling compare:
// implicit val rules = Ordering[Int].on[(String, String, Int)](t => -t._3)
// repartitionAndSortWithinPartitions both shuffles and sorts within each partition
val result: RDD[((String, String, Int), Null)] = keyed.repartitionAndSortWithinPartitions(subPartitioner)
println(result.collect().toBuffer)
}
}
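The two ways of writing the comparison rule, the explicit new Ordering above and the commented-out Ordering[Int].on form, are equivalent. A small stand-alone sketch with made-up values:

object OrderingDemo {
  def main(args: Array[String]): Unit = {
    val data = List(("bigdata", "laozhang", 9), ("javaee", "laoli", 15), ("php", "laowang", 6))
    // explicit Ordering: implement compare yourself
    val explicitRule: Ordering[(String, String, Int)] = new Ordering[(String, String, Int)] {
      override def compare(x: (String, String, Int), y: (String, String, Int)): Int = y._3 - x._3
    }
    // the same rule written with on(), which delegates to Ordering[Int].compare
    val onRule: Ordering[(String, String, Int)] = Ordering[Int].on[(String, String, Int)](t => -t._3)
    println(data.sorted(explicitRule)) // List((javaee,laoli,15), (bigdata,laozhang,9), (php,laowang,6))
    println(data.sorted(onRule))       // same result
  }
}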
// Custom partitioner: partitioning is by subject, so the subjects are passed in (here the keys are (subject, teacher, count) tuples)
class SubjectPartitioner6(val subject: Array[String]) extends Partitioner {
// the primary constructor runs once each time the partitioner is instantiated with new
private val nameToNum: mutable.HashMap[String, Int] = new mutable.HashMap[String, Int]()
// walk over the subjects and record each subject together with its partition id in the HashMap
var i = 0
for (sub <- subject) {
nameToNum(sub) = i // puts (subject -> 0, 1, 2, ...) into the map
i += 1
}
// e.g. Array(("javaee", 0), ("php", 1), ("bigdata", 2))
override def numPartitions: Int = subject.length // one partition per subject
override def getPartition(key: Any): Int = {
// the key comes in as Any; here it is the (subject, teacher, count) tuple,
// so it has to be cast
val tuple: (String, String, Int) = key.asInstanceOf[(String, String, Int)]
// extract the subject
val subject: String = tuple._1
// look up the partition id for this subject
nameToNum(subject)
}
}

Implementing repartitionAndSortWithinPartitions with a ShuffledRDD
object GroupTopNWithShuffledRDD { // hypothetical object name; the original header is not shown
def main(args: Array[String]): Unit = {
// decide whether to run locally
val isLocal: Boolean = args(0).toBoolean
// build the configuration
val conf: SparkConf = new SparkConf().setAppName(this.getClass.getCanonicalName)
// run locally if requested
if (isLocal) {
conf.setMaster("local[*]")
}
// create the SparkContext
val sc: SparkContext = new SparkContext(conf)
// point at the input file and create the RDD
val lines: RDD[String] = sc.textFile(args(1))
// the top N parameter (read here, though this variant does not apply it further)
val topN: Int = args(2).toInt
// shape the data and aggregate the counts
val reduced: RDD[((String, String), Int)] = lines.map(line => {
val fields: Array[String] = line.split("/")
val teacher: String = fields(3)
val subject: String = fields(2).split("[.]")(0)
((subject, teacher), 1)
}).reduceByKey(_ + _)
// collect the distinct subjects on the driver
val subjects: Array[String] = reduced.map(_._1._1).distinct().collect()
// move the count into the key so it can take part in the sort
val mapped: RDD[((String, String, Int), Null)] = reduced.map(t => {
((t._1._1, t._1._2, t._2), null)
})
// build the custom partitioner
val partitioner: SubjectPartitioner7 = new SubjectPartitioner7(subjects)
// define a comparison rule; new Ordering means implementing its compare method
val rulesOrdering: Ordering[(String, String, Int)] = new Ordering[(String, String, Int)] {
override def compare(x: (String, String, Int), y: (String, String, Int)): Int = {
-(x._3 - y._3) // descending by count
}
}
// a new ShuffledRDD reproduces what repartitionAndSortWithinPartitions does
// the key is a Tuple3 and the value is null; combining null with null is still null
// remember that ShuffledRDD takes the key type, the value type and the combiner type;
// on its own it only moves the data into the right partitions, it does not sort,
// so the key ordering has to be set explicitly
val shuffled: ShuffledRDD[(String, String, Int), Null, Null] =
new ShuffledRDD[(String, String, Int), Null, Null](mapped, partitioner)
shuffled.setKeyOrdering(rulesOrdering)
// run the job: the data is shuffled by the partitioner and sorted within each partition
println(shuffled.collect().toBuffer)
}
}
// Custom partitioner: whatever we partition by (the subjects) is what gets passed in
class SubjectPartitioner7(val subjects: Array[String]) extends Partitioner {
// the primary constructor defines the partitioning rule and runs once per new
// a HashMap stores each subject together with its partition id
private val rules: mutable.HashMap[String, Int] = new mutable.HashMap[String, Int]()
var i = 0
for (sub <- subjects) {
rules(sub) = i
i += 1
}
override def numPartitions: Int = subjects.length
override def getPartition(key: Any): Int = {
val tuple: (String, String, Int) = key.asInstanceOf[(String, String, Int)]
val subject: String = tuple._1
rules(subject)
}
}
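For reference, this is essentially what Spark's own repartitionAndSortWithinPartitions does inside OrderedRDDFunctions; a simplified sketch of the shape, not the literal source:

// simplified sketch of the Spark implementation, not the exact source
def repartitionAndSortWithinPartitions(partitioner: Partitioner): RDD[(K, V)] =
  new ShuffledRDD[K, V, V](self, partitioner).setKeyOrdering(ordering)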