import org.apache.spark.Partitioner
import scala.collection.mutable
/**
 * @author Jacky
 * Custom partitioner: routes each hostname to its own partition.
 * Extends Spark's abstract `Partitioner` class.
 *
 * @param hostnameArray the hostnames to assign partitions to; duplicates are
 *                      collapsed so partition ids stay in [0, numPartitions).
 */
class Scala_HostNamePartitioner(hostnameArray: Array[String]) extends Partitioner {
  // Immutable hostname -> partition-index map. `distinct` is required: with the
  // previous mutable put-loop, a duplicate hostname overwrote an earlier entry,
  // shrinking map.size below the largest stored index and yielding partition
  // ids >= numPartitions (which Spark rejects).
  val map: Map[String, Int] = hostnameArray.distinct.zipWithIndex.toMap

  /** Number of partitions = number of distinct hostnames. */
  override def numPartitions: Int = map.size

  /**
   * Returns the partition for `key`. Unknown hostnames — and null keys, which
   * Spark may pass — fall back to partition 0 instead of throwing.
   */
  override def getPartition(key: Any): Int =
    Option(key).map(_.toString).flatMap(map.get).getOrElse(0)
}
======================================================================
import java.net.URL
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * @author Jacky
 * Requirement: save each host's URL click counts into the same file (partition).
 *
 * Usage: an optional first program argument overrides the input log path
 * (defaults to the original hard-coded "C:\\360.log").
 */
object Scala_UserDefinedPartitioner {
  def main(args: Array[String]): Unit = {
    // Silence Spark's verbose INFO logging.
    Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf()
      .setAppName("Scala_UserDefinedPartitioner")
      .setMaster("local")
    val sc = new SparkContext(conf)
    try {
      // Backward-compatible generalization: allow the input path via args(0).
      val inputPath = if (args.nonEmpty) args(0) else "C:\\360.log"
      val logsRDD = sc.textFile(inputPath)
      // (url, 1). Lines without a second tab-separated field are skipped to
      // avoid ArrayIndexOutOfBoundsException on malformed/dirty input.
      val urlRDD: RDD[(String, Int)] = logsRDD
        .map(_.split("\t"))
        .filter(_.length > 1)
        .map(fields => (fields(1), 1))
      // (url, count): total clicks per URL.
      val urlCountRDD = urlRDD.reduceByKey(_ + _)
      // (hostname, (url, count)) — hostname extracted from the URL.
      // NOTE(review): new URL(...) throws MalformedURLException on bad URLs;
      // assumed well-formed here as in the original — confirm against the data.
      val resultRDD: RDD[(String, (String, Int))] = urlCountRDD.map { case t2 @ (url, _) =>
        (new URL(url).getHost, t2)
      }
      // Distinct hostnames determine the number of output partitions.
      val hostnameList: Array[String] = resultRDD.keys.distinct().collect()
      // Partition by hostname via the custom partitioner and write the result.
      resultRDD
        .partitionBy(new Scala_HostNamePartitioner(hostnameList))
        .saveAsTextFile("C:\\out" + System.currentTimeMillis())
    } finally {
      // Previously sc.stop() was skipped when any action threw; always release
      // the SparkContext.
      sc.stop()
    }
  }
}