def main(args: Array[String]): Unit = {
// TODO Scala Wordcount TopN
// Requirement: read data from a file, split it into individual words, count how many
// times each word appears, sort the counts in descending order, and take the top N.
// 1. Read the data from a file into a collection
// import scala.io.{BufferedSource, Source}  // needed if the file-reading variant below is enabled
// val source: BufferedSource = Source.fromFile("input/input.txt")
// val dataList: List[String] = source.getLines().toList
// source.close()
// println("==>1." + dataList)
//List(hello world spark, hello scala hive, hive flume kafka hadoop, hello hadoop flume hbase)
val dataList: List[String] = List("hello world spark", "hello scala hive", "hive flume kafka hadoop", "hello hadoop flume hbase")
//2. Split each element of the collection into individual words on " "
val words: List[String] = dataList.flatMap(_.split(" "))
println("==>2." + words)
// List(hello, world, spark, hello, scala, hive, hive, flume, kafka, hadoop, hello, hadoop, flume, hbase)
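// If the input may contain runs of spaces or tabs, splitting on a whitespace
// regex is safer (an alternative sketch, not in the original):
// val words: List[String] = dataList.flatMap(_.split("\\s+"))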
//3. Group identical words together
val wordGroup: Map[String, List[String]] = words.groupBy(word=>word)
println("==>3." + wordGroup)
//Map(world -> List(world), kafka -> List(kafka), hadoop -> List(hadoop, hadoop), spark -> List(spark), hive -> List(hive, hive), scala -> List(scala), flume -> List(flume, flume), hello -> List(hello, hello, hello), hbase -> List(hbase))
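// groupBy(identity) is the idiomatic shorthand for word => word (alternative sketch):
// val wordGroup: Map[String, List[String]] = words.groupBy(identity)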
//4. Count the words in each group, converting each group to a K -> V pair (word, count)
val wordCount: Map[String, Int] = wordGroup.map(kv=>(kv._1,kv._2.length))
println("==>4." +wordCount)
//Map(world -> 1, kafka -> 1, hadoop -> 2, spark -> 1, hive -> 2, scala -> 1, flume -> 2, hello -> 3, hbase -> 1)
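// The counting step can also map just the values; in Scala 2.13 mapValues lives on
// the view, so this alternative sketch assumes 2.13:
// val wordCount: Map[String, Int] = wordGroup.view.mapValues(_.size).toMap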
//5. Sort the (word, count) pairs by count in descending order
//val finalWordCount: List[(String, Int)] = wordCount.toList.sortBy(kv => kv._2)(Ordering.Int.reverse)
val finalWordCount: List[(String, Int)] = wordCount.toList.sortBy(_._2)(Ordering.Int.reverse)
println("==>5." + finalWordCount)
//6. Take the top N (take is total: with fewer than N entries it simply returns them all)
val result: List[(String, Int)] = finalWordCount.take(3)
println("==>6." + result)
// The same pipeline as steps 1-6 above, written as a single chain
val result1: List[(String, Int)] = dataList
  .flatMap(_.split(" "))
  .groupBy(word => word)
  .map(kv => (kv._1, kv._2.length))
  .toList
  .sortBy(_._2)(Ordering.Int.reverse)
  .take(3)
println(result1)
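// A single-pass alternative (a sketch): fold the words straight into a count map,
// skipping the intermediate List[String] that groupBy builds per key.
// val counts: Map[String, Int] = dataList
//   .flatMap(_.split(" "))
//   .foldLeft(Map.empty[String, Int]) { (acc, w) => acc.updated(w, acc.getOrElse(w, 0) + 1) }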
}