1、File data
Spark,100
Hadoop,62
Flink,77
Kafka,91
Hadoop,93
Spark,78
Hadoop,69
Spark,98
Hadoop,62
Spark,99
Hadoop,61
Spark,70
Hadoop,75
Spark,88
Hadoop,68
Spark,90
Hadoop,61
2、Scala code:
package topN

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object GroupTop {
  def main(args: Array[String]): Unit = {
    /**
      * Check the number of arguments
      */
    if (args.length < 2) {
      println(
        """
          |topN.GroupTop <inputPath> <outputPath>
          |<inputPath>  input directory
          |<outputPath> output directory
        """.stripMargin
      )
      System.exit(0)
    }
    /**
      * Read the arguments
      */
    val Array(inputPath, outputPath) = args
    /**
      * Initialize the Spark entry point
      */
    val conf = new SparkConf()
    conf.setAppName(s"${this.getClass.getSimpleName}")
    conf.setMaster("local")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    /**
      * Compute the top N per group
      */
    val sc = new SparkContext(conf)
    val lines: RDD[String] = sc.textFile(inputPath)
    // Split each line into a Tuple2 of (name, score)
    val tupleRDD: RDD[(String, Int)] = lines.map(line => {
      (line.split(",")(0), line.split(",")(1).toInt)
    })
    // Group by key
    val groupRDD: RDD[(String, Iterable[Int])] = tupleRDD.groupByKey()
    // Sort each group's values in descending order and keep the top 5
    val groupSort: RDD[(String, List[Int])] = groupRDD.map(grouped => {
      (grouped._1, grouped._2.toList.sortWith(_ > _).take(5))
    })
    // Print the result, keys in ascending order
    groupSort.sortByKey().collect().foreach(pair => {
      println(pair._1 + ":")
      pair._2.foreach(s => println(s + "\t"))
    })
    sc.stop()
  }
}
Output:
Flink:
77
Hadoop:
93
75
69
68
62
Kafka:
91
Spark:
100
99
98
90
88
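groupByKey pulls every value for a key into memory on one executor before anything is discarded, which is fine for this small file but can hurt on larger data. Below is a minimal sketch of the same top-5 logic using aggregateByKey, assuming the tupleRDD and sc from the Scala code above; the helper insertTop and the constant n are illustrative names, not part of the original program.

// Keep at most n largest values per key while aggregating,
// so a key's full value list is never materialized.
val n = 5
def insertTop(acc: List[Int], v: Int): List[Int] =
  (v :: acc).sortWith(_ > _).take(n)

val topN: RDD[(String, List[Int])] = tupleRDD.aggregateByKey(List.empty[Int])(
  insertTop,                                  // fold one value into a partition's running top-n
  (a, b) => (a ++ b).sortWith(_ > _).take(n)  // merge two partitions' top-n lists
)
topN.sortByKey().collect().foreach(pair => {
  println(pair._1 + ":")
  pair._2.foreach(s => println(s + "\t"))
})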
3、Java code:
package topN;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

import java.util.Iterator;

public class GroupTopN {
    public static void main(String[] args) {
        /**
         * Check the number of arguments
         */
        if (args.length < 2) {
            System.out.println("topN.GroupTopN " +
                    "needs two parameters <inputPath> <outputPath>\n" +
                    "<inputPath>  input path\n" +
                    "<outputPath> output path");
            System.exit(0);
        }
        /**
         * Read the arguments
         */
        String inputPath = args[0];
        String outputPath = args[1];
        SparkConf conf = new SparkConf().setAppName("topN.GroupTopN").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> lines = sc.textFile(inputPath);
        // Split each line into a (name, score) pair, using an anonymous inner class
        JavaPairRDD<String, Integer> cs = lines.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<>(s.split(",")[0], Integer.valueOf(s.split(",")[1]));
            }
        });
        // The same split written as a lambda expression
        // JavaPairRDD<String, Integer> cs = lines.mapToPair(s -> new Tuple2<>(s.split(",")[0], Integer.valueOf(s.split(",")[1])));
        // Group by key
        JavaPairRDD<String, Iterable<Integer>> csPairsRDD = cs.groupByKey();
        // Sort by key (ascending by default)
        JavaPairRDD<String, Iterable<Integer>> sortbykey = csPairsRDD.sortByKey();
        // For each group, keep the top 3 values
        sortbykey.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
            @Override
            public void call(Tuple2<String, Iterable<Integer>> csPair) throws Exception {
                String name = csPair._1();
                Iterator<Integer> ite = csPair._2().iterator();
                Integer[] res = new Integer[3];
                // Insert each score into a descending array of size 3
                while (ite.hasNext()) {
                    Integer score = ite.next();
                    for (int i = 0; i < 3; i++) {
                        if (res[i] == null) {
                            res[i] = score;
                            break;
                        } else if (res[i] < score) {
                            // Shift the smaller values down one slot, then insert
                            for (int j = 2; j > i; j--) {
                                res[j] = res[j - 1];
                            }
                            res[i] = score;
                            break;
                        }
                    }
                }
                System.out.println(name + ":");
                for (int i = 0; i < res.length; i++) {
                    System.out.println(res[i] + "\t");
                }
                System.out.println();
            }
        });
        sc.close();
    }
}
Output:
Flink:
77
null
null
Hadoop:
93
75
69
Kafka:
91
null
null
Spark:
100
99
98
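Both versions read an <outputPath> argument but never use it and only print to the console. As a minimal sketch, the groupSort RDD from the Scala version could also be written out to that directory; the one-line-per-score "name,score" format chosen here is an assumption, not something the original code defines.

// Flatten each (name, top-5 list) pair into "name,score" lines
// and save them under the directory passed as <outputPath>.
groupSort
  .sortByKey()
  .flatMap(pair => pair._2.map(s => pair._1 + "," + s))
  .saveAsTextFile(outputPath)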