一、groupByKey
groupByKey会将RDD[key,value] 按照相同的key进行分组,形成RDD[key,Iterable[value]]的形式, 有点类似于sql中的groupby
groupByKey 不能传入聚合函数,所有键值对都要经过 shuffle,性能消耗较大;优化时应尽量使用 reduceByKey 或 combineByKey(它们会先在 map 端做局部聚合,减少 shuffle 的数据量)
例:对学生的成绩进行分组
scala版本
// groupByKey (Scala): group each student's scores by name on a local 2-core master.
val conf = new SparkConf().setMaster("local[2]").setAppName("groupby")
val sc = new SparkContext(conf)
val rdd = sc.makeRDD(List(("张三", 97), ("张三", 87),
  ("李四", 97), ("李四", 99), ("小罗", 100)))
// groupByKey: RDD[(name, score)] -> RDD[(name, Iterable[score])]
val scoreGroupRdd = rdd.groupByKey()
// Print each (name, Iterable(scores...)) pair as-is.
scoreGroupRdd.collect.foreach(println)
// Flatten the groups: one "(name, score)" line per individual score.
scoreGroupRdd.collect.foreach { case (name, scores) =>
  scores.foreach(score => println(name, score))
}
java版本
// groupByKey (Java): group each student's scores by name.
SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("groupbyjava");
JavaSparkContext sc = new JavaSparkContext(conf);
// Fix: the original snippet used scoreRdd without ever defining it;
// build it here with the same sample data as the Scala example above.
JavaRDD<Tuple2<String, Float>> scoreRdd = sc.parallelize(Arrays.asList(
        new Tuple2<>("张三", 97F),
        new Tuple2<>("张三", 87F),
        new Tuple2<>("李四", 97F),
        new Tuple2<>("李四", 99F),
        new Tuple2<>("小罗", 100F)));
// Convert JavaRDD<Tuple2<String, Float>> to JavaPairRDD<String, Float>.
JavaPairRDD<String, Float> scoreMapRdd = JavaPairRDD.fromJavaRDD(scoreRdd);
// groupByKey: (name, score) -> (name, Iterable<score>)
JavaPairRDD<String, Iterable<Float>> scoreByKeyRdd = scoreMapRdd.groupByKey();
// Collect the grouped result back to the driver as a Map.
Map<String, Iterable<Float>> scoreMap = scoreByKeyRdd.collectAsMap();
Set<String> nameSet = scoreMap.keySet();
// Enhanced for over the keys: print "name:score" per score,
// then "name:scores" for the whole group (same output as the manual iterators).
for (String nameKey : nameSet) {
    for (Float score : scoreMap.get(nameKey)) {
        System.out.println(nameKey + ":" + score);
    }
    System.out.println(nameKey + ":" + scoreMap.get(nameKey));
}
System.out.println("------------------");
// Same per-score traversal written with an explicit Iterator over the key set.
Iterator<String> nameIterator = nameSet.iterator();
while (nameIterator.hasNext()) {
    String nameKey = nameIterator.next();
    for (Float score : scoreMap.get(nameKey)) {
        System.out.println(nameKey + ":" + score);
    }
}
二、cogroup
groupByKey 只对单个 RDD 的数据按 key 分组;如果要对多个共享同一键的 RDD 一起分组,可以使用名为 cogroup() 的函数
scala版本
// cogroup (Scala): for each key, collect the value iterables from 2 or 3 RDDs.
val rdd = sc.makeRDD(List(("张三", 11),
  ("张三", 31), ("王五", 11)))
val rdd2 = sc.makeRDD(List(("张三", 21),
  ("李四", 21), ("赵六", 21)))
val rdd3 = sc.makeRDD(List(("张三", 31),
  ("李四", 31), ("小罗", 31)))
println("--------2个RDD cogroup-------")
// Result: RDD[(String, (Iterable[Int], Iterable[Int]))]
val rdd11 = rdd.cogroup(rdd2)
rdd11.collect.foreach(println)
println("--------3个RDD cogroup-------")
// Fix: the original bound the Unit result of collect.foreach to `val rdd31`,
// which is misleading — run the action directly instead.
rdd.cogroup(rdd2, rdd3).collect.foreach(println)
java版本
// cogroup (Java): combine the values for each shared key across multiple RDDs.
JavaRDD<Tuple2<String, Float>> scoreRdd1 = sc.parallelize(Arrays.asList(
        new Tuple2<>("张三", 11F),
        new Tuple2<>("张三", 12F),
        new Tuple2<>("李四", 13F)));
JavaRDD<Tuple2<String, Float>> scoreRdd2 = sc.parallelize(Arrays.asList(
        new Tuple2<>("张三", 21F),
        new Tuple2<>("张三", 22F),
        new Tuple2<>("李四", 23F)));
JavaRDD<Tuple2<String, Float>> scoreRdd3 = sc.parallelize(Arrays.asList(
        new Tuple2<>("张三", 31F),
        new Tuple2<>("张三", 32F),
        new Tuple2<>("李四", 33F)));
// Fix: locals renamed to lowerCamelCase (PairRDD1 -> pairRdd1, ...).
JavaPairRDD<String, Float> pairRdd1 = JavaPairRDD.fromJavaRDD(scoreRdd1);
JavaPairRDD<String, Float> pairRdd2 = JavaPairRDD.fromJavaRDD(scoreRdd2);
JavaPairRDD<String, Float> pairRdd3 = JavaPairRDD.fromJavaRDD(scoreRdd3);
// cogroup of two RDDs: key -> (values from rdd1, values from rdd2)
JavaPairRDD<String, Tuple2<Iterable<Float>, Iterable<Float>>> cogroup1
        = pairRdd1.cogroup(pairRdd2);
// Iterate entrySet() to avoid a redundant map.get(key) lookup per key;
// output is identical to the original keySet()+get() loop.
for (Map.Entry<String, Tuple2<Iterable<Float>, Iterable<Float>>> entry
        : cogroup1.collectAsMap().entrySet()) {
    System.out.println(entry.getKey() + " " + entry.getValue());
}
System.out.println("----------------");
// cogroup of three RDDs: key -> (values1, values2, values3)
JavaPairRDD<String, Tuple3<Iterable<Float>, Iterable<Float>, Iterable<Float>>> cogroup2
        = pairRdd1.cogroup(pairRdd2, pairRdd3);
for (Map.Entry<String, Tuple3<Iterable<Float>, Iterable<Float>, Iterable<Float>>> entry
        : cogroup2.collectAsMap().entrySet()) {
    System.out.println(entry.getKey() + " " + entry.getValue());
}