package com.java;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.util.*;
/**
* Java examples of Spark transformation operators.
*/
public class JavaSparkTransformation {
/**
* Word count: flatMap -> mapToPair -> reduceByKey, writing the result to HDFS.
*
* @param ctx the shared JavaSparkContext
*/
public static void demoFlatMap2(JavaSparkContext ctx) {
JavaRDD<String> listRDD = ctx.textFile("hdfs://master:9000/spark/input/hello.txt");
// Spark 1.x API: the flatMap lambda returns an Iterable; in Spark 2.x+
// the same lambda must return an Iterator (append .iterator()).
JavaRDD<String> flatRDD = listRDD.flatMap(line -> Arrays.asList(line.split(" ")));
JavaPairRDD<String, Integer> pairRDD = flatRDD.mapToPair(str -> new Tuple2<String, Integer>(str, 1));
JavaPairRDD<String, Integer> reduceRDD = pairRDD.reduceByKey((v1, v2) -> v1 + v2);
// reduceRDD.collect().forEach(v -> System.out.println(v));
// Note: saveAsTextFile fails if the output directory already exists.
reduceRDD.saveAsTextFile("hdfs://master:9000/spark/output/core1/");
}
// 1. map: multiply every element of the collection by 7
public static void demoMap(JavaSparkContext ctx) {
List<Integer> list = Arrays.asList(1, 2, 3, 4, 5);
JavaRDD<Integer> listRDD = ctx.parallelize(list);
JavaRDD<Integer> mapRDD = listRDD.map(num -> num * 7);
mapRDD.collect().forEach(v -> System.out.println(v));
}
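// A minimal companion sketch (assuming the same Spark 1.x Java API as the
// rest of this class, where the function returns an Iterable): mapPartitions
// applies the function once per partition instead of once per element, which
// amortizes any per-call setup cost. In Spark 2.x+ it would return
// result.iterator() instead.
public static void demoMapPartitions(JavaSparkContext ctx) {
JavaRDD<Integer> listRDD = ctx.parallelize(Arrays.asList(1, 2, 3, 4, 5), 2);
JavaRDD<Integer> mapRDD = listRDD.mapPartitions(iter -> {
List<Integer> result = new ArrayList<>();
while (iter.hasNext()) {
result.add(iter.next() * 7); // same *7 transform as demoMap, one partition at a time
}
return result;
});
mapRDD.collect().forEach(v -> System.out.println(v));
}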
// 2. filter: keep only the odd numbers in the collection
public static void demoFilter(JavaSparkContext ctx) {
List<Integer> list = Arrays.asList(1, 2, 3, 4, 5);
JavaRDD<Integer> listRDD = ctx.parallelize(list);
JavaRDD<Integer> filterRDD = listRDD.filter(num -> num % 2 != 0);
filterRDD.collect().forEach(v -> System.out.println(v));
}
// 3. flatMap: split each line into words
public static void demoFlatMap(JavaSparkContext ctx) {
JavaRDD<String> listRDD = ctx.textFile("hdfs://master:9000/spark/input/hello.txt");
// Approach 1: anonymous inner class (Spark 1.x signature; in Spark 2.x+
// call returns Iterator<String> instead of Iterable<String>)
JavaRDD<String> flatRDD = listRDD.flatMap(new FlatMapFunction<String, String>() {
@Override
public Iterable<String> call(String line) throws Exception {
return Arrays.asList(line.split(" "));
}
});
// Approach 2: the equivalent lambda
JavaRDD<String> flatRDD2 = listRDD.flatMap(line -> Arrays.asList(line.split(" ")));
flatRDD.collect().forEach(v -> System.out.println(v));
}
// 4. sample: randomly sample roughly the given fraction of the data (args: withReplacement, fraction[, seed])
public static void demoSample(JavaSparkContext ctx) {
List<Integer> list = new ArrayList<>();
for (int i = 0; i < 10000; i++) {
list.add(i);
}
JavaRDD<Integer> listRDD = ctx.parallelize(list);
JavaRDD<Integer> sampleRDD = listRDD.sample(true, 0.01); // with replacement, ~1% of the 10000 elements
System.out.println("SampleRDD=>Count:" + sampleRDD.count());
sampleRDD.collect().forEach(s -> System.out.println(s));
}
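// A minimal follow-on sketch: the three-argument overload
// sample(withReplacement, fraction, seed) fixes the random seed so the
// sample is reproducible across runs; the seed 42 here is an arbitrary choice.
public static void demoSampleSeeded(JavaSparkContext ctx) {
List<Integer> list = new ArrayList<>();
for (int i = 0; i < 10000; i++) {
list.add(i);
}
JavaRDD<Integer> listRDD = ctx.parallelize(list);
// Without replacement, ~1% of the data, same elements on every run.
JavaRDD<Integer> sampleRDD = listRDD.sample(false, 0.01, 42L);
System.out.println("SeededSampleRDD=>Count:" + sampleRDD.count());
}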
// 5. union: returns a new dataset combining the source dataset with the argument
public static void demoUnion(JavaSparkContext ctx) {
JavaRDD<Integer> oddRDD = ctx.parallelize(Arrays.asList(1, 3, 5, 7, 9));
JavaRDD<Integer> evenRDD = ctx.parallelize(Arrays.asList(2, 4, 6, 8, 10));
JavaRDD<Integer> unionRDD = oddRDD.union(evenRDD);
unionRDD.collect().forEach(s -> System.out.println(s));
}
// 6. groupByKey: group the word/count pairs by key
public static void demoGroupByKey(JavaSparkContext ctx) {
JavaRDD<String> listRDD = ctx.textFile("hdfs://master:9000/spark/input/hello.txt");
JavaRDD<String> wordRDD = listRDD.flatMap(line -> Arrays.asList(line.split(" "))); 
JavaPairRDD<String, Integer> pairRDD = wordRDD.mapToPair(word -> new Tuple2<String, Integer>(word, 1));
pairRDD.collect().forEach(v -> System.out.println(v._1() + " :" + v._2()));
// groupByKey shuffles every value; for aggregations like counting,
// reduceByKey (example 7) combines on the map side first and is preferred.
JavaPairRDD<String, Iterable<Integer>> groupRDD = pairRDD.groupByKey();
groupRDD.foreach(v -> System.out.println(v._1() + ": " + v._2()));
}
// 7. reduceByKey: total the student count for each class
public static void demoReduceByKey(JavaSparkContext ctx) {
JavaRDD<String> lineRDD = ctx.textFile("hdfs://master:9000/spark/input/class.txt");
JavaPairRDD<String, Integer> pairRDD = lineRDD.mapToPair(line -> {
String[] splits = line.split(" ");
String clazz = splits[0];
int count = Integer.parseInt(splits[2]);
return new Tuple2<String, Integer>(clazz, count);
});
JavaPairRDD<String, Integer> reduceRDD = pairRDD.reduceByKey((v1, v2) -> v1 + v2);
reduceRDD.foreach(t -> System.out.println(t._1() + ": " + t._2()));
}
// 8. join: print the records matched by key from both datasets
public static void demoJoin(JavaSparkContext ctx) {
List<String> maleList = Arrays.asList(
"bd_1 male 20",
"bd_2 male 25",
"bd_3 male 15");
List<String> femaleList = Arrays.asList(
"bd_1 female 2",
"bd_2 female 10",
"bd_3 female 5"
);
JavaRDD<String> maleRDD = ctx.parallelize(maleList);
JavaRDD<String> femaleRDD = ctx.parallelize(femaleList);
JavaPairRDD<String, Integer> malePairRDD = maleRDD.mapToPair(line -> {
String[] splits = line.split(" ");
return new Tuple2<String, Integer>(splits[0], Integer.parseInt(splits[2])));
});
System.out.println("=============malePairRDD===============");
malePairRDD.foreach(t -> System.out.println(t));
JavaPairRDD<String, Integer> femalePairRDD = femaleRDD.mapToPair(line -> {
String[] splits = line.split(" ");
return new Tuple2<String, Integer>(splits[0], Integer.parseInt(splits[2]));
});
System.out.println("=============femalePairRDD===============");
femalePairRDD.foreach(t -> System.out.println(t));
JavaPairRDD<String, Tuple2<Integer, Integer>> joinRDD = malePairRDD.join(femalePairRDD);
System.out.println("=============joinPairRDD===============");
joinRDD.foreach(t -> System.out.println(t._1() + " :" + t._2()));
}
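// A hedged companion sketch: cogroup is the related operator that keeps keys
// present on only one side, yielding a (possibly empty) Iterable per side
// instead of dropping unmatched keys the way join does. The pairs below are
// illustrative sample data.
public static void demoCogroup(JavaSparkContext ctx) {
JavaPairRDD<String, Integer> malePairRDD = ctx.parallelizePairs(Arrays.asList(
new Tuple2<String, Integer>("bd_1", 20),
new Tuple2<String, Integer>("bd_2", 25)));
JavaPairRDD<String, Integer> femalePairRDD = ctx.parallelizePairs(Arrays.asList(
new Tuple2<String, Integer>("bd_1", 2),
new Tuple2<String, Integer>("bd_3", 5)));
// bd_2 and bd_3 each appear with one empty side; join would drop them.
JavaPairRDD<String, Tuple2<Iterable<Integer>, Iterable<Integer>>> cogroupRDD =
malePairRDD.cogroup(femalePairRDD);
cogroupRDD.collect().forEach(t -> System.out.println(t._1() + " :" + t._2()));
}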
// 9. sortByKey: sort students by height
public static void demoSortByKey(JavaSparkContext ctx) {
List<String> list = Arrays.asList(
"zhangsan 176",
"xiaodingding 175",
"xiaobao 173",
"heyajie 174.5",
"liujun 173",
"wangxiaoxiong 150"
);
JavaRDD<String> lineRDD = ctx.parallelize(list);
// Keying by name would sort alphabetically rather than by height:
// JavaPairRDD<String, Double> wordRDD = lineRDD.mapToPair(line -> {
// String[] data = line.split(" ");
// return new Tuple2<String, Double>(data[0], Double.valueOf(data[1]));
// });
// Swapping the two tuple positions makes the height the sort key:
JavaPairRDD<Double, String> heightRDD = lineRDD.mapToPair(line -> {
String[] data = line.split(" ");
return new Tuple2<Double, String>(Double.valueOf(data[1]), data[0]);
});
heightRDD.sortByKey().foreach(t -> System.out.println(t._2() + " :" + t._1()));
}
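// A minimal variant sketch: sortByKey takes a boolean ascending flag, so
// sortByKey(false) ranks the tallest student first. The inline pairs are
// illustrative sample data.
public static void demoSortByKeyDesc(JavaSparkContext ctx) {
JavaPairRDD<Double, String> heightRDD = ctx.parallelizePairs(Arrays.asList(
new Tuple2<Double, String>(176.0, "zhangsan"),
new Tuple2<Double, String>(150.0, "wangxiaoxiong"),
new Tuple2<Double, String>(174.5, "heyajie")));
heightRDD.sortByKey(false).foreach(t -> System.out.println(t._2() + " :" + t._1()));
}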
/**
* TopN: take the top three scores for each class.
*
* @param ctx the shared JavaSparkContext
*/
public static void demoTopN(JavaSparkContext ctx) {
List<String> list = Arrays.asList(
"class1 90",
"class2 88",
"class2 80",
"class1 79",
"class2 60",
"class1 66",
"class2 86",
"class1 78",
"class1 82",
"class2 87"
);
JavaRDD<String> listRDD = ctx.parallelize(list);
JavaPairRDD<String, Double> pairRDD = listRDD.mapToPair(line -> {
String[] splits = line.split(" ");
return new Tuple2<String, Double>(splits[0], Double.valueOf((splits[1])));
});
JavaPairRDD<String, Iterable<Double>> groupRDD = pairRDD.groupByKey();
JavaRDD<Tuple2<String, Iterable<Double>>> mapRDD = groupRDD.map(m -> {
// Descending TreeSet. Double.compare avoids the truncation bug of casting
// (b - a) to int, which treats scores differing by less than 1 as equal;
// note a TreeSet still deduplicates genuinely tied scores.
TreeSet<Double> set = new TreeSet<Double>(new Comparator<Double>() {
@Override
public int compare(Double a, Double b) {
return Double.compare(b, a);
}
});
for (Double d : m._2()) {
set.add(d);
if (set.size() > 3) {
set.pollLast(); // evict the smallest, keeping only the top 3
}
}
return new Tuple2<String, Iterable<Double>>(m._1(), set);
});
System.out.println("=============grouped===============");
groupRDD.foreach(t -> System.out.println(t._1 + ":" + t._2));
System.out.println("=============top3===============");
mapRDD.foreach(t -> System.out.println(t._1 + ":" + t._2));
}
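// A hedged alternative sketch (not part of the original walkthrough):
// aggregateByKey keeps only a bounded top-3 set per key on the map side,
// so the full list of scores per class is never shuffled the way it is
// with groupByKey above. The sample data is abbreviated for illustration.
public static void demoTopNAggregate(JavaSparkContext ctx) {
JavaRDD<String> listRDD = ctx.parallelize(Arrays.asList(
"class1 90", "class2 88", "class2 80", "class1 79", "class1 82", "class2 87"));
JavaPairRDD<String, Double> pairRDD = listRDD.mapToPair(line -> {
String[] splits = line.split(" ");
return new Tuple2<String, Double>(splits[0], Double.valueOf(splits[1]));
});
JavaPairRDD<String, TreeSet<Double>> topRDD = pairRDD.aggregateByKey(
new TreeSet<Double>(Collections.reverseOrder()),
(set, score) -> { // fold one score into the partition-local set
set.add(score);
if (set.size() > 3) {
set.pollLast();
}
return set;
},
(set1, set2) -> { // merge the per-partition sets
set1.addAll(set2);
while (set1.size() > 3) {
set1.pollLast();
}
return set1;
});
topRDD.collect().forEach(t -> System.out.println(t._1() + ":" + t._2()));
}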
public static void main(String[] args) {
SparkConf conf = new SparkConf();
conf.setAppName(JavaSparkTransformation.class.getSimpleName());
conf.setMaster("local");
JavaSparkContext ctx = new JavaSparkContext(conf);
demoTopN(ctx);
ctx.close();
}
}