Table of Contents
- 1. Transformation operators
- 1.1 The filter operator
- 1.2 The map operator
- 1.3 The flatMap operator
- 1.4 The sample operator
- 1.5 The reduceByKey operator
- 1.6 The sortByKey and sortBy operators
- 1.7 The join operator
- 1.8 The union operator
- 1.9 The intersection operator
- 1.10 The subtract operator
- 1.11 The mapPartition operator
- 1.12 The distinct operator (map + reduceByKey + map)
- 1.13 The cogroup operator
- 1.14 The mapPartitionWithIndex operator
- 1.15 The repartition operator
- 1.16 The coalesce operator
- 1.17 The groupByKey operator
- 1.18 The zip operator
- 1.19 The zipWithIndex operator
- 2. Action operators
- 3. Control operators (persistence operators)
1. Transformation operators
Transformation operators are the class of operators (functions) such as map, flatMap, and reduceByKey. Transformation operators are executed lazily (also called lazy evaluation): calling them does not trigger any computation by itself.
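Lazy execution means that a transformation only records the operation in the RDD's lineage; a Spark job is submitted and executed only when an action is invoked. A minimal sketch of this behavior (the class name Operator_lazy is an illustrative name, not part of the original example code):
package com.shsxt.java_Test.core.transform_operator;
import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
public class Operator_lazy {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("lazy");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        JavaRDD<String> words = jsc.parallelize(Arrays.asList("hello", "spark"));
        // map is a transformation: nothing is computed here, Spark only records the lineage.
        JavaRDD<Integer> lengths = words.map(s -> s.length());
        // count is an action: only now is a job submitted and the map actually executed.
        System.out.println(lengths.count());   // prints 2
        jsc.stop();
    }
}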
1.1 The filter operator
filter: filters records by a predicate; records for which the function returns true are kept, and records for which it returns false are dropped.
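The examples in this article read the text file data/word.txt. Its contents are not shown in the original post; judging from the outputs printed below, the file is assumed to contain these four lines:
hello tiantian
hello shsxt
hello gzsxt
hello Spark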
Operator_filter.java:
package com.shsxt.java_Test.core.transform_operator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
/**
* filter
* Keeps the records for which the function returns true; records that return false are filtered out.
*/
public class Operator_filter {
public static void main(String[] args) {
/**
* The SparkConf object mainly configures the environment in which Spark runs:
* 1. the run mode
* 2. the application name
* 3. the resource requirements of the application
*/
SparkConf conf = new SparkConf();
conf.setMaster("local");
conf.setAppName("filter");
/**
* The JavaSparkContext object is the context in which Spark runs and the only channel to the cluster.
*/
JavaSparkContext jsc = new JavaSparkContext(conf);
JavaRDD<String> lines = jsc.textFile("data/word.txt");
JavaRDD<String> resultRDD = lines.filter(new Function<String, Boolean>() {
/**
* Anonymous inner class: as a rule you simply override the call method.
*/
@Override
public Boolean call(String line) throws Exception {
return !line.contains("shsxt");
}
});
resultRDD.foreach(new VoidFunction<String>() {
@Override
public void call(String line) throws Exception {
System.out.println(line);
}
});
jsc.stop();
}
}
hello tiantian
hello gzsxt
hello Spark
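For reference, because Spark's Java functional interfaces such as Function and VoidFunction have a single abstract method, the same filter can be written more compactly with Java 8 lambdas. A minimal sketch, reusing the jsc and lines variables from the example above:
JavaRDD<String> resultRDD = lines.filter(line -> !line.contains("shsxt"));   // keep lines that do not contain "shsxt"
resultRDD.foreach(line -> System.out.println(line));                          // print the remaining lines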
1.2 The map operator
map: maps every element of the RDD to a new element through the function passed to map, producing a new RDD.
Characteristic: one input record produces exactly one output record.
package com.shsxt.java_Test.core.transform_operator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
/**
* map
* Processes each element with the supplied function and returns a new dataset.
* Characteristic: one record in, one record out.
*
* @author root
*/
public class Operator_map {
public static void main(String[] args) {
SparkConf conf = new SparkConf();
conf.setMaster("local");
conf.setAppName("map");
JavaSparkContext jsc = new JavaSparkContext(conf);
JavaRDD<String> line = jsc.textFile("data/word.txt");
JavaRDD<String> mapResult = line.map(new Function<String, String>() {
@Override
public String call(String s) throws Exception {
return s+"~";
}
});
mapResult.foreach(new VoidFunction<String>() {
@Override
public void call(String t) throws Exception {
System.out.println(t);
}
});
jsc.stop();
}
}
hello tiantian~
hello shsxt~
hello gzsxt~
hello Spark~
1.3 The flatMap operator
flatMap: first map, then flatten. Similar to map, but each input element can be mapped to zero or more output elements.
package com.shsxt.java_Test.core.transform_operator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.VoidFunction;
import java.util.Arrays;
import java.util.Iterator;
/**
* flatMap
* One input record produces zero or more output records.
* @author root
*/
public class Operator_flatMap {
public static void main(String[] args) {
SparkConf conf = new SparkConf();
conf.setMaster("local");
conf.setAppName("flatMap");
JavaSparkContext jsc = new JavaSparkContext(conf);
JavaRDD<String> lines = jsc.textFile("./data/word.txt",3);
JavaRDD<String> flatMapResult = lines.flatMap(new FlatMapFunction<String, String>() {
@Override
public Iterator<String> call(String s) throws Exception {
return Arrays.asList(s.split(" ")).iterator();
}
});
flatMapResult.foreach(new VoidFunction<String>() {
@Override
public void call(String t) throws Exception {
System.out.println(t);
}
});
jsc.stop();
}
}
hello
tiantian
hello
shsxt
hello
gzsxt
hello
Spark
1.4 The sample operator
sample: the random sampling operator; based on the fraction passed in, it samples the data proportionally, either with or without replacement.
package com.shsxt.java_Test.core.transform_operator;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import scala.Tuple2;
public class Operator_sample {
public static void main(String[] args) {
SparkConf conf = new SparkConf();
conf.setMaster("local");
conf.setAppName("sample");
JavaSparkContext jsc = new JavaSparkContext(conf);
JavaRDD<String> lines = jsc.textFile("data/word.txt");
JavaPairRDD<String, Integer> flatMapToPair = lines.flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
@Override
public Iterator<Tuple2<String, Integer>> call(String t)
throws Exception {
List<Tuple2<String,Integer>> tupleList = new ArrayList<Tuple2<String,Integer>>();
tupleList.add(new Tuple2<String,Integer>(t,1));
return tupleList.iterator();
}
});
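// A note on the arguments of sample(withReplacement, fraction, seed) used below:
// withReplacement - true means the same element may be picked more than once;
// fraction - the expected fraction of records to keep (approximate, not an exact count);
// seed - fixing the seed makes the sampling reproducible across runs.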
JavaPairRDD<String, Integer> sampleResult = flatMapToPair.sample(true,0.3,4);
sampleResult.foreach(x-> System.out.println(x));
jsc.stop();
}
}
(hello shsxt,1)
1.5 The reduceByKey operator
reduceByKey: combines the values that share the same key according to the supplied reduce function.
package com.shsxt.java_Test.core.transform_operator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.Arrays;
public class Operator_reduceByKey {
public static void main(String[] args) {
SparkConf conf = new SparkConf();
conf.setMaster("local").setAppName("reduceByKey");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaPairRDD<String, Integer> parallelizePairs = sc.parallelizePairs(Arrays.asList(
new Tuple2<>("a", 1),
new Tuple2<>("a", 1),
new Tuple2<>("a", 1),
new Tuple2<>("a", 1),
new Tuple2<>("a", 1),
new Tuple2<>("a", 1),
new Tuple2<>("a", 1)
), 2);
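// The pair RDD above was created with 2 partitions, so reduceByKey first combines
// the values of key "a" inside each partition (map-side combine) and then merges the
// two partial sums; the "v1 ... v2 ..." lines printed by the reduce function below show these steps.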
parallelizePairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer v1, Integer v2) throws Exception {
System.out.println("v1: " +v1 + " v2: " + v2);
return v1 + v2;
}
}).foreach(new VoidFunction<Tuple2<String, Integer>>() {
@Override
public void call(Tuple2<String, Integer> tuple2) throws Exception {
System.out.println(tuple2);
}
});
}
}
v1: 1 v2: 1
v1: 2 v2: 1
v1: 1 v2: 1
v1: 2 v2: 1
v1: 3 v2: 1
v1: 3 v2: 4
(a,7)
1.6 The sortByKey and sortBy operators
sortByKey: works on K,V-format RDDs and sorts them by key in ascending or descending order.
The Java pair RDD API does not expose a sortBy operator directly; the same effect can be achieved by combining mapToPair with sortByKey (swap key and value, sort by key, swap back), as the example below does.
package com.shsxt.java_Test.core.transform_operator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Iterator;
public class Operator_sortByKey {
public static void main(String[] args) {
SparkConf conf = new SparkConf();
conf.setMaster("local");
conf.setAppName("sortByKey");
JavaSparkContext jsc = new JavaSparkContext(conf);
JavaRDD<String> lines = jsc.textFile("data/word.txt");
JavaRDD<String> flatMap = lines.flatMap(new FlatMapFunction<String, String>() {
@Override
public Iterator<String> call(String t) throws Exception {
return Arrays.asList(t.split(" ")).iterator();
}
});
JavaPairRDD<String, Integer> mapToPair = flatMap.mapToPair(new PairFunction<String, String, Integer>() {
@Override
public Tuple2<String, Integer> call(String s) throws Exception {
return new Tuple2<String, Integer>(s, 1);
}
});
mapToPair.sortByKey().foreach(new VoidFunction<Tuple2<String, Integer>>() {
@Override
public void call(Tuple2<String, Integer> tuple2) throws Exception {
System.out.println(tuple2);
}
});
JavaPairRDD<String, Integer> reduceByKey = mapToPair.reduceByKey(new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer v1, Integer v2) throws Exception {
return v1+v2;
}
});
//To sort by value: swap key and value, sort by key, then swap back
reduceByKey.mapToPair(new PairFunction<Tuple2<String,Integer>, Integer, String>() {
@Override
public Tuple2<Integer, String> call(Tuple2<String, Integer> t)
throws Exception {
return new Tuple2<Integer, String>(t._2, t._1);
}
}).sortByKey(false).mapToPair(new PairFunction<Tuple2<Integer,String>, String, Integer>() {
@Override
public Tuple2<String, Integer> call(Tuple2<Integer, String> t)
throws Exception {
return new Tuple2<String,Integer>(t._2,t._1);
}
}).foreach(new VoidFunction<Tuple2<String,Integer>>() {
@Override
public void call(Tuple2<String, Integer> t) throws Exception {
System.out.println(t);
}
});
}
}
(hello,4)
(Spark,