5 Flink Streaming API
-
Environment
package env;

import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.streaming.api.environment.LocalStreamEnvironment;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * @author wangkai
 */
public class Env {
    public static void main(String[] args) {
        /**
         * Streaming execution environment.
         * Creates an execution environment that represents the context of the current program.
         * If the program is invoked standalone, this method returns a local execution environment;
         * if the program is submitted to a cluster from the command-line client, it returns that
         * cluster's execution environment. In other words, getExecutionEnvironment decides which
         * environment to return based on how the program is run, and is the most common way to
         * create an execution environment.
         */
        StreamExecutionEnvironment streamEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();

        /**
         * Batch execution environment
         */
        ExecutionEnvironment batchEnvironment = ExecutionEnvironment.getExecutionEnvironment();

        /**
         * Create a local execution environment
         */
        LocalStreamEnvironment localEnvironment = StreamExecutionEnvironment.createLocalEnvironment();

        /**
         * Create a remote execution environment
         */
        StreamExecutionEnvironment remoteEnvironment = StreamExecutionEnvironment.createRemoteEnvironment("xxx", 1234, "xxx.jar");
    }
}
-
Source
package source;

import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;

import java.util.Arrays;
import java.util.Properties;

/**
 * @author wangkai
 */
public class Source {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        /**
         * 1. Read from a collection
         */
        DataStreamSource<String> source = env.fromCollection(Arrays.asList("zs", "ls", "wmz"));

        /**
         * 2. Read from a file
         */
        DataStreamSource<String> source1 = env.readTextFile("path");

        /**
         * 3. Read from Kafka
         */
        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "xxx");
        properties.setProperty("group.id", "xxx");
        DataStreamSource<String> source2 = env.addSource(
                new FlinkKafkaConsumer011<String>("topic", new SimpleStringSchema(), properties));

        /**
         * 4. Custom source
         */
        DataStreamSource<String> source3 = env.addSource(new MySourceFunction());
    }

    public static class MySourceFunction implements SourceFunction<String> {
        private boolean isRunning = true;

        public void run(SourceContext<String> ctx) throws Exception {
            // emit records until cancel() flips the flag; sleep to avoid a busy loop
            while (isRunning) {
                ctx.collect("custom source record");
                Thread.sleep(1000);
            }
        }

        public void cancel() {
            isRunning = false;
        }
    }
}
-
Transform
-
Transform operators
package transform;

import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * @author wangkai
 */
public class TransForm {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> source = env.socketTextStream("", 7777);

        source.map(new MapFunction<String, String>() {
            /**
             * map operator: takes one element and produces one element
             */
            public String map(String value) throws Exception {
                Object parse = JSONObject.parse(value);
                return parse.toString();
            }
        });

        SingleOutputStreamOperator<Tuple2<String, Integer>> flatMapSource = source.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            /**
             * flatMap operator: takes one element and produces zero, one, or more elements
             */
            public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {
                String[] s = value.split(" ");
                for (int i = 0; i < s.length; i++) {
                    out.collect(new Tuple2<String, Integer>(s[i], 1));
                }
            }
        });

        source.filter(new FilterFunction<String>() {
            /**
             * filter operator: evaluates a boolean function for each element and keeps those
             * for which the function returns true
             */
            public boolean filter(String value) throws Exception {
                // compare string contents with equals(), not ==
                return "filter".equals(value);
            }
        });

        KeyedStream<Tuple2<String, Integer>, String> keyedStream = flatMapSource.keyBy(new KeySelector<Tuple2<String, Integer>, String>() {
            public String getKey(Tuple2<String, Integer> value) throws Exception {
                return value.f0;
            }
        });
    }
}
-
Rolling aggregation operators
-
sum
-
max
-
min
-
minBy
-
maxBy
package transform;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * @author wangkai
 */
public class RollingAggregation {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        /**
         * Rolling aggregation operators: sum, max, min, maxBy, minBy
         *
         * Sample input file:
         * sensor1 123456789 35
         * sensor2 234567890 36
         * sensor3 456962456 24
         * sensor1 123456789 20
         */
        DataStreamSource<String> source = env.readTextFile("D:\\git\\csdn-flink\\csdn-flink-1\\src\\main\\resources\\sensor");
        SingleOutputStreamOperator<Tuple3<String, String, Long>> stream = source.flatMap(new FlatMapFunction<String, Tuple3<String, String, Long>>() {
            public void flatMap(String value, Collector<Tuple3<String, String, Long>> out) throws Exception {
                String[] s = value.split(" ");
                out.collect(new Tuple3<String, String, Long>(s[0], s[1], Long.parseLong(s[2])));
            }
        });

        /**
         * sum
         */
        SingleOutputStreamOperator<Tuple3<String, String, Long>> sum = stream
                .keyBy(0)
                .sum(2);
        sum.print("sum");

        /**
         * max
         */
        SingleOutputStreamOperator<Tuple3<String, String, Long>> max = stream
                .keyBy(0)
                .max(2);
        max.print("max");

        /**
         * min
         */
        SingleOutputStreamOperator<Tuple3<String, String, Long>> min = stream
                .keyBy(0)
                .min(2);
        min.print("min");

        /**
         * maxBy
         */
        SingleOutputStreamOperator<Tuple3<String, String, Long>> maxBy = stream
                .keyBy(0)
                .maxBy(2);
        maxBy.print("maxby");

        /**
         * minBy
         */
        SingleOutputStreamOperator<Tuple3<String, String, Long>> minBy = stream
                .keyBy(0)
                .minBy(2);
        minBy.print("minby");

        env.execute("rolling aggregate");

        /**
         * Output:
         * max> (sensor1,123456789,35)
         * min> (sensor1,123456789,35)
         * maxby> (sensor1,123456789,35)
         * minby> (sensor1,123456789,35)
         * sum> (sensor1,123456789,35)
         * maxby> (sensor2,234567890,36)
         * min> (sensor2,234567890,36)
         * max> (sensor2,234567890,36)
         * maxby> (sensor3,456962456,24)
         * sum> (sensor2,234567890,36)
         * min> (sensor3,456962456,24)
         * minby> (sensor2,234567890,36)
         * sum> (sensor3,456962456,24)
         * min> (sensor1,123456789,20)
         * maxby> (sensor1,123456789,35)
         * max> (sensor3,456962456,24)
         * sum> (sensor1,123456789,55)
         * max> (sensor1,123456789,35)
         * minby> (sensor3,456962456,24)
         * minby> (sensor1,123456789,20)
         */
    }
}
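Note on max versus maxBy (and likewise min versus minBy): max only updates the aggregated field and keeps the remaining fields from the first record seen for that key, while maxBy returns the entire record that contains the maximum value. In this sample data the non-aggregated fields happen to be identical per key, so the printed output does not show the difference.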
-
-
Reduce
package transform;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * @author wangkai
 */
public class Reduce {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        /**
         * KeyedStream → DataStream: an aggregation over a keyed stream that merges the current
         * element with the previously aggregated result and produces a new value. The returned
         * stream contains the result of every aggregation step, not only the final result.
         *
         * Sample input file:
         * sensor1 123456789 35
         * sensor2 234567890 36
         * sensor3 456962456 24
         * sensor1 123456789 20
         */
        DataStreamSource<String> source = env.readTextFile("D:\\git\\csdn-flink\\csdn-flink-1\\src\\main\\resources\\sensor");
        SingleOutputStreamOperator<Tuple3<String, String, Long>> stream = source.flatMap(new FlatMapFunction<String, Tuple3<String, String, Long>>() {
            public void flatMap(String value, Collector<Tuple3<String, String, Long>> out) throws Exception {
                String[] s = value.split(" ");
                out.collect(new Tuple3<String, String, Long>(s[0], s[1], Long.parseLong(s[2])));
            }
        });

        SingleOutputStreamOperator<Tuple3<String, String, Long>> reduce = stream
                .keyBy(0)
                .reduce(new ReduceFunction<Tuple3<String, String, Long>>() {
                    public Tuple3<String, String, Long> reduce(Tuple3<String, String, Long> value1, Tuple3<String, String, Long> value2) throws Exception {
                        // keep the key fields and the maximum of the two readings
                        return new Tuple3<String, String, Long>(value1.f0, value1.f1, Math.max(value1.f2, value2.f2));
                    }
                });
        reduce.print("reduce");

        env.execute("reduce");

        /**
         * Output:
         * reduce> (sensor1,123456789,35)
         * reduce> (sensor2,234567890,36)
         * reduce> (sensor3,456962456,24)
         * reduce> (sensor1,123456789,35)
         */
    }
}
-
Split and select
package transform;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.collector.selector.OutputSelector;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.datastream.SplitStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

import java.util.Collections;

/**
 * @author
 */
public class SplitAndSelect {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        /**
         * split: DataStream → SplitStream: splits one DataStream into two or more DataStreams according to some criteria.
         * select: SplitStream → DataStream: selects one or more DataStreams from a SplitStream.
         *
         * Sample input file:
         * sensor1 123456789 35
         * sensor2 234567890 36
         * sensor3 456962456 24
         * sensor1 123456789 20
         */
        DataStreamSource<String> source = env.readTextFile("D:\\git\\csdn-flink\\csdn-flink-1\\src\\main\\resources\\sensor");
        SingleOutputStreamOperator<Tuple3<String, String, Long>> stream = source.flatMap(new FlatMapFunction<String, Tuple3<String, String, Long>>() {
            public void flatMap(String value, Collector<Tuple3<String, String, Long>> out) throws Exception {
                String[] s = value.split(" ");
                out.collect(new Tuple3<String, String, Long>(s[0], s[1], Long.parseLong(s[2])));
            }
        });

        SplitStream<Tuple3<String, String, Long>> split = stream.split(new OutputSelector<Tuple3<String, String, Long>>() {
            public Iterable<String> select(Tuple3<String, String, Long> value) {
                // tag each record as "high" or "low" depending on the reading
                return value.f2 > 30 ? Collections.singletonList("high") : Collections.singletonList("low");
            }
        });
        split.print("split");

        DataStream<Tuple3<String, String, Long>> high = split.select("high");
        DataStream<Tuple3<String, String, Long>> low = split.select("low");
        high.print("high");
        low.print("low");

        env.execute("split and select");

        /**
         * Output:
         * split> (sensor1,123456789,35)
         * high> (sensor1,123456789,35)
         * split> (sensor2,234567890,36)
         * high> (sensor2,234567890,36)
         * split> (sensor3,456962456,24)
         * low> (sensor3,456962456,24)
         * split> (sensor1,123456789,20)
         * low> (sensor1,123456789,20)
         */
    }
}
-
connect and CoMap
DataStream<Tuple3<String, String, Long>> high = split.select("high");
DataStream<Tuple3<String, String, Long>> low = split.select("low");
high.print("high");
low.print("low");

ConnectedStreams<Tuple3<String, String, Long>, Tuple3<String, String, Long>> connect = high.connect(low);
connect.flatMap(new CoFlatMapFunction<Tuple3<String, String, Long>, Tuple3<String, String, Long>, Object>() {
    // flatMap1 handles elements from the first connected stream
    public void flatMap1(Tuple3<String, String, Long> value, Collector<Object> out) throws Exception {
        out.collect(value);
    }

    // flatMap2 handles elements from the second connected stream
    public void flatMap2(Tuple3<String, String, Long> value, Collector<Object> out) throws Exception {
        out.collect(value);
    }
});
-
union
DataStream<Tuple3<String, String, Long>> union = high.union(low);

Differences between Connect and Union:
1. Union requires the input streams to have the same type; Connect allows different types, which are then unified in the subsequent coMap (see the sketch below).
2. Connect only operates on two streams, while Union can combine more than two.
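A minimal sketch, reusing the high and low streams from the split example, of how connect joins two streams of different types and a CoMapFunction then unifies them (the warning messages are illustrative):

// map the "low" stream to plain String warnings, so the two connected streams have different types
DataStream<String> warning = low.map(new MapFunction<Tuple3<String, String, Long>, String>() {
    public String map(Tuple3<String, String, Long> value) throws Exception {
        return "low reading warning: " + value.f0;
    }
});

ConnectedStreams<Tuple3<String, String, Long>, String> connected = high.connect(warning);

DataStream<String> unified = connected.map(new CoMapFunction<Tuple3<String, String, Long>, String, String>() {
    // map1 handles elements from the first (Tuple3) stream
    public String map1(Tuple3<String, String, Long> value) throws Exception {
        return "high: " + value.toString();
    }

    // map2 handles elements from the second (String) stream
    public String map2(String value) throws Exception {
        return value;
    }
});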
-
-
Supported data types
Flink streaming applications process event streams represented as data objects. Inside Flink, these objects need to be handled: they must be serialized and deserialized so they can be sent over the network or read from the state backend, checkpoints, and savepoints. To do this efficiently, Flink needs to know exactly which data types the application works with. Flink uses the concept of type information to represent data types and generates a dedicated serializer, deserializer, and comparator for each type.
Flink also has a type extraction system that analyzes the input and return types of functions to obtain the type information automatically, and from it the serializers and deserializers. In some cases, however, such as lambda functions or generic types, the type information must be provided explicitly for the application to work correctly or to improve its performance.
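For example, a lambda-based flatMap loses its generic output type to erasure, so a type hint has to be supplied explicitly. A minimal sketch (the stream and field names are illustrative; Types comes from org.apache.flink.api.common.typeinfo.Types):

DataStream<Tuple2<String, Integer>> wordCounts = source
        .flatMap((String line, Collector<Tuple2<String, Integer>> out) -> {
            for (String word : line.split(" ")) {
                out.collect(new Tuple2<>(word, 1));
            }
        })
        // the lambda erases the Tuple2 type parameters, so tell Flink explicitly
        .returns(Types.TUPLE(Types.STRING, Types.INT));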
-
User-defined functions (UDFs)
-
Function classes
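Flink exposes all user-defined function interfaces (MapFunction, FilterFunction, FlatMapFunction, and so on) as interfaces or abstract classes that can be implemented as standalone classes. A minimal sketch of a separate filter class (the keyword "flink" is illustrative):

public static class FlinkFilter implements FilterFunction<String> {
    public boolean filter(String value) throws Exception {
        // keep only lines that contain the keyword
        return value.contains("flink");
    }
}

// usage
DataStream<String> flinkLines = source.filter(new FlinkFilter());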
-
Anonymous functions (lambda functions)
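The same logic can also be passed as an anonymous inner class or, more concisely, as a lambda expression; a sketch reusing the filter above:

// anonymous inner class
DataStream<String> flinkLines = source.filter(new FilterFunction<String>() {
    public boolean filter(String value) throws Exception {
        return value.contains("flink");
    }
});

// lambda expression
DataStream<String> flinkLines2 = source.filter(value -> value.contains("flink"));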
-
Rich functions
"Rich functions" are a function-class interface provided by the DataStream API; every Flink function class has a Rich version. They differ from regular functions in that they can access the runtime context and have lifecycle methods, which makes it possible to implement more complex functionality.
Typical lifecycle methods are:
open() is the initialization method of a rich function; it is called before an operator such as map or filter starts processing.
close() is the last method called in the lifecycle and is used for cleanup work.
getRuntimeContext() provides information from the function's RuntimeContext, such as the parallelism the function runs with, the task name, and access to state.
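A minimal sketch of a rich function that uses these lifecycle methods (the mapping logic is illustrative):

public static class MyRichMapper extends RichMapFunction<String, Tuple2<String, Integer>> {

    @Override
    public void open(Configuration parameters) throws Exception {
        // initialization work, e.g. establish a database connection
        System.out.println("open, subtask " + getRuntimeContext().getIndexOfThisSubtask());
    }

    public Tuple2<String, Integer> map(String value) throws Exception {
        // the runtime context is also available inside the processing methods
        return new Tuple2<>(value, getRuntimeContext().getIndexOfThisSubtask());
    }

    @Override
    public void close() throws Exception {
        // cleanup work, e.g. close the connection
        System.out.println("close");
    }
}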
-
-
Sink
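Flink writes results out through stream.addSink(...); the official connectors (Kafka, Redis, Elasticsearch, JDBC, and others) all provide SinkFunction implementations. A minimal sketch of a Kafka sink, matching the FlinkKafkaConsumer011 version used in the Source example (stringStream, the broker list, and the topic are placeholders):

// assumes stringStream is a DataStream<String>
stringStream.addSink(new FlinkKafkaProducer011<String>("localhost:9092", "sinkTopic", new SimpleStringSchema()));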
-
Custom sink function
package sink;

import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;

public class MyJdbcSink extends RichSinkFunction<String> {

    Connection conn = null;
    PreparedStatement insertStmt = null;

    @Override
    public void open(Configuration parameters) throws Exception {
        // open the JDBC connection and prepare the insert statement once per parallel instance
        conn = DriverManager.getConnection("url", "username", "password");
        insertStmt = conn.prepareStatement("insert into xxx (a) values (?)");
    }

    /**
     * Called for every record: bind the incoming value and execute the insert
     */
    @Override
    public void invoke(String value, Context context) throws Exception {
        insertStmt.setString(1, value);
        insertStmt.execute();
    }

    @Override
    public void close() throws Exception {
        insertStmt.close();
        conn.close();
    }
}
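The custom sink is attached like any other; a usage sketch, again assuming a DataStream<String> named stringStream:

stringStream.addSink(new MyJdbcSink());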
-