Flink-6: Flink Multi-Stream Operation APIs
Flink multi-stream operations
1. split (deprecated, and removed in Flink 1.12; use side outputs, covered in the next section, instead)
Example code:
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
DataStreamSource<Integer> numbers = env.fromElements(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
// tag each record, splitting the stream into even and odd numbers
SplitStream<Integer> splitStream = numbers.split(new OutputSelector<Integer>() {
    @Override
    public Iterable<String> select(Integer value) {
        List<String> out = new ArrayList<String>();
        if (value % 2 == 0) {
            out.add("even"); // tag this record as even
        } else {
            out.add("odd");  // tag this record as odd
        }
        return out; // return the collection of tags
    }
});
// select the even-number stream
DataStream<Integer> even = splitStream.select("even");
// select the odd-number stream
DataStream<Integer> odd = splitStream.select("odd");
// select both tags, i.e. all records
DataStream<Integer> all = splitStream.select("even", "odd");
2. Stream splitting with SideOutput (side output streams)
When splitting a stream with SideOutput, the underlying mechanism is still tagging: each record is tagged according to the conditions you supply, and when a side stream is later retrieved, the operator internally filters the data and emits only the records carrying the matching tag.
Example code:
package com.yang.flink.sideStream;
import com.alibaba.fastjson.JSON;
import com.yang.flink.source.MySourceFunction;
import com.yang.flink.vo.EventLog;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;
public class SideOutputDemo {
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
configuration.setInteger("rest.port", 8822);
StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(configuration);
env.setParallelism(1);
// enable checkpointing
env.enableCheckpointing(5000, CheckpointingMode.EXACTLY_ONCE);
env.getCheckpointConfig().setCheckpointStorage("file:///e:/ckpt");
// build a source data stream
DataStreamSource<EventLog> streamSource = env.addSource(new MySourceFunction());
/**
 * Requirement: split the behavior-event stream
 * appLaunch events go to one side stream
 * putBack events go to another side stream
 * all events stay in the main stream
 */
// define the OutputTags once, so the same instances serve both tagging and retrieval
OutputTag<EventLog> launchTag = new OutputTag<EventLog>("launch", TypeInformation.of(EventLog.class));
OutputTag<String> backTag = new OutputTag<String>("back", TypeInformation.of(String.class));
SingleOutputStreamOperator<EventLog> process = streamSource.process(new ProcessFunction<EventLog, EventLog>() {
    @Override
    public void processElement(EventLog eventLog, Context ctx, Collector<EventLog> out) throws Exception {
        String eventId = eventLog.getEventId();
        if ("appLaunch".equals(eventId)) {
            ctx.output(launchTag, eventLog);
        } else if ("putBack".equals(eventId)) {
            ctx.output(backTag, JSON.toJSONString(eventLog));
        }
        // every event also goes to the main stream
        out.collect(eventLog);
    }
});
// appLaunch event stream, retrieved as a side output
DataStream<EventLog> launch = process.getSideOutput(launchTag);
launch.print("launch");
// putBack event stream (serialized to JSON), retrieved as a side output
DataStream<String> back = process.getSideOutput(backTag);
back.print("back");
env.execute();
}
}
3. connect operation
- connect joins two DataStreams, whose element types may be the same or different, into a new ConnectedStreams. Note that connect differs from union: although calling connect merges the two streams into one ConnectedStreams, the two streams inside remain independent of each other. The biggest benefit of this method is that the two streams can share state (a state-sharing sketch follows the example below).
- Example code:
package com.yang.flink.sideStream;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.ConnectedStreams;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoMapFunction;
public class StreamConnectDemo {
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
configuration.setInteger("rest.port", 8822);
StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(configuration);
env.setParallelism(1);
// enable checkpointing
env.enableCheckpointing(5000, CheckpointingMode.EXACTLY_ONCE);
env.getCheckpointConfig().setCheckpointStorage("file:///e:/ckpt");
// build two socket data streams
DataStreamSource<String> streamSource1 = env.socketTextStream("hadoop102", 9999);
DataStreamSource<String> streamSource2 = env.socketTextStream("hadoop102", 9998);
ConnectedStreams<String, String> connect = streamSource1.connect(streamSource2);
SingleOutputStreamOperator<String> result = connect.map(new CoMapFunction<String, String, String>() {
String prefix = "yang";
@Override
public String map1(String value) throws Exception {
// left stream: multiply the number by 10 and return it as a string
return prefix + (Integer.parseInt(value) * 10);
}
@Override
public String map2(String value) throws Exception {
// right stream: convert the letters to uppercase
return prefix+value.toUpperCase();
}
});
result.print();
env.execute();
}
}
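The demo above maps the two connected streams independently and does not yet demonstrate the state-sharing benefit mentioned earlier. Below is a minimal sketch of sharing keyed state across both sides of a ConnectedStreams (the class name StreamConnectStateDemo, the ports, and the id,number input format are illustrative assumptions, not from the original notes): both streams are keyed by id, and a RichCoFlatMapFunction holds a single ValueState that flatMap1 writes and flatMap2 reads.
package com.yang.flink.sideStream;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.RichCoFlatMapFunction;
import org.apache.flink.util.Collector;
public class StreamConnectStateDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        // two socket streams, each carrying records of the form: id,number
        SingleOutputStreamOperator<Tuple2<String, Integer>> stream1 = env.socketTextStream("hadoop102", 9999)
                .map(s -> {
                    String[] split = s.split(",");
                    return Tuple2.of(split[0], Integer.parseInt(split[1]));
                }).returns(new TypeHint<Tuple2<String, Integer>>() {});
        SingleOutputStreamOperator<Tuple2<String, Integer>> stream2 = env.socketTextStream("hadoop102", 9998)
                .map(s -> {
                    String[] split = s.split(",");
                    return Tuple2.of(split[0], Integer.parseInt(split[1]));
                }).returns(new TypeHint<Tuple2<String, Integer>>() {});
        // keyBy both sides with the same key, so keyed state is shared per key
        stream1.connect(stream2)
                .keyBy(t -> t.f0, t -> t.f0)
                .flatMap(new RichCoFlatMapFunction<Tuple2<String, Integer>, Tuple2<String, Integer>, String>() {
                    // one keyed state slot, visible to both flatMap1 and flatMap2
                    private transient ValueState<Integer> lastLeft;
                    @Override
                    public void open(Configuration parameters) {
                        lastLeft = getRuntimeContext().getState(
                                new ValueStateDescriptor<>("lastLeft", Integer.class));
                    }
                    @Override
                    public void flatMap1(Tuple2<String, Integer> value, Collector<String> out) throws Exception {
                        // the left stream writes the shared state
                        lastLeft.update(value.f1);
                    }
                    @Override
                    public void flatMap2(Tuple2<String, Integer> value, Collector<String> out) throws Exception {
                        // the right stream reads what the left stream last wrote for the same key
                        out.collect(value.f0 + ": right=" + value.f1 + ", lastLeft=" + lastLeft.value());
                    }
                })
                .print();
        env.execute();
    }
}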
4. Union merge operation
This method merges two or more DataStreams of the same element type into a single DataStream. The signature DataStream union(DataStream... streams) shows that union takes varargs, so it can merge two or more streams in one call; every stream participating in the union must have the same element type, as shown in the sketch below.
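Since the notes give no example for union, here is a minimal sketch in the style of the other demos (the class name StreamUnionDemo and the socket ports are illustrative assumptions):
package com.yang.flink.sideStream;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
public class StreamUnionDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        // three socket streams with the same element type (String)
        DataStreamSource<String> streamSource1 = env.socketTextStream("hadoop102", 9999);
        DataStreamSource<String> streamSource2 = env.socketTextStream("hadoop102", 9998);
        DataStreamSource<String> streamSource3 = env.socketTextStream("hadoop102", 9997);
        // union takes varargs: merge two or more same-typed streams into one DataStream
        DataStream<String> unioned = streamSource1.union(streamSource2, streamSource3);
        unioned.print();
        env.execute();
    }
}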
5. coGroup (co-grouping)
In co-grouping, within each window:
- the left iterable holds one group of the left stream's records for that window, e.g. the group with id=1
- the right iterable holds one group of the right stream's records for that window, e.g. the group with id=1
- within the same window, the two iterables always hold records grouped under the same key
- Example code:
package com.yang.flink.sideStream;
import org.apache.flink.api.common.functions.CoGroupFunction;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;
public class CoGroupDemo {
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
configuration.setInteger("rest.port", 8822);
StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(configuration);
env.setParallelism(1);
// enable checkpointing
env.enableCheckpointing(5000, CheckpointingMode.EXACTLY_ONCE);
env.getCheckpointConfig().setCheckpointStorage("file:///e:/ckpt");
// build data stream 1; socket records arrive in the format: id,name
DataStreamSource<String> streamSource1 = env.socketTextStream("hadoop102", 9999);
SingleOutputStreamOperator<Tuple2<String, String>> stream1 = streamSource1.map(s -> {
String[] split = s.split(",");
return Tuple2.of(split[0], split[1]);
}).returns(new TypeHint<Tuple2<String, String>>() {});
// build data stream 2; socket records arrive in the format: id,age,city
DataStreamSource<String> streamSource2 = env.socketTextStream("hadoop102", 9998);
SingleOutputStreamOperator<Tuple3<String, String, String>> stream2 = streamSource2.map(s -> {
String[] split = s.split(",");
return Tuple3.of(split[0], split[1], split[2]);
}).returns(new TypeHint<Tuple3<String, String, String>>() {});
DataStream<String> coStream = stream1.coGroup(stream2)
.where(s -> s.f0)
.equalTo(s -> s.f0)
.window(TumblingProcessingTimeWindows.of(Time.seconds(20)))
.apply(new CoGroupFunction<Tuple2<String, String>, Tuple3<String, String, String>, String>() {
/**
 * @param first the records from the first stream in this co-group
 * @param second the records from the second stream in this co-group
 * @param out the collector for emitting results
 * @throws Exception
 */
@Override
public void coGroup(Iterable<Tuple2<String, String>> first, Iterable<Tuple3<String, String, String>> second, Collector<String> out) throws Exception {
// implement the left-outer-join logic
for (Tuple2<String, String> tuple2 : first) {
    boolean matched = false;
    for (Tuple3<String, String, String> tuple3 : second) {
        out.collect(tuple2.f0 + "," + tuple2.f1 + "," + tuple3.f0 + "," + tuple3.f1 + "," + tuple3.f2);
        matched = true;
    }
    if (!matched) {
        // no right-side records for this key in the window: pad with nulls
        out.collect(tuple2.f0 + "," + tuple2.f1 + ",null,null,null");
    }
}
}
});
coStream.print();
env.execute();
}
}
6. Join operation
The join operator pairs up records from the two streams that fall into the same window and share the same key; unlike coGroup, it emits output only for matched pairs (an inner join).
package com.yang.flink.sideStream;
import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
public class JoinDemo {
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
configuration.setInteger("rest.port", 8822);
StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(configuration);
env.setParallelism(1);
// enable checkpointing
env.enableCheckpointing(5000, CheckpointingMode.EXACTLY_ONCE);
env.getCheckpointConfig().setCheckpointStorage("file:///e:/ckpt");
// build data stream 1; socket records arrive in the format: id,name
DataStreamSource<String> streamSource1 = env.socketTextStream("hadoop102", 9999);
SingleOutputStreamOperator<Tuple2<String, String>> stream1 = streamSource1.map(s -> {
String[] split = s.split(",");
return Tuple2.of(split[0], split[1]);
}).returns(new TypeHint<Tuple2<String, String>>() {});
// build data stream 2; socket records arrive in the format: id,age,city
DataStreamSource<String> streamSource2 = env.socketTextStream("hadoop102", 9998);
SingleOutputStreamOperator<Tuple3<String, String, String>> stream2 = streamSource2.map(s -> {
String[] split = s.split(",");
return Tuple3.of(split[0], split[1], split[2]);
}).returns(new TypeHint<Tuple3<String, String, String>>() {});
/**
 * The stream join operator.
 * Case background:
 *   stream 1 records: id,name
 *   stream 2 records: id,age,city
 * Use the join operator to correlate the two streams by id.
 */
DataStream<String> joinedStream = stream1.join(stream2)
.where(tp2 -> tp2.f0)
.equalTo(tp3 -> tp3.f0)
.window(TumblingProcessingTimeWindows.of(Time.seconds(20)))
.apply(new JoinFunction<Tuple2<String, String>, Tuple3<String, String, String>, String>() {
@Override
public String join(Tuple2<String, String> t1, Tuple3<String, String, String> t2) throws Exception {
return t1.f0 + "," + t1.f1 + "," + t2.f0 + "," + t2.f1 + "," + t2.f2;
}
});
joinedStream.print();
env.execute();
}
}