【Flink-6: Multi-Stream Operation APIs】

Flink Multi-Stream Operations

1. split (stream splitting; deprecated and removed as of Flink 1.12, replaced by side outputs, see section 2)

Example code:

DataStreamSource<Integer> numbers = env.fromElements(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
// tag each element so the stream can be split into even and odd numbers
SplitStream<Integer> splited = numbers.split(new OutputSelector<Integer>() {
    @Override
    public Iterable<String> select(Integer value) {
        List<String> out = new ArrayList<String>();
        if (value % 2 == 0) {
            out.add("even"); // tag this element as even
        } else {
            out.add("odd");  // tag this element as odd
        }
        return out; // return the collection of tags
    }
});
// pick out the even elements
DataStream<Integer> even = splited.select("even");
// pick out the odd elements
DataStream<Integer> odd = splited.select("odd");
// pick out both even and odd elements (the whole stream)
DataStream<Integer> all = splited.select("even", "odd");
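Since split/select is gone from current Flink, the same odd/even routing is done with side outputs today (covered in detail in the next section). A minimal sketch, reusing the numbers stream from above; the tag name is illustrative:

final OutputTag<Integer> oddTag = new OutputTag<Integer>("odd") {}; // anonymous subclass captures the element type

SingleOutputStreamOperator<Integer> evenStream = numbers.process(new ProcessFunction<Integer, Integer>() {
    @Override
    public void processElement(Integer value, Context ctx, Collector<Integer> out) {
        if (value % 2 == 0) {
            out.collect(value);        // even numbers stay on the main stream
        } else {
            ctx.output(oddTag, value); // odd numbers go to the "odd" side output
        }
    }
});
DataStream<Integer> oddStream = evenStream.getSideOutput(oddTag);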

2. SideOutput (splitting with side outputs)

When splitting a stream with SideOutput, the underlying mechanism is again tagging: each record is tagged according to the conditions you provide, and when a side stream is later retrieved, the operator internally filters so that only the records carrying the matching tag are emitted on that stream.

Full example:

package com.yang.flink.sideStream;

import com.alibaba.fastjson.JSON;
import com.yang.flink.source.MySourceFunction;
import com.yang.flink.vo.EventLog;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;

public class SideOutputDemo {
    public static void main(String[] args) throws Exception {

        Configuration configuration = new Configuration();
        configuration.setInteger("rest.port", 8822);
        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(configuration);
        env.setParallelism(1);


        // enable checkpointing
        env.enableCheckpointing(5000, CheckpointingMode.EXACTLY_ONCE);
        env.getCheckpointConfig().setCheckpointStorage("file:///e:/ckpt");

        // build the source stream
        DataStreamSource<EventLog> streamSource = env.addSource(new MySourceFunction());

        /**
         * Requirement: route the behavior events into separate streams
         *      appLaunch events go to one side stream
         *      putBack events go to another side stream
         *      every event also goes to the main stream
         */
        SingleOutputStreamOperator<EventLog> process = streamSource.process(new ProcessFunction<EventLog, EventLog>() {
            @Override
            public void processElement(EventLog eventLog, Context ctx, Collector<EventLog> out) throws Exception {

                String eventId = eventLog.getEventId();
                if ("appLaunch".equals(eventId)) {
                    // appLaunch events go to the "launch" side output as EventLog
                    ctx.output(new OutputTag<EventLog>("launch", TypeInformation.of(EventLog.class)), eventLog);
                } else if ("putBack".equals(eventId)) {
                    // putBack events go to the "back" side output, serialized to JSON
                    ctx.output(new OutputTag<String>("back", TypeInformation.of(String.class)), JSON.toJSONString(eventLog));
                }
                // every event also goes to the main stream
                out.collect(eventLog);
            }
        });

        // retrieve the appLaunch side output stream
        DataStream<EventLog> launch = process.getSideOutput(new OutputTag<EventLog>("launch", TypeInformation.of(EventLog.class)));
        launch.print("launch");

        // retrieve the putBack side output stream
        DataStream<String> back = process.getSideOutput(new OutputTag<String>("back", TypeInformation.of(String.class)));
        back.print("back");

        env.execute();

    }
}
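Note that getSideOutput is matched to the tag used inside processElement by the OutputTag's id; requesting the same id with a conflicting type is an error. In practice the two OutputTag objects above would usually be defined once as static final fields and reused in both places.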

3. connect (connecting two streams)

  • connect joins two DataStreams, whose element types may be the same or different, into a new ConnectedStreams. Note that connect differs from union: although the call produces a single ConnectedStreams, the two streams inside it remain independent of each other. The biggest benefit of this method is that it lets the two streams share state (see the sketch after the demo below).
  • Full example:
package com.yang.flink.sideStream;

import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.ConnectedStreams;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoMapFunction;

public class StreamConnectDemo {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        configuration.setInteger("rest.port", 8822);
        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(configuration);
        env.setParallelism(1);


        // enable checkpointing
        env.enableCheckpointing(5000, CheckpointingMode.EXACTLY_ONCE);
        env.getCheckpointConfig().setCheckpointStorage("file:///e:/ckpt");

        // build two socket source streams
        DataStreamSource<String> streamSource1 = env.socketTextStream("hadoop102", 9999);
        DataStreamSource<String> streamSource2 = env.socketTextStream("hadoop102", 9998);

        ConnectedStreams<String, String> connect = streamSource1.connect(streamSource2);
        SingleOutputStreamOperator<String> result = connect.map(new CoMapFunction<String, String, String>() {
            String prefix = "yang";
            @Override
            public String map1(String value) throws Exception {
                // left stream: multiply the value by 10 and return it as a string
                return prefix + (Integer.parseInt(value) * 10);
            }
            @Override
            public String map2(String value) throws Exception {
                // right stream: convert the letters to upper case
                return prefix + value.toUpperCase();
            }
        });
        result.print();
        env.execute();
    }
}
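The state-sharing point above can be made concrete by keying the connected streams and using keyed state inside a RichCoFlatMapFunction. The following is a minimal hypothetical sketch (class, stream and field names are illustrative, not part of the demo above): one stream writes a per-key rule into state, the other reads it.

import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.RichCoFlatMapFunction;
import org.apache.flink.util.Collector;

public class ConnectSharedStateDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // two streams of (key, value) pairs; with bounded demo inputs the
        // arrival order across the two streams is not guaranteed
        DataStream<Tuple2<String, String>> control = env
                .fromElements(Tuple2.of("a", "BLOCK"), Tuple2.of("b", "PASS"));
        DataStream<Tuple2<String, String>> data = env
                .fromElements(Tuple2.of("a", "event1"), Tuple2.of("b", "event2"));

        control.connect(data)
                .keyBy(t -> t.f0, t -> t.f0)  // same key extractor on both sides
                .flatMap(new RichCoFlatMapFunction<Tuple2<String, String>, Tuple2<String, String>, String>() {
                    // keyed state written by flatMap1 and read by flatMap2: the shared state
                    private transient ValueState<String> rule;

                    @Override
                    public void open(Configuration parameters) {
                        rule = getRuntimeContext().getState(
                                new ValueStateDescriptor<>("rule", String.class));
                    }

                    @Override
                    public void flatMap1(Tuple2<String, String> ctl, Collector<String> out) throws Exception {
                        rule.update(ctl.f1); // control stream updates the per-key rule
                    }

                    @Override
                    public void flatMap2(Tuple2<String, String> evt, Collector<String> out) throws Exception {
                        // data stream sees the rule written by the other stream for the same key
                        if (!"BLOCK".equals(rule.value())) {
                            out.collect(evt.f0 + " -> " + evt.f1);
                        }
                    }
                })
                .print();

        env.execute();
    }
}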

4. union (merging streams)

union merges two or more DataStreams with the same element type into a single DataStream. Its signature, DataStream union(DataStream... streams), takes varargs, so any number of streams can be merged in one call; the only constraint is that every stream participating in the union must have the same element type. A minimal sketch follows.

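A minimal sketch of union (stream names are illustrative):

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class UnionDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // three bounded streams of the same element type (Integer)
        DataStreamSource<Integer> s1 = env.fromElements(1, 2, 3);
        DataStreamSource<Integer> s2 = env.fromElements(10, 20, 30);
        DataStreamSource<Integer> s3 = env.fromElements(100, 200, 300);

        // union is variadic: merge any number of same-type streams in one call
        DataStream<Integer> merged = s1.union(s2, s3);
        merged.print();

        env.execute();
    }
}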

5. coGroup (co-grouping)

[Figure: coGroup pairs, per window, a key group from the left stream with the key group from the right stream]

Co-grouping, as the figure above shows:
the left iterable holds, for the current window, one key group from the left stream, e.g. the records with id=1;
the right iterable holds, for the current window, one key group from the right stream, e.g. the records with id=1;
within the same window, the two iterables always hold records that share the same key.

  • Example code:
import org.apache.flink.api.common.functions.CoGroupFunction;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;

public class CoGroupDemo {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        configuration.setInteger("rest.port", 8822);
        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(configuration);
        env.setParallelism(1);


        // enable checkpointing
        env.enableCheckpointing(5000, CheckpointingMode.EXACTLY_ONCE);
        env.getCheckpointConfig().setCheckpointStorage("file:///e:/ckpt");

        // build stream 1; the socket data format is: id,name
        DataStreamSource<String> streamSource1 = env.socketTextStream("hadoop102", 9999);
        SingleOutputStreamOperator<Tuple2<String, String>> stream1 = streamSource1.map(s -> {
            String[] split = s.split(",");
            return Tuple2.of(split[0], split[1]);
        }).returns(new TypeHint<Tuple2<String, String>>() {});

        // build stream 2; the socket data format is: id,age,city
        DataStreamSource<String> streamSource2 = env.socketTextStream("hadoop102", 9998);
        SingleOutputStreamOperator<Tuple3<String, String, String>> stream2 = streamSource2.map(s -> {
            String[] split = s.split(",");
            return Tuple3.of(split[0], split[1], split[2]);
        }).returns(new TypeHint<Tuple3<String, String, String>>() {});
        DataStream<String> coStream = stream1.coGroup(stream2)
                .where(s -> s.f0)
                .equalTo(s -> s.f0)
                .window(TumblingProcessingTimeWindows.of(Time.seconds(20)))
                .apply(new CoGroupFunction<Tuple2<String, String>, Tuple3<String, String, String>, String>() {

                    /**
                     * @param first  the first (left) stream's records in this co-group
                     * @param second the second (right) stream's records in this co-group
                     * @param out    the collector for the results
                     * @throws Exception
                     */
                    @Override
                    public void coGroup(Iterable<Tuple2<String, String>> first, Iterable<Tuple3<String, String, String>> second, Collector<String> out) throws Exception {
                        // left-outer-join logic
                        for (Tuple2<String, String> tuple2 : first) {
                            // the flag must be reset for every left-side record,
                            // otherwise only the first unmatched record would be emitted
                            boolean matched = false;
                            for (Tuple3<String, String, String> tuple3 : second) {
                                out.collect(tuple2.f0 + "," + tuple2.f1 + "," + tuple3.f0 + "," + tuple3.f1 + "," + tuple3.f2);
                                matched = true;
                            }
                            if (!matched) {
                                out.collect(tuple2.f0 + "," + tuple2.f1 + ",null,null,null");
                            }
                        }
                    }
                });
        coStream.print();

        env.execute();
    }
}
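For example, assuming 1,zhangsan is typed on port 9999 and 1,18,beijing on port 9998 inside the same 20-second window, the job prints 1,zhangsan,1,18,beijing; a left-side record that finds no match in its window prints 1,zhangsan,null,null,null.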

6. join

Like coGroup, the windowed join pairs records from two streams by key within a window, but it emits only the matched pairs (an inner join).

package com.yang.flink.sideStream;

import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;

public class StreamJoinDemo {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        configuration.setInteger("rest.port", 8822);
        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(configuration);
        env.setParallelism(1);


        // enable checkpointing
        env.enableCheckpointing(5000, CheckpointingMode.EXACTLY_ONCE);
        env.getCheckpointConfig().setCheckpointStorage("file:///e:/ckpt");

        // build stream 1; the socket data format is: id,name
        DataStreamSource<String> streamSource1 = env.socketTextStream("hadoop102", 9999);
        SingleOutputStreamOperator<Tuple2<String, String>> stream1 = streamSource1.map(s -> {
            String[] split = s.split(",");
            return Tuple2.of(split[0], split[1]);
        }).returns(new TypeHint<Tuple2<String, String>>() {});

        // build stream 2; the socket data format is: id,age,city
        DataStreamSource<String> streamSource2 = env.socketTextStream("hadoop102", 9998);
        SingleOutputStreamOperator<Tuple3<String, String, String>> stream2 = streamSource2.map(s -> {
            String[] split = s.split(",");
            return Tuple3.of(split[0], split[1], split[2]);
        }).returns(new TypeHint<Tuple3<String, String, String>>() {});
        

        /**
         * The stream join operator
         * Scenario:
         *    stream 1 data:  id,name
         *    stream 2 data:  id,age,city
         *    use join to relate the two streams' records by id
         */
        DataStream<String> joinedStream = stream1.join(stream2)
                .where(tp2 -> tp2.f0)
                .equalTo(tp3 -> tp3.f0)
                .window(TumblingProcessingTimeWindows.of(Time.seconds(20)))
                .apply(new JoinFunction<Tuple2<String, String>, Tuple3<String, String, String>, String>() {
                    @Override
                    public String join(Tuple2<String, String> t1, Tuple3<String, String, String> t2) throws Exception {
                        return t1.f0 + "," + t1.f1 + "," + t2.f0 + "," + t2.f1 + "," + t2.f2;
                    }
                });

        joinedStream.print();

        env.execute();
    }
}
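Fed the same inputs as the coGroup example, this job prints only the matched pairs: the windowed join has inner-join semantics, so unmatched records on either side are silently dropped. Use coGroup, as in section 5, when outer-join behavior is needed.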