Flink process function使用详解

最新推荐文章于 2024-11-28 20:48:51 发布

iFence

最新推荐文章于 2024-11-28 20:48:51 发布

阅读量2.8k

点赞数 1

CC 4.0 BY-SA版权

分类专栏： Flink 文章标签： flink

本文链接：https://blog.youkuaiyun.com/Vector97/article/details/111404397

Flink 专栏收录该内容

26 篇文章

订阅专栏

本文介绍了Flink中底层的Process Function，它能操作event、state和timers，可看作能访问state和timer的flatmap function。还给出一个需求示例，要实现session window功能，每个间隔6s的“window”输出对应<key,count>元素计数。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

前言

process function是flink中比较底层的函数。能够实现一些高层函数无法实现的功能。它可以操作三个非常重要的对象：

event：数据流中的单个元素
state：状态
timers：（事件时间或处理时间）定时器，仅在keyedStream中可以访问。

Process Function 可以看作是一个能够访问 state 和 timer 的 FlatMapFunction。

代码

假设有这样一个需求：用 Process Function 实现类似 session window 的功能——当某个 key 连续 6s 没有新元素到达时，输出该 key 对应的 <key, count> 元素计数。

package it.kenn.source;

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import scala.Tuple2;
import scala.Tuple3;

import java.time.Duration;

/**
 * Demo job for learning the process function (2020-12-19).
 *
 * <p>Reads tuples from {@code ForJoinSource1}, assigns event-time timestamps and watermarks,
 * keys the stream by the first tuple field, runs {@code CountWithTimeoutFunction} on each key
 * and prints whatever that function emits.
 */
public class ProcessDemo2 {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();
        // Run the job on event time.
        environment.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        // Watermark strategy: tolerate 100 ms of out-of-orderness and read the event
        // timestamp from the second tuple field.
        WatermarkStrategy<Tuple3<String, Long, Double>> watermarks = WatermarkStrategy
                .<Tuple3<String, Long, Double>>forBoundedOutOfOrderness(Duration.ofMillis(100))
                .withTimestampAssigner((event, previousTimestamp) -> event._2());

        SingleOutputStreamOperator<Tuple3<String, Long, Double>> events =
                environment.addSource(new ForJoinSource1()).assignTimestampsAndWatermarks(watermarks);

        // Key by the first tuple field and count per key.
        events.keyBy(event -> event._1())
                .process(new CountWithTimeoutFunction())
                .print();

        environment.execute();
    }
}

// Per-key value that CountWithTimeoutFunction keeps in keyed state.
class CountWithTimestamp {
    // Key this entry belongs to; set when the entry is first created.
    public String key;
    // Number of elements processed so far for the key.
    public long count;
    // Timestamp (second tuple field) of the most recently processed element for the key.
    public long lastModified;
}

/**
 * Emits the element count of a key once that key has received no new element for
 * {@link #TIMEOUT_MS} milliseconds of event time. Similar behaviour could be obtained
 * with a session window.
 *
 * <p>The per-key state is never cleared by this function, so the emitted count is the running
 * total of all elements seen for the key, not only those since the previous emission.
 */
class CountWithTimeoutFunction extends KeyedProcessFunction<String, Tuple3<String, Long, Double>, Tuple2<String, Long>> {
    /** Inactivity gap, in milliseconds of event time, after which a key's count is emitted. */
    private static final long TIMEOUT_MS = 6000L;

    /** Keyed state holding the key, its running count and the timestamp of its latest element. */
    private transient ValueState<CountWithTimestamp> state;

    /**
     * Obtains the keyed state handle from the runtime context.
     *
     * @param parameters configuration passed in by the runtime (unused)
     * @throws Exception if the state handle cannot be obtained
     */
    @Override
    public void open(Configuration parameters) throws Exception {
        state = getRuntimeContext().getState(new ValueStateDescriptor<>("myState", CountWithTimestamp.class));
    }

    /**
     * Counts the element, records its timestamp and (re)arms the inactivity timer for its key.
     *
     * @param value     incoming element: (key, event timestamp, payload)
     * @param context   gives access to the timer service
     * @param collector unused here; output is produced in {@link #onTimer}
     * @throws Exception if state access fails
     */
    @Override
    public void processElement(Tuple3<String, Long, Double> value, Context context, Collector<Tuple2<String, Long>> collector) throws Exception {
        CountWithTimestamp current = state.value();
        if (current == null) {
            current = new CountWithTimestamp();
            current.key = value._1();
        } else {
            // The timer armed for the previous element is superseded by the one registered
            // below. Removing it keeps at most one pending timer per key; without this,
            // every element would leave behind a timer that fires only to be ignored.
            context.timerService().deleteEventTimeTimer(current.lastModified + TIMEOUT_MS);
        }
        current.count++;
        current.lastModified = value._2();
        state.update(current);

        // Only one timer exists per [key, timestamp] pair; duplicate registrations are
        // de-duplicated and fire once. To reduce timer churn further, the timestamp can be
        // coarsened before registering, e.g. to 1 s precision:
        //   registerEventTimeTimer(((current.lastModified / 1000) * 1000) + TIMEOUT_MS)
        context.timerService().registerEventTimeTimer(current.lastModified + TIMEOUT_MS);
    }

    /**
     * Timer callback: emits (key, count) if no element has arrived for the key since the
     * timer was armed.
     *
     * @param timestamp time the firing timer was registered for
     * @param ctx       timer context (unused)
     * @param out       collector receiving the (key, count) result
     * @throws Exception if state access fails
     */
    @Override
    public void onTimer(long timestamp, OnTimerContext ctx, Collector<Tuple2<String, Long>> out) throws Exception {
        CountWithTimestamp result = state.value();
        if (result == null) {
            // Defensive: nothing to report if the state is absent when the timer fires.
            return;
        }
        // A timer is current only if it matches the latest element's timestamp plus the gap;
        // any other timer belongs to an element that has since been superseded.
        if (timestamp == result.lastModified + TIMEOUT_MS) {
            out.collect(new Tuple2<>(result.key, result.count));
        }
    }
}