Flink广播流(BroadcastStream)实例 简单易懂


import java.text.SimpleDateFormat

import org.apache.flink.api.common.state.MapStateDescriptor
import org.apache.flink.api.common.typeinfo.BasicTypeInfo
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.{AssignerWithPeriodicWatermarks, AssignerWithPunctuatedWatermarks}
import org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.api.watermark.Watermark
import org.apache.flink.util.Collector

/**
 * Minimal demonstration of Flink's broadcast state pattern.
 *
 * Two in-memory collections of comma-separated records
 * (`key,label,count,timestamp`) are turned into streams. The second stream is
 * broadcast; every record it carries is stored in broadcast state. Each record
 * of the first stream is then printed together with all stored broadcast
 * records that share its first field.
 */
object BroadDateStreamTest {

  /** Descriptor of the broadcast state: String keys mapped to the raw broadcast record. */
  val mapStateDescriptor: MapStateDescriptor[String, String] =
    new MapStateDescriptor[String, String](
      "wordsConfig", // state name (arbitrary)
      BasicTypeInfo.STRING_TYPE_INFO, // type of the state keys
      BasicTypeInfo.STRING_TYPE_INFO) // type of the state values

  /**
   * Returns the text before the first comma of `record` (the whole string when
   * it contains no comma).
   *
   * Equivalent to `record.split(",")(0)` for every input on which that
   * expression does not throw, but total: `split` drops trailing empty fields,
   * so a record such as "," yields an empty array and indexing it fails.
   */
  private def firstField(record: String): String = record.takeWhile(_ != ',')

  /** Builds the two streams, connects them and prints the joined output. */
  def main(args: Array[String]): Unit = {

    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.enableCheckpointing(360000) // checkpoint (snapshot) interval
    // Use event time, i.e. the timestamp carried by each record.
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    // Interval at which periodic watermarks are generated (how often
    // getCurrentWatermark is called). The alternative, punctuated, strategy can
    // emit a watermark per qualifying event; since every watermark triggers
    // downstream work, emitting them too frequently hurts performance.
    env.getConfig.setAutoWatermarkInterval(1000)
    env.setParallelism(1) // parallelism

    val seq1 = Seq(
      "1,cv2019-01-31,2,2020-03-03 12:04:36.433",
      "2,cv2019-01-32,2,2020-03-03 12:04:37.433",
      "2,cv2019-01-31,3,2020-03-03 12:04:38.433",
      "1,cv2019-01-31,2,2020-03-03 12:04:39.433",
      "1,cv2019-01-32,2,2020-03-03 12:04:40.433",
      "2,cv2019-01-31,2,2020-03-03 12:04:41.433",
      "2,cv2019-01-31,2,2020-03-03 12:04:42.433",
      "1,cv2019-01-32,2,2020-03-03 12:04:43.433",
      "1,cv2019-01-31,2,2020-03-03 12:04:44.433",
      "2,cv2019-01-32,2,2020-03-03 12:04:45.433",
      "1,cv2019-01-31,2,2020-03-03 12:04:46.433"
    )
    val seq2 = Seq(
      "3,pv2019-01-31,2,2020-03-03 12:04:36.433",
      "2,pv2019-01-32,2,2020-03-03 12:04:40.433",
      "4,pv2019-01-31,3,2020-03-03 12:04:45.433"
    )

    // Regular (non-broadcast) side.
    val dss1: DataStream[String] = env.fromCollection(seq1)
      .assignTimestampsAndWatermarks(
        new MyAssignerWithPeriodicWatermarks(3500)
      )

    // Broadcast side.
    val broadStream = env.fromCollection(seq2)
      .assignTimestampsAndWatermarks(
        new MyAssignerWithPeriodicWatermarks(3500)
      )
      .broadcast(mapStateDescriptor)

    dss1.connect(broadStream).process(new MyBroadcastProcessFunction).print()

    env.execute("test")
  }

  /**
   * Joins each regular record with the broadcast records that have the same
   * first field.
   */
  class MyBroadcastProcessFunction extends BroadcastProcessFunction[String, String, String] {

    /**
     * Emits `"value   " + value`, followed by `" <-> " + record` for every
     * broadcast record currently in state whose first field equals the first
     * field of `value`.
     *
     * @param value record from the non-broadcast stream
     * @param ctx   read-only access to the broadcast state
     * @param out   collector receiving the composed line
     */
    override def processElement(value: String,
                                ctx: BroadcastProcessFunction[String, String, String]#ReadOnlyContext,
                                out: Collector[String]): Unit = {
      // The element's key does not depend on the broadcast entry: compute it once.
      val elementKey = firstField(value)
      val line = new java.lang.StringBuilder("value   ").append(value)
      ctx.getBroadcastState(mapStateDescriptor).immutableEntries()
        .forEach(entry => {
          val broadcastRecord = entry.getValue()
          if (firstField(broadcastRecord) == elementKey) {
            line.append(" <-> ").append(broadcastRecord)
          }
        })
      out.collect(line.toString)
    }

    /**
     * Stores a broadcast record in the broadcast state under the key
     * `<field 0>_<field 3>`; nothing is emitted.
     *
     * @param value record from the broadcast stream; must have at least four
     *              comma-separated fields
     * @param ctx   read-write access to the broadcast state
     * @param out   unused
     */
    override def processBroadcastElement(value: String,
                                         ctx: BroadcastProcessFunction[String, String, String]#Context,
                                         out: Collector[String]): Unit = {
      val fields = value.split(",")
      ctx.getBroadcastState(mapStateDescriptor).put(fields(0) + "_" + fields(3), value)
    }

  }

  /**
   * Periodic watermark assigner that reads the event time from the fourth
   * comma-separated field of each record (pattern `yyyy-MM-dd HH:mm:ss.SSS`).
   *
   * @param delay allowed out-of-orderness in milliseconds; the watermark lags
   *              the highest timestamp seen so far by this amount
   */
  class MyAssignerWithPeriodicWatermarks(delay: Long) extends AssignerWithPeriodicWatermarks[String] {
    val fm: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS")
    private val maxOutOfOrderness: Long = delay // maximum lateness tolerated per record
    private var currentMaxTimestamp: Long = 0L // highest event timestamp extracted so far

    /** Called periodically (see setAutoWatermarkInterval in `main`). */
    override def getCurrentWatermark: Watermark =
      // Watermark = highest timestamp seen so far minus the allowed delay.
      new Watermark(currentMaxTimestamp - maxOutOfOrderness)

    /**
     * Called for every record: parses the fourth field as the event timestamp
     * and updates the running maximum.
     *
     * @return the record's event time in epoch milliseconds
     */
    override def extractTimestamp(element: String, previousElementTimestamp: Long): Long = {
      val timestamp = fm.parse(element.split(",")(3)).getTime()
      currentMaxTimestamp = Math.max(timestamp, currentMaxTimestamp)
      timestamp
    }
  }

}

运行结果

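  • 注意:Flink 不保证普通流(DataStream)与广播流(BroadcastStream)之间的处理顺序(本例并行度为1)。如果DataStream中的某条数据在对应的BroadcastStream数据之前被处理,那么处理该条数据时广播状态里还没有这条广播记录,也就连接不到BroadcastStream。另外,下面结果中key为1的数据没有匹配项,是因为广播流中本来就没有key为1的记录。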
value   1,cv2019-01-31,2,2020-03-03 12:04:36.433
value   2,cv2019-01-32,2,2020-03-03 12:04:37.433 <-> 2,pv2019-01-32,2,2020-03-03 12:04:40.433
value   2,cv2019-01-31,3,2020-03-03 12:04:38.433 <-> 2,pv2019-01-32,2,2020-03-03 12:04:40.433
value   1,cv2019-01-31,2,2020-03-03 12:04:39.433
value   1,cv2019-01-32,2,2020-03-03 12:04:40.433
value   2,cv2019-01-31,2,2020-03-03 12:04:41.433 <-> 2,pv2019-01-32,2,2020-03-03 12:04:40.433
value   2,cv2019-01-31,2,2020-03-03 12:04:42.433 <-> 2,pv2019-01-32,2,2020-03-03 12:04:40.433
value   1,cv2019-01-32,2,2020-03-03 12:04:43.433
value   1,cv2019-01-31,2,2020-03-03 12:04:44.433
value   2,cv2019-01-32,2,2020-03-03 12:04:45.433 <-> 2,pv2019-01-32,2,2020-03-03 12:04:40.433
value   1,cv2019-01-31,2,2020-03-03 12:04:46.433
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值