import java.text.SimpleDateFormat
import org.apache.flink.api.common.state.MapStateDescriptor
import org.apache.flink.api.common.typeinfo.BasicTypeInfo
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.{AssignerWithPeriodicWatermarks, AssignerWithPunctuatedWatermarks}
import org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.api.watermark.Watermark
import org.apache.flink.util.Collector
object BroadDateStreamTest {
// Descriptor of the broadcast state shared by the broadcast side and the processing
// function: a String -> String map. The state name "wordsConfig" is an arbitrary label.
val mapStateDescriptor: MapStateDescriptor[String, String] =
  new MapStateDescriptor[String, String](
    "wordsConfig",
    BasicTypeInfo.STRING_TYPE_INFO, // type of the state's keys
    BasicTypeInfo.STRING_TYPE_INFO  // type of the state's values
  )
/**
  * Builds and runs the demo job: a main stream of comma-separated records is connected to a
  * broadcast stream of comma-separated records, processed by `MyBroadcastProcessFunction`,
  * and the results are printed.
  *
  * @param args command-line arguments (not used)
  */
def main(args: Array[String]): Unit = {
  val checkpointIntervalMs = 360000L
  val watermarkIntervalMs = 1000L
  val allowedDelayMs = 3500L

  val env = StreamExecutionEnvironment.getExecutionEnvironment
  // Interval between two snapshots.
  env.enableCheckpointing(checkpointIntervalMs)
  // Event time: use the timestamp carried by the records themselves.
  env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
  // Interval at which watermarks are generated (i.e. how often getCurrentWatermark() is
  // called). The alternative strategy emits a watermark for every qualifying event; since
  // each watermark triggers some downstream computation, emitting them too often hurts
  // performance.
  env.getConfig.setAutoWatermarkInterval(watermarkIntervalMs)
  env.setParallelism(1)

  // Records of the main stream.
  val mainRecords = Seq(
    "1,cv2019-01-31,2,2020-03-03 12:04:36.433",
    "2,cv2019-01-32,2,2020-03-03 12:04:37.433",
    "2,cv2019-01-31,3,2020-03-03 12:04:38.433",
    "1,cv2019-01-31,2,2020-03-03 12:04:39.433",
    "1,cv2019-01-32,2,2020-03-03 12:04:40.433",
    "2,cv2019-01-31,2,2020-03-03 12:04:41.433",
    "2,cv2019-01-31,2,2020-03-03 12:04:42.433",
    "1,cv2019-01-32,2,2020-03-03 12:04:43.433",
    "1,cv2019-01-31,2,2020-03-03 12:04:44.433",
    "2,cv2019-01-32,2,2020-03-03 12:04:45.433",
    "1,cv2019-01-31,2,2020-03-03 12:04:46.433"
  )
  // Records that are broadcast.
  val broadcastRecords = Seq(
    "3,pv2019-01-31,2,2020-03-03 12:04:36.433",
    "2,pv2019-01-32,2,2020-03-03 12:04:40.433",
    "4,pv2019-01-31,3,2020-03-03 12:04:45.433"
  )

  // Main stream with timestamps and watermarks assigned.
  val mainSource = env.fromCollection(mainRecords)
  val mainStream: DataStream[String] =
    mainSource.assignTimestampsAndWatermarks(new MyAssignerWithPeriodicWatermarks(allowedDelayMs))

  // Broadcast stream, backed by the state described by mapStateDescriptor.
  val broadcastSource = env.fromCollection(broadcastRecords)
  val broadcastStream = broadcastSource
    .assignTimestampsAndWatermarks(new MyAssignerWithPeriodicWatermarks(allowedDelayMs))
    .broadcast(mapStateDescriptor)

  val joined = mainStream.connect(broadcastStream).process(new MyBroadcastProcessFunction)
  joined.print()
  env.execute("test")
}
/**
  * Joins every record of the main stream with the broadcast records that share its id.
  *
  * Records on both inputs are comma-separated strings; the text before the first comma is
  * used as the id. Broadcast records are kept in the broadcast state described by
  * `mapStateDescriptor`.
  */
class MyBroadcastProcessFunction extends BroadcastProcessFunction[String, String, String] {

  /**
    * Returns the text before the first comma, or the whole string when it has no comma.
    *
    * Unlike `record.split(",")(0)` this never throws for a record made up only of commas
    * (Java's `split` drops trailing empty strings, leaving an empty array) and it does not
    * allocate an array for the remaining fields.
    */
  private def firstField(record: String): String = record.takeWhile(_ != ',')

  /**
    * Emits `"value " + value`, followed by `" <-> " + record` for every broadcast record
    * currently in the state whose id equals the id of `value`.
    *
    * @param value record from the main stream
    * @param ctx   read-only context giving access to the broadcast state
    * @param out   collector receiving exactly one output string per call
    */
  override def processElement(value: String,
                              ctx: BroadcastProcessFunction[String, String, String]#ReadOnlyContext,
                              out: Collector[String]): Unit = {
    // The id of the incoming record is the same for every state entry: compute it once.
    val id = firstField(value)
    val joined = new StringBuilder("value ").append(value)
    ctx.getBroadcastState(mapStateDescriptor).immutableEntries()
      .forEach(entry => {
        if (firstField(entry.getValue) == id) {
          joined.append(" <-> ").append(entry.getValue)
        }
      })
    out.collect(joined.toString)
  }

  /**
    * Stores a broadcast record in the broadcast state.
    *
    * The key is the first field, an underscore, and the fourth field of the record; the
    * stored value is the whole record. Throws `ArrayIndexOutOfBoundsException` when the
    * record has fewer than four comma-separated fields.
    *
    * @param value record from the broadcast stream
    * @param ctx   context giving write access to the broadcast state
    * @param out   collector (nothing is emitted here)
    */
  override def processBroadcastElement(value: String,
                                       ctx: BroadcastProcessFunction[String, String, String]#Context,
                                       out: Collector[String]): Unit = {
    val fields = value.split(",")
    ctx.getBroadcastState(mapStateDescriptor).put(fields(0) + "_" + fields(3), value)
  }
}
/**
  * Periodic watermark assigner for comma-separated records whose fourth field is a
  * timestamp formatted as `yyyy-MM-dd HH:mm:ss.SSS`.
  *
  * @param delay allowed lateness in milliseconds (the caller in this file passes 3500);
  *              it is subtracted from the largest timestamp seen so far to form the watermark
  */
class MyAssignerWithPeriodicWatermarks(delay: Long) extends AssignerWithPeriodicWatermarks[String] {
  val fm = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS")

  // Largest event timestamp extracted so far; remains 0 until a record has been seen.
  private var maxSeenTimestamp = 0L

  /**
    * Current watermark: the largest timestamp seen so far minus the allowed delay.
    * Called at a fixed interval (see setAutoWatermarkInterval).
    */
  override def getCurrentWatermark: Watermark =
    new Watermark(maxSeenTimestamp - delay)

  /**
    * Parses the fourth comma-separated field of `element` as the event timestamp and
    * remembers the largest one seen. Called for every record.
    *
    * @param element                  the record to extract the timestamp from
    * @param previousElementTimestamp timestamp previously attached to the record (ignored)
    * @return the parsed timestamp in epoch milliseconds
    */
  override def extractTimestamp(element: String, previousElementTimestamp: Long): Long = {
    val eventTime = fm.parse(element.split(",")(3)).getTime
    if (eventTime > maxSeenTimestamp) {
      maxSeenTimestamp = eventTime
    }
    eventTime
  }
}
}
运行结果
- 广播状态只有在广播流的元素被处理之后才会有内容,而两条输入流的处理先后顺序没有保证;因此(本例并行度为1)如果DataStream的元素先于BroadcastStream的元素被处理,那么处理该元素时广播状态还是空的,就关联不到BroadcastStream中的数据。
value 1,cv2019-01-31,2,2020-03-03 12:04:36.433
value 2,cv2019-01-32,2,2020-03-03 12:04:37.433 <-> 2,pv2019-01-32,2,2020-03-03 12:04:40.433
value 2,cv2019-01-31,3,2020-03-03 12:04:38.433 <-> 2,pv2019-01-32,2,2020-03-03 12:04:40.433
value 1,cv2019-01-31,2,2020-03-03 12:04:39.433
value 1,cv2019-01-32,2,2020-03-03 12:04:40.433
value 2,cv2019-01-31,2,2020-03-03 12:04:41.433 <-> 2,pv2019-01-32,2,2020-03-03 12:04:40.433
value 2,cv2019-01-31,2,2020-03-03 12:04:42.433 <-> 2,pv2019-01-32,2,2020-03-03 12:04:40.433
value 1,cv2019-01-32,2,2020-03-03 12:04:43.433
value 1,cv2019-01-31,2,2020-03-03 12:04:44.433
value 2,cv2019-01-32,2,2020-03-03 12:04:45.433 <-> 2,pv2019-01-32,2,2020-03-03 12:04:40.433
value 1,cv2019-01-31,2,2020-03-03 12:04:46.433