需求:日活明细分析,需要保存日活明细数据。通过对数据去重得到日活;由于数据乱序到达,客户端时间可能会发生变化。
1. Flink 窗口排序去重,写入HBase
import java.time.Duration
import com.sm.common.conf.PropManager
import com.sm.constants.Constants
import com.sm.utils.FlinkUtils
import org.apache.flink.api.common.restartstrategy.RestartStrategies
import org.apache.flink.api.common.time.Time
import org.apache.flink.streaming.api.environment.{CheckpointConfig, ExecutionCheckpointingOptions}
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic}
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment
import org.apache.log4j.Level
import org.slf4j.LoggerFactory
/**
* flink sql User Active Daily Analysis
*
* create by LiuJinHe 2020/10/12
*/
object UserActiveDailyAnalysis {
private var logger: org.slf4j.Logger = _
def main(args: Array[String]): Unit = {
logger = LoggerFactory.getLogger(this.getClass.getSimpleName)
org.apache.log4j.Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
org.apache.log4j.Logger.getLogger("org.apache").setLevel(Level.INFO)
// 初始化 stream 环境
// 本地测试,需要 flink-runtime-web 依赖
// val env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI()
val env = StreamExecutionEnvironment.getExecutionEnvironment
// 本地测试线程 1
// env.setParallelism(1)
// val params = ParameterTool.fromArgs(args)
// env.getConfig.setGlobalJobParameters(params)
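// Restart on failure with a fixed delay.
// NOTE(review): fixedDelayRestart(1, 3) means at most 1 restart attempt with a 3 ms delay,
// not "10 attempts, 3 seconds apart"; that intent would be fixedDelayRestart(10, Time.seconds(3)).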
env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 3))
// 事件处理的时间,由系统时间决定
env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime)
// 创建 streamTable 环境
val tableEnv: StreamTableEnvironment = StreamTableEnvironment.create(env, FlinkUtils.getSettings)
// checkpoint 设置
val tableConfig = tableEnv.getConfig.getConfiguration
tableConfig.set(ExecutionCheckpointingOptions.CHECKPOINTING_MODE, CheckpointingMode.EXACTLY_ONCE)
// Checkpoint interval (not a timeout): a checkpoint is triggered every 60 seconds;
// per the original note, the sink only runs after each checkpoint completes.
tableConfig.set(ExecutionCheckpointingOptions.CHECKPOINTING_INTERVAL, Duration.ofSeconds(60))
// checkpoint的超时时间, 检查点一分钟内没有完成将被丢弃
tableConfig.set(ExecutionCheckpointingOptions.CHECKPOINTING_TIMEOUT, Duration.ofSeconds(60))
// checkpoint 最小间隔,两个检查点之间至少间隔 30 秒
tableConfig.set(ExecutionCheckpointingOptions.MIN_PAUSE_BETWEEN_CHECKPOINTS, Duration.ofSeconds(30))
// 同一时间只允许进行一个检查点
tableConfig.set(ExecutionCheckpointingOptions.MAX_CONCURRENT_CHECKPOINTS, Integer.valueOf(1))
// 手动cancel时是否保留checkpoint
tableConfig.set(ExecutionCheckpointingOptions.EXTERNALIZED_CHECKPOINT,
CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
// 设置状态的最小空闲时间和最大的空闲时间, 也就是空闲数据的保留时间
tableEnv.getConfig.setIdleStateRetentionTime(Time.hours(12), Time.hours(24))
// tableConfig.setString("table.exec.mini-batch.enabled