case class Canal(emptyCount: Long,        // operation count
                 logFileName: String,     // binlog file name
                 dbName: String,          // database name
                 logFileOffset: Long,     // binlog offset
                 eventType: String,       // operation type
                 columnValueList: String, // list of column values
                 tableName: String,       // table name
                 timestamp: Long)         // timestamp
import com.alibaba.fastjson.JSON

object Canal {
  // Parse a Canal JSON message into the case class using fastjson
  def apply(json: String): Canal = {
    val jsonObject = JSON.parseObject(json)
    Canal(
      jsonObject.getLong("emptyCount"),
      jsonObject.getString("logFileName"),
      jsonObject.getString("dbName"),
      jsonObject.getLong("logFileOffset"),
      jsonObject.getString("eventType"),
      jsonObject.getString("columnValueList"),
      jsonObject.getString("tableName"),
      jsonObject.getLong("timestamp")
    )
  }
}
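A quick usage sketch of the JSON-parsing apply above. The payload below is hypothetical sample data for illustration only; the shape of real Canal output should be verified against your setup.

// Hypothetical sample message; all field values are made up
val sample =
  """{"emptyCount":1,"logFileName":"mysql-bin.000001","dbName":"testdb",
    |"logFileOffset":250,"eventType":"INSERT",
    |"columnValueList":"[]","tableName":"user","timestamp":1553741346000}""".stripMargin

val canal = Canal(sample)
println(s"${canal.dbName}_${canal.tableName}") // prints: testdb_user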
/*
 * Preprocess the Canal case class into the HbaseOperation case class:
 * this packages up the parameters required by the HBaseUtil operations.
 */
case class HbaseOperation(
  opType: String,    // operation type: INSERT / DELETE / UPDATE
  tableName: String, // HBase table name: <binlog database name>_<binlog table name>
  cfName: String,    // column family name, fixed as "info"
  rowkey: String,    // unique primary key (the first column's value in the binlog column data)
  colName: String,   // column name from the binlog
  colValue: String   // column value from the binlog
)
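Based on the field mapping above, a minimal sketch of the preprocessing step. It assumes columnValueList is a JSON array of {"columnName": ..., "columnValue": ...} objects; the actual Canal payload format may differ, and the helper name toHBaseOperations is made up for illustration.

import com.alibaba.fastjson.JSON

// Hypothetical helper: expand one Canal message into one HbaseOperation per column
def toHBaseOperations(canal: Canal): Seq[HbaseOperation] = {
  val columns = JSON.parseArray(canal.columnValueList)
  // rowkey: take the first column's value from the binlog column data
  val rowkey = columns.getJSONObject(0).getString("columnValue")
  (0 until columns.size()).map { i =>
    val obj = columns.getJSONObject(i)
    HbaseOperation(
      opType    = canal.eventType,                       // INSERT / DELETE / UPDATE
      tableName = s"${canal.dbName}_${canal.tableName}", // dbName_tableName
      cfName    = "info",                                // fixed column family
      rowkey    = rowkey,
      colName   = obj.getString("columnName"),
      colValue  = obj.getString("columnValue")
    )
  }
}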
==========
import java.util.Properties

import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.runtime.state.filesystem.FsStateBackend
import org.apache.flink.streaming.api.environment.CheckpointConfig
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic}
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer09

object App {
  def main(args: Array[String]): Unit = {
    // 2. Create the main method and obtain the StreamExecutionEnvironment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // 3. Use EventTime for stream processing, i.e. process records by the time the data was generated
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    // 4. Set the parallelism of the development environment (here: 3)
    env.setParallelism(3)

    // Enable checkpointing so the program can safely run for a long time
    // Trigger a checkpoint every 5 seconds
    env.enableCheckpointing(5000)
    // Use exactly-once checkpointing semantics
    env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
    // Minimum pause between two checkpoints
    env.getCheckpointConfig.setMinPauseBetweenCheckpoints(1000)
    // Checkpoint timeout
    env.getCheckpointConfig.setCheckpointTimeout(60000)
    // Maximum number of concurrent checkpoints
    env.getCheckpointConfig.setMaxConcurrentCheckpoints(1)
    // Retain externalized checkpoints when the job is cancelled, so it can be restored from them
    env.getCheckpointConfig.enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
    // Where to store the checkpoint data
    env.setStateBackend(new FsStateBackend("hdfs://node01:8020/flink-checkpoint/"))

    // Integrate with Kafka
    val properties = new Properties()
    properties.setProperty("bootstrap.servers", GlobalConfigUtil.bootstrapServers)
    properties.setProperty("zookeeper.connect", GlobalConfigUtil.zookeeperConnect)
    properties.setProperty("group.id", GlobalConfigUtil.groupId)
    properties.setProperty("enable.auto.commit", GlobalConfigUtil.enableAutoCommit)
    properties.setProperty("auto.commit.interval.ms", GlobalConfigUtil.autoCommitIntervalMs)
    properties.setProperty("auto.offset.reset", GlobalConfigUtil.autoOffsetReset)
    // Configure key serialization/deserialization
    properties.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    properties.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")

    val consumer: FlinkKafkaConsumer09[String] = new FlinkKafkaConsumer09[String](
      GlobalConfigUtil.inputTopic,
      new SimpleStringSchema(),
      properties
    )
    // Get the data from Kafka
    val kafkaDataStream: DataStream[String] = env.addSource(consumer)
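From here, one plausible continuation (a sketch, not necessarily the original author's exact next step) is to map each JSON message from Kafka into the Canal case class defined earlier:

    // Sketch: parse each Kafka JSON message into a Canal case class
    val canalDataStream: DataStream[Canal] = kafkaDataStream.map { json =>
      Canal(json)
    }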