There is relatively little material online about using Flink SQL 1.11 to consume Kafka and stream the data into Hive, and most of what exists creates a Kafka table directly in Flink SQL and writes it to Hive, so it is unclear how well that approach handles deeply nested JSON. My approach instead uses the streaming (DataStream) API to consume Kafka, first parses and flattens the JSON, then converts the stream into a temporary table, and finally streams it into Hive. The overall shape of the pipeline is sketched first, followed by the full code:
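To make the flow easier to follow, here is a minimal, simplified sketch of the same pipeline shape: consume JSON strings from Kafka, flatten the nested fields with Jackson, register the flattened stream as a temporary view, and stream-insert it into an existing Hive table. All names in it are illustrative assumptions (topic nginx_log, case class FlatLog with fields ts/clientIp/uri, Hive table ods.nginx_log, Hive conf dir /etc/hive/conf), and it assumes the Hive table and its partition-commit properties are already set up; it is not the full job below, which also configures the RocksDB state backend, restart strategy and event time.

// Minimal pipeline sketch (illustrative names only; not the full production job)
import java.util.Properties
import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.flink.table.api.EnvironmentSettings
import org.apache.flink.table.api.bridge.scala._
import org.apache.flink.table.catalog.hive.HiveCatalog

// Hypothetical flattened schema produced from the nested nginx JSON
case class FlatLog(ts: String, clientIp: String, uri: String)

object PipelineSketch {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Checkpointing is required so the Hive/filesystem sink can commit its files
    env.enableCheckpointing(60000)
    val tEnv = StreamTableEnvironment.create(
      env, EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build())

    val props = new Properties()
    props.setProperty("bootstrap.servers", "x.x.x.x:9092")
    props.setProperty("group.id", "sketch-group")

    // 1. Consume raw JSON from Kafka and flatten the nested structure with Jackson
    //    (the mapper is created per record here only for brevity)
    val flat: DataStream[FlatLog] = env
      .addSource(new FlinkKafkaConsumer[String]("nginx_log", new SimpleStringSchema(), props))
      .map { line =>
        val root = new ObjectMapper().readTree(line)
        FlatLog(
          root.path("ts").asText(),
          root.path("client").path("ip").asText(),
          root.path("request").path("uri").asText())
      }

    // 2. Register the flattened stream as a temporary view (fields taken from the case class)
    tEnv.createTemporaryView("tmp_nginx_log", flat)

    // 3. Register a Hive catalog and stream the view into a pre-created Hive table;
    //    executeSql submits the job, so no extra env.execute() is needed
    tEnv.registerCatalog("hive", new HiveCatalog("hive", "default", "/etc/hive/conf"))
    tEnv.executeSql(
      "INSERT INTO hive.ods.nginx_log SELECT ts, clientIp, uri FROM tmp_nginx_log")
  }
}

The full job follows the same three steps, but adds production concerns (state backend, checkpoint configuration, event-time watermarks, partition-time commits):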
package com.xxx.xxx
import java.sql.Timestamp
import java.time.Duration
import java.util
import java.util.{Date, Properties}
import com.fasterxml.jackson.databind.node.JsonNodeType
import com.fasterxml.jackson.databind.{JsonNode, ObjectMapper}
import org.apache.flink.configuration.RestOptions
import org.apache.flink.contrib.streaming.state.RocksDBStateBackend
import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.restartstrategy.RestartStrategies
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.java.tuple.Tuple
import org.apache.flink.api.scala._
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup
import org.apache.flink.streaming.api.environment.ExecutionCheckpointingOptions
import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks
import org.apache.flink.streaming.api.functions.timestamps.{AscendingTimestampExtractor, BoundedOutOfOrdernessTimestampExtractor}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.watermark.Watermark
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic}
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.flink.table.api.EnvironmentSettings
import scala.collection.JavaConversions._
import scala.util.{Failure, Success, Try}
import org.apache.flink.table.api._
import org.apache.flink.table.api.bridge.scala._
import org.apache.flink.table.catalog.hive.HiveCatalog
object NginxLog2HivePartitionTime {
  def main(args: Array[String]): Unit = {
    // First program argument: whether to create the Hive table
    val ifCreateHiveTable = args(0)
    val parallelism = 3
    val kafkaBrokers = "x.x.x.x:9092"
    val jobName = "xxx"
    val topicNames = List("xxx")
    val groupName = "xxx"
    // Kafka consumer properties
    val properties = new Properties()
    properties.setProperty("bootstrap.servers", kafkaBrokers)
    properties.setProperty("group.id", groupName)
    // Build the streaming execution environment
    val conf: Configuration = new Configuration()
    val streamEnv = StreamExecutionEnvironment.getExecutionEnvironment
    streamEnv.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    val rocksDBStateBackend = new RocksDBStateBackend("hdfs:///user/hdfs/flink1_11backend",