FlinkSQL Kafka to Hive

A simple walkthrough of writing to Hive from Flink on a CDH platform (Hive 2.1.1-CDH6.2.0).
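The Scala job below targets roughly the Flink 1.11 API line (blink planner, ExecutionCheckpointingOptions). A build definition along the following lines should pull in what it needs; the versions and artifact list are assumptions for illustration, not taken from the original post:

```scala
// build.sbt -- hedged sketch; versions are assumptions (Flink 1.11.x era, Scala 2.11)
val flinkVersion = "1.11.2"

libraryDependencies ++= Seq(
  "org.apache.flink" %% "flink-streaming-scala"        % flinkVersion,
  "org.apache.flink" %% "flink-table-api-scala-bridge" % flinkVersion,
  "org.apache.flink" %% "flink-table-planner-blink"    % flinkVersion,
  "org.apache.flink" %% "flink-connector-hive"         % flinkVersion,
  "org.apache.flink" %% "flink-sql-connector-kafka"    % flinkVersion,
  // needed for createLocalEnvironmentWithWebUI in local tests
  "org.apache.flink" %% "flink-runtime-web"            % flinkVersion,
  // Hive client matching the CDH cluster
  "org.apache.hive"  %  "hive-exec"                    % "2.1.1"
)
```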

Hive Streaming

```scala

import java.time.Duration

import com.sm.common.conf.PropManager
import com.sm.constants.Constants
import com.sm.utils.FlinkUtils
import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic}
import org.apache.flink.streaming.api.environment.{CheckpointConfig, ExecutionCheckpointingOptions}
import org.apache.flink.streaming.api.scala._
import org.apache.flink.table.api.SqlDialect
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment
import org.apache.flink.table.catalog.hive.HiveCatalog
import org.apache.log4j.Level
import org.slf4j.LoggerFactory

/**
  * flink sql kafka to hive
  *
  * created by LiuJinHe 2020/10/22
  */
object FlinkSqlKafka2Hive {
  private var logger: org.slf4j.Logger = _

  def main(args: Array[String]): Unit = {
    logger = LoggerFactory.getLogger(this.getClass.getSimpleName)
    org.apache.log4j.Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
    org.apache.log4j.Logger.getLogger("org.apache").setLevel(Level.INFO)

    // Initialize the stream environment.
    // Local testing requires the flink-runtime-web dependency (for the Web UI).
    val env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI()
    //    val env = StreamExecutionEnvironment.getExecutionEnvironment

    //    Read job parameters
    //    val params = ParameterTool.fromArgs(args)
    //    env.getConfig.setGlobalJobParameters(params)

    // Restart strategy: fixed delay, up to 10 attempts, 3 seconds (3000 ms) apart
    //    env.setRestartStrategy(RestartStrategies.fixedDelayRestart(10, 3000))
    // Parallelism 1 for local testing
    env.setParallelism(1)

    // Processing time: event handling is driven by the system clock
    env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime)

    // Create the StreamTable environment
    val tableEnv: StreamTableEnvironment = StreamTableEnvironment.create(env, FlinkUtils.getSettings)

    // Checkpoint settings
    val tableConfig = tableEnv.getConfig.getConfiguration
    tableConfig.set(ExecutionCheckpointingOptions.CHECKPOINTING_MODE, CheckpointingMode.EXACTLY_ONCE)
    // Checkpoint interval: take a checkpoint every 30 seconds
    tableConfig.set(ExecutionCheckpointingOptions.CHECKPOINTING_INTERVAL, Duration.ofSeconds(30))
    // Checkpoint timeout: a checkpoint is discarded if it does not complete within one minute
    //    tableConfig.set(ExecutionCheckpointingOptions.CHECKPOINTING_TIMEOUT, Duration.ofSeconds(60))
    // Minimum pause between checkpoints: at least 30 seconds between two checkpoints
    //    tableConfig.set(ExecutionCheckpointingOptions.MIN_PAUSE_BETWEEN_CHECKPOINTS, Duration.ofSeconds(30))
    // Allow only one checkpoint at a time
    //    tableConfig.set(ExecutionCheckpointingOptions.MAX_CONCURRENT_CHECKPOINTS, Integer.valueOf(1))
    // Whether to retain checkpoints when the job is cancelled manually
    //    tableConfig.set(ExecutionCheckpointingOptions.EXTERNALIZED_CHECKPOINT,
    //      CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)

    // Minimum and maximum idle state retention time
    //    tableEnv.getConfig.setIdleStateRetentionTime(Time.hours(12), Time.hours(24))

    // Load configuration
    val catalog_name = PropManager.getProp(Constants.CATALOG_NAME)
    val database = PropManager.getProp(Constants.DEFAULT_DATABASE)
    val schemaDataBase = PropManager.getProp(Constants.SCHEMA_DATABASE)

    // Construct the Hive catalog
    val hiveCatalog = new HiveCatalog(
      catalog_name,
      database,
      PropManager.getProp(Constants.HIVE_CONF_DIR)
    )

    tableEnv.registerCatalog(catalog_name, hiveCatalog)
    tableEnv.useCatalog(catalog_name)
```
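The original listing breaks off right after the catalog is activated. Judging from the imports (notably SqlDialect), the remaining steps would switch SQL dialects, declare the Kafka source and the partitioned Hive sink, and submit the INSERT. The sketch below is a hedged reconstruction under those assumptions; the table names, fields, topic, and commit properties are illustrative, not the author's original code (FlinkUtils.getSettings is likewise the project's own helper, presumably returning blink-planner streaming settings):

```scala
    // Hedged sketch of the remainder of main(); names and properties are assumptions.
    // FlinkUtils.getSettings presumably returns something like:
    //   EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build()

    // Kafka source table, declared in Flink's default dialect
    tableEnv.getConfig.setSqlDialect(SqlDialect.DEFAULT)
    tableEnv.executeSql(
      """
        |CREATE TABLE IF NOT EXISTS kafka_source (   -- hypothetical name
        |  user_id BIGINT,
        |  event   STRING,
        |  ts      TIMESTAMP(3)
        |) WITH (
        |  'connector' = 'kafka',
        |  'topic' = 'my-topic',                     -- assumption
        |  'properties.bootstrap.servers' = 'localhost:9092',
        |  'properties.group.id' = 'flink-sql-to-hive',
        |  'scan.startup.mode' = 'latest-offset',
        |  'format' = 'json'
        |)
      """.stripMargin)

    // Partitioned Hive table, declared in the Hive dialect
    tableEnv.getConfig.setSqlDialect(SqlDialect.HIVE)
    tableEnv.executeSql(
      """
        |CREATE TABLE IF NOT EXISTS ods_user_event ( -- hypothetical name
        |  user_id BIGINT,
        |  event   STRING
        |) PARTITIONED BY (dt STRING, hr STRING)
        |STORED AS PARQUET
        |TBLPROPERTIES (
        |  'sink.partition-commit.trigger' = 'process-time',
        |  'sink.partition-commit.delay' = '0s',
        |  'sink.partition-commit.policy.kind' = 'metastore,success-file'
        |)
      """.stripMargin)

    // Continuous INSERT from Kafka into Hive; executeSql submits the job,
    // so no env.execute() is needed for this part
    tableEnv.getConfig.setSqlDialect(SqlDialect.DEFAULT)
    tableEnv.executeSql(
      """
        |INSERT INTO ods_user_event
        |SELECT user_id, event,
        |       DATE_FORMAT(ts, 'yyyy-MM-dd'), DATE_FORMAT(ts, 'HH')
        |FROM kafka_source
      """.stripMargin)
```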
Below is an example of consuming Kafka with Flink and writing the data into Hive, this time in Java:

```java
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import org.apache.flink.streaming.api.functions.sink.SinkFunction;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.catalog.hive.HiveCatalog;
import org.apache.flink.types.Row;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.Properties;

public class FlinkKafkaToHiveDemo {

    public static void main(String[] args) throws Exception {
        // Read command-line parameters
        final ParameterTool params = ParameterTool.fromArgs(args);

        // Set up the environment and checkpointing
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(10000, CheckpointingMode.EXACTLY_ONCE);
        env.getConfig().setGlobalJobParameters(params);

        // Kafka consumer configuration
        Properties kafkaProps = new Properties();
        kafkaProps.setProperty("bootstrap.servers", params.get("bootstrap.servers", "localhost:9092"));
        kafkaProps.setProperty("group.id", params.get("group.id", "my-flink-consumer-group"));

        // Create the FlinkKafkaConsumer
        FlinkKafkaConsumer<String> kafkaConsumer = new FlinkKafkaConsumer<>(
                params.get("topic", "my-kafka-topic"),
                new SimpleStringSchema(),
                kafkaProps);

        // Convert the Kafka stream into a Flink Table
        EnvironmentSettings settings = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build();
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env, settings);

        // For "my_hive_table" to resolve, a Hive catalog must be registered first
        // (catalog name, database, and conf dir are environment-specific)
        HiveCatalog hiveCatalog = new HiveCatalog("hive", "default", "/etc/hive/conf");
        tableEnv.registerCatalog("hive", hiveCatalog);
        tableEnv.useCatalog("hive");

        Table kafkaTable = tableEnv.fromDataStream(env.addSource(kafkaConsumer), "value");

        // Write the Table into Hive; executeInsert() submits the job itself,
        // so no separate env.execute() call is needed
        kafkaTable.executeInsert("my_hive_table");
    }

    // Alternative: a JDBC-based Hive sink (not wired into the pipeline above)
    public static class HiveSink extends RichSinkFunction<Row> {
        private Connection conn;
        private PreparedStatement stmt;

        @Override
        public void open(Configuration parameters) throws Exception {
            // Open the HiveServer2 connection
            Class.forName("org.apache.hive.jdbc.HiveDriver");
            conn = DriverManager.getConnection("jdbc:hive2://localhost:10000/default", "hive", "");
            stmt = conn.prepareStatement("INSERT INTO my_hive_table VALUES(?)");
        }

        @Override
        public void invoke(Row row, SinkFunction.Context context) throws Exception {
            // Write one row to Hive
            stmt.setString(1, row.getField(0).toString());
            stmt.executeUpdate();
        }

        @Override
        public void close() throws SQLException {
            // Close the statement before the connection
            if (stmt != null) {
                stmt.close();
            }
            if (conn != null) {
                conn.close();
            }
        }
    }
}
```

The example above uses FlinkKafkaConsumer to turn the Kafka data into a Flink Table and then writes it to Hive with executeInsert. A JDBC-based HiveSink is also defined as an alternative; note that it is not attached to the pipeline, so to use it you would call addSink on the source stream instead of going through the Table API. For the job to run, a reachable Hive service is required and a table named "my_hive_table" must already exist (for example with a single STRING column, matching the one-field rows this job produces).