demo: writing data into Hive with Flink 1.11.2

  • Environment preparation
1. Start the Hadoop cluster and the Hive metastore service.
2. Configure flink-conf.yaml and sql-client-defaults.yaml (see the catalog sketch after this list).
    Note: checkpointing must be enabled, otherwise Flink never commits partitions.
3. Start the Flink cluster.
    Start: yarn-session.sh -n 3 -s 3 -nm flink-session -d
    Stop:  yarn application -kill <applicationId>
4. Start the Kafka cluster.
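
For step 2, the SQL client learns about the Hive metastore through a catalog entry in sql-client-defaults.yaml. A minimal sketch, assuming hive-site.xml lives under /opt/hive/conf (the catalog name and paths are placeholders, adjust them to your installation):

catalogs:
  - name: myhive
    type: hive
    hive-conf-dir: /opt/hive/conf
    default-database: default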
    
  • flink-conf.yaml configuration
# The backend that will be used to store operator state checkpoints if
# checkpointing is enabled.
#
# Supported backends are 'jobmanager', 'filesystem', 'rocksdb', or the
# <class-name-of-factory>.
#
state.backend: filesystem

# Directory for checkpoints filesystem, when using any of the default bundled
# state backends.
#
state.checkpoints.dir: hdfs://hadoop001:9000/flink-checkpoints

# Default target directory for savepoints, optional.
#
state.savepoints.dir: hdfs://hadoop001:9000/flink-savepoints

# Flag to enable/disable incremental checkpoints for backends that
# support incremental checkpoints (like the RocksDB state backend).
#
# state.backend.incremental: false

# Retain the checkpoint files on HDFS after the job is cancelled
execution.checkpointing.externalized-checkpoint-retention: RETAIN_ON_CANCELLATION
# Checkpoint interval in milliseconds
execution.checkpointing.interval: 60000

execution.checkpointing.mode: EXACTLY_ONCE
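
With retention set to RETAIN_ON_CANCELLATION, the latest checkpoint stays under hdfs://hadoop001:9000/flink-checkpoints when a job is cancelled, and the job can later be resumed from it. A sketch of the restore command (<job-id> and chk-<n> are placeholders to look up on HDFS; the jar and its arguments are elided):

flink run -s hdfs://hadoop001:9000/flink-checkpoints/<job-id>/chk-<n> ...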

# Start the SQL client
./bin/sql-client.sh embedded
-- Switch to the Hive catalog
USE CATALOG myhive;
-- Use the Hive dialect
SET table.sql-dialect=hive;
CREATE TABLE hive_table (
  user_id STRING,
  order_amount DOUBLE
) PARTITIONED BY (dt STRING, hr STRING) STORED AS parquet TBLPROPERTIES (
  -- Extract the partition timestamp at hour granularity
  'partition.time-extractor.timestamp-pattern'='$dt $hr:00:00',
  -- Decide when to commit a partition from the extracted partition time plus the watermark
  'sink.partition-commit.trigger'='partition-time',
  -- Hour-level delay: a partition is committed once watermark > partition time + 1 hour
  'sink.partition-commit.delay'='1 h',
  -- Commit policy: first update the metastore (add the partition), then write a _SUCCESS file
  'sink.partition-commit.policy.kind'='metastore,success-file'
);
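
To make the commit timing concrete: a record with log_ts = 2020-05-20 12:05:00 lands in partition dt='2020-05-20', hr='12'; the timestamp pattern maps that partition to 2020-05-20 12:00:00, so with the 1 h delay the partition is committed (added to the metastore, then marked with a _SUCCESS file) once the watermark passes 2020-05-20 13:00:00.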

-- Switch back to the default dialect
SET table.sql-dialect=default;
CREATE TABLE kafka_table (
  user_id STRING,
  order_amount DOUBLE,
  log_ts TIMESTAMP(3),
  WATERMARK FOR log_ts AS log_ts - INTERVAL '5' SECOND
) WITH (
  'connector' = 'kafka',
  'topic' = 'kafka_table',
  'scan.startup.mode' = 'earliest-offset',
  'properties.group.id' = 'group1',
  'properties.bootstrap.servers' = 'hadoop001:9092,hadoop002:9092,hadoop003:9092',
  'format' = 'json',
  'json.fail-on-missing-field' = 'true',
  'json.ignore-parse-errors' = 'false'
);
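
For reference, with the json format's defaults in Flink 1.11 (TIMESTAMP(3) is parsed in SQL style, yyyy-MM-dd HH:mm:ss with an optional fraction), a message on the kafka_table topic looks like this (the values are illustrative):

{"user_id": "u001", "order_amount": 29.9, "log_ts": "2020-05-20 12:05:00.000"}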

-- Streaming SQL: continuously insert from Kafka into the Hive table
INSERT INTO hive_table SELECT user_id, order_amount, DATE_FORMAT(log_ts, 'yyyy-MM-dd'), DATE_FORMAT(log_ts, 'HH') FROM kafka_table;

-- Batch SQL: select with partition pruning
SELECT * FROM hive_table WHERE dt='2020-05-20' AND hr='12';
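
To run that lookup as a bounded batch job rather than a streaming scan, the 1.11 SQL client lets you switch the session's execution type first; a sketch:

SET execution.type=batch;
SELECT * FROM hive_table WHERE dt='2020-05-20' AND hr='12';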