启动docker镜像
apache/flink:1.17-scala_2.12
1、复制paimon包到flink/lib
paimon-flink-1.17-1.0-SNAPSHOT.jar
flink-shaded-hadoop-2-uber-2.7.5-10.0.jar
2、启动flink sql
./bin/sql-client.sh embedded -i conf/sql-client-init.sql
sql-client-init.sql文件:
-- Initialization script passed to sql-client.sh through the -i option.
-- Keys and values are quoted: SET 'key' = 'value' is the form used by every
-- other SET statement in these notes.
SET 'sql-client.execution.result-mode' = 'tableau';
SET 'execution.runtime-mode' = 'streaming';
SET 'parallelism.default' = '2';
-- NOTE(review): a bare number is presumably read as milliseconds, i.e. idle state
-- is dropped after 1 s. That is very short for a GROUP BY aggregation; confirm it is intended.
SET 'table.exec.state.ttl' = '1000';

-- Register the HDFS-backed Paimon catalog and make db_test the current database.
CREATE CATALOG paimon_hdfs_catalog WITH (
    'type' = 'paimon',
    'warehouse' = 'hdfs://namenode:8020/paimon'
);
USE CATALOG paimon_hdfs_catalog;
CREATE DATABASE IF NOT EXISTS db_test;
USE db_test;
3、创建catalog
备注:使用 HDFS 上的 warehouse 之前,需先启动 hdp(Hadoop)集群。
-- Variant 1: catalog whose warehouse lives on the local file system
CREATE CATALOG paimon_catalog WITH (
    'type' = 'paimon',
    'warehouse' = 'file:///data/paimon'
);

-- Variant 2: catalog whose warehouse lives on HDFS
CREATE CATALOG paimon_hdfs_catalog WITH (
    'type' = 'paimon',
    'warehouse' = 'hdfs://namenode:8020/paimon'
);

-- Switch to the HDFS catalog and work inside db_test
USE CATALOG paimon_hdfs_catalog;
CREATE DATABASE IF NOT EXISTS db_test;
USE db_test;
-- Target table: one row per word, keyed by the word itself.
-- The primary key is declared NOT ENFORCED.
CREATE TABLE word_count (
    word STRING,
    cnt BIGINT,
    PRIMARY KEY (word) NOT ENFORCED
);
-- Write data
-- Temporary source table backed by the datagen connector; generated values
-- for word have length 1.
CREATE TEMPORARY TABLE word_table (
    word STRING
) WITH (
    'connector' = 'datagen',
    'fields.word.length' = '1'
);

-- Paimon needs a checkpoint interval when running in streaming mode
SET 'execution.checkpointing.interval' = '10 s';

-- Continuously aggregate the generated words into the word_count table
INSERT INTO word_count
SELECT
    word,
    COUNT(*) AS cnt
FROM word_table
GROUP BY word;
OLAP Query
--
-- Print results in tableau format
SET 'sql-client.execution.result-mode' = 'tableau';

-- Switch to batch mode: clear the checkpoint interval first, then change the runtime mode
RESET 'execution.checkpointing.interval';
SET 'execution.runtime-mode' = 'batch';

-- OLAP-style read of the table (both of its columns)
SELECT
    word,
    cnt
FROM word_count;
Streaming Query
-- Return to streaming mode
SET 'execution.runtime-mode' = 'streaming';

-- Follow changes of word_count: map each count to an interval (cnt / 10000)
-- and report how many rows fall into each interval
WITH bucketed AS (
    SELECT cnt / 10000 AS `interval`
    FROM word_count
)
SELECT
    `interval`,
    COUNT(*) AS interval_cnt
FROM bucketed
GROUP BY `interval`;
使用 Flink 托管内存
Paimon 任务可以基于由 Flink 执行器(executor)管理的内存来创建内存池,例如 Flink TaskManager 中的托管内存。这样可以由执行器统一管理多个任务的写入缓冲区,从而提高 sink 的稳定性和性能。
如果使用 Flink 托管内存,可以设置以下属性:
sink.use-managed-memory-allocator(默认值:false)
如果设置为 true,则 Flink sink 将使用托管内存进行合并树(merge tree)操作。
否则,它将创建一个独立的内存分配器,这意味着每个任务分配并管理自己的内存池(堆内存)。如果一个执行器(Executor)中的任务过多,可能会导致性能问题,甚至出现内存溢出(OOM)。
sink.managed.writer-buffer-memory(默认值:256M)
写入缓冲区在托管内存(managed memory)中的权重。Flink 将根据该权重计算写入器的内存大小,实际使用的内存取决于运行环境。目前,这个属性中定义的内存大小等于运行时分配给写入缓冲区的确切内存。
在 SQL 中使用时,用户可以为 Flink 托管内存设置内存权重,然后 Flink 的 sink 算子(sink operator)将获取内存池大小,并为 Paimon 写入器创建分配器。
-- Set the checkpoint interval. Key and value are quoted, matching the
-- SET 'key' = 'value' form used by the other statements in these notes.
-- NOTE(review): a bare 6000 is presumably milliseconds (6 s); confirm.
SET 'execution.checkpointing.interval' = '6000';

-- Write data. The OPTIONS hint turns on the managed-memory allocator and sets
-- the writer-buffer memory for this INSERT only.
INSERT INTO word_count /*+ OPTIONS(
    'sink.use-managed-memory-allocator' = 'true',
    'sink.managed.writer-buffer-memory' = '256M'
) */
SELECT
    word,
    COUNT(*) AS cnt
FROM word_table
GROUP BY word;
设置 checkpoint 间隔(未写单位时按毫秒计,6000 即 6 秒):SET 'execution.checkpointing.interval' = '6000';
1、Flink checkpoint配置属性
设置动态选项
在与 Paimon 表交互时,可以在不更改目录(catalog)中的选项的情况下调整表选项。Paimon 将提取作业级别的动态选项,并在当前会话中生效。
动态表选项的键格式为 paimon.${catalogName}.${dbName}.${tableName}.${config_key}。其中 catalogName/dbName/tableName 可以是 *,表示匹配该部分的任意取值。
动态全局选项的键格式为 ${config_key},全局选项将对所有表生效。如果有冲突,表选项将覆盖全局选项。
例如:
-- Global option: set scan.timestamp-millis=1697018249001 for all tables
SET 'scan.timestamp-millis' = '1697018249001';
SELECT * FROM T;
-- Table-scoped option: set scan.timestamp-millis=1697018249000 for the table mycatalog.default.T
SET 'paimon.mycatalog.default.T.scan.timestamp-millis' = '1697018249000';
SELECT * FROM T;
-- Wildcard catalog: set scan.timestamp-millis=1697018249000 for the table default.T in any catalog
SET 'paimon.*.default.T.scan.timestamp-millis' = '1697018249000';
SELECT * FROM T;
-- Combined: set scan.timestamp-millis=1697018249000 for the table mycatalog.default.T1
-- and scan.timestamp-millis=1697018249001 for all other tables
-- (on conflict, the table-scoped option overrides the global one)
SET 'paimon.mycatalog.default.T1.scan.timestamp-millis' = '1697018249000';
SET 'scan.timestamp-millis' = '1697018249001';
-- NOTE: "xxxx" is a placeholder for the actual join condition
SELECT * FROM T1 JOIN T2 ON xxxx;