Flink datagen code:
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

public class TestDataKafka2doris {

    // Doris table reached through the JDBC connector
    // (the Doris FE speaks the MySQL protocol on port 9030).
    private static final String JDBC_SQL = "CREATE TABLE join_test (\n" +
            "  id INT,\n" +
            "  name STRING\n" +
            ") WITH (\n" +
            "  'connector' = 'jdbc',\n" +
            "  'url' = 'jdbc:mysql://192.168.6.143:9030/example_db?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC',\n" +
            "  'driver' = 'com.mysql.jdbc.Driver',\n" +
            "  'table-name' = 'join_test',\n" +
            "  'username' = 'root',\n" +
            "  'password' = 'root'\n" +
            ")";

    // Bounded datagen source: a sequence id from 1 to 10,000,000 at 10 rows/second.
    private static final String DATA_GEN = "CREATE TABLE datagen (\n" +
            "  id STRING,\n" +
            "  name STRING,\n" +
            "  user_age STRING,\n" +
            "  user_other STRING,\n" +
            "  ts AS localtimestamp\n" +
            ") WITH (\n" +
            "  'connector' = 'datagen',\n" +
            "  'rows-per-second' = '10',\n" +
            "  'fields.id.kind' = 'sequence',\n" +
            "  'fields.id.start' = '1',\n" +
            "  'fields.id.end' = '10000000',\n" +
            "  'fields.user_age.min' = '1',\n" +
            "  'fields.user_age.max' = '1000',\n" +
            "  'fields.name.length' = '2',\n" +
            "  'fields.user_other.length' = '10'\n" +
            ")";

    // Kafka sink; the same topic is consumed again from the SQL client below.
    private static final String KAFKA_SQL = "CREATE TABLE kafkaTable (\n" +
            "  id STRING,\n" +
            "  name STRING,\n" +
            "  user_age STRING,\n" +
            "  user_other STRING,\n" +
            "  ts TIMESTAMP(3)\n" +
            ") WITH (\n" +
            "  'connector' = 'kafka',\n" +
            "  'topic' = 'routine_load_test2',\n" +
            "  'properties.bootstrap.servers' = 'dev-ct6-dc-worker01:9092,dev-ct6-dc-worker02:9092,dev-ct6-dc-worker03:9092',\n" +
            "  'properties.group.id' = 'testGroup222',\n" +
            "  'format' = 'json',\n" +
            "  'scan.startup.mode' = 'earliest-offset'\n" +
            ")";

    public static void main(String[] args) {
        // Assumed wiring (the original snippet only declares the DDL strings):
        // register the tables and push the generated rows into Kafka.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
        tEnv.executeSql(JDBC_SQL);   // registered for completeness; not used in this pipeline
        tEnv.executeSql(DATA_GEN);
        tEnv.executeSql(KAFKA_SQL);
        tEnv.executeSql("INSERT INTO kafkaTable SELECT id, name, user_age, user_other, ts FROM datagen");
    }
}
Enter the Flink SQL client:
$FLINK_HOME/bin/sql-client.sh embedded -j /wyyt/software/flink-1.12.2-2.11/flink-1.12.2/lib/hudi-flink-bundle_2.1?-*.*.*.jar shell
---0, Set parameters
SET execution.result-mode=tableau;
SET table.exec.resource.default-parallelism = 4;
---1, Create the Kafka source table
CREATE TABLE data_gen (
id STRING,
name STRING,
user_age STRING,
user_other STRING,
ts TIMESTAMP(3)
) WITH (
'connector' = 'kafka',
'topic' = 'routine_load_test2',
'properties.bootstrap.servers' = 'dev-ct6-dc-worker01:9092,dev-ct6-dc-worker02:9092,dev-ct6-dc-worker03:9092',
'properties.group.id' = 'testGroup3',
'format' = 'json',
'scan.startup.mode' = 'earliest-offset'
);
---2, Create the Hudi tables
COW mode:
CREATE TABLE hudi_cow_data_gen(
id STRING,
name STRING,
user_age STRING,
user_other STRING,
ts TIMESTAMP(3),
PRIMARY KEY(id) NOT ENFORCED
)
WITH (
'connector' = 'hudi',
'path' = 'hdfs://bi-524:8020/tmp/default/hudi_cow_data_gen',
'table.type' = 'COPY_ON_WRITE',
'write.insert.drop.duplicates' = 'true'
);
MOR mode:
CREATE TABLE hudi_mor_data_gen(
id STRING,
name STRING,
user_age STRING,
user_other STRING,
ts TIMESTAMP(3),
PRIMARY KEY(id) NOT ENFORCED
)
WITH (
'connector' = 'hudi',
'path' = 'hdfs://bi-524:8020/tmp/default/hudi_mor_data_gen',
'table.type' = 'MERGE_ON_READ',
'read.streaming.enabled' = 'true',
'write.tasks'= '4',
'compaction.tasks'= '4',
'compaction.delta_seconds' = '5',
'compaction.delta_commits' = '1',
'read.streaming.check-interval' = '1'
);
--Partitioned table (alternative DDL for hudi_mor_data_gen):
CREATE TABLE hudi_mor_data_gen(
id STRING,
name STRING,
user_age STRING,
user_other STRING,
ts TIMESTAMP(3),
sdt STRING,
PRIMARY KEY(id) NOT ENFORCED
)
PARTITIONED BY (sdt)
WITH (
'connector' = 'hudi',
'path' = 'hdfs://bi-524:8020/tmp/default/hudi_mor_data_gen',
'table.type' = 'MERGE_ON_READ',
'read.streaming.enabled' = 'true',
'write.tasks'= '4',
'compaction.tasks'= '4',
'compaction.delta_seconds' = '5',
'compaction.delta_commits' = '1',
'read.streaming.check-interval' = '1'
);
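Note that the partitioned variant adds an sdt column that the datagen source does not emit, so it cannot be fed with the plain select * insert used in step 3. A minimal sketch of a matching insert, assuming sdt is a day partition derived from ts (the derivation is an assumption, not part of the original):
insert into hudi_mor_data_gen
select id, name, user_age, user_other, ts,
       DATE_FORMAT(ts, 'yyyyMMdd') as sdt  -- assumed partition value derived from ts
from data_gen;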
CREATE TABLE hudi_mor_data_gen2(
id STRING,
name STRING,
user_age STRING,
user_other STRING,
ts TIMESTAMP(3),
PRIMARY KEY(id) NOT ENFORCED
)
WITH (
'connector' = 'hudi',
'path' = 'hdfs://bi-524:8020/tmp/default/hudi_mor_data_gen2',
'table.type' = 'MERGE_ON_READ',
'read.streaming.enabled' = 'true',
'write.tasks'= '4',
'compaction.tasks'= '4',
'compaction.delta_seconds' = '180',
'compaction.delta_commits' = '3',
'read.streaming.check-interval' = '10'
);
---3, Write to Hudi
insert into hudi_cow_data_gen select * from data_gen;
insert into hudi_mor_data_gen select * from data_gen;
Query:
select count(*) from hudi_cow_data_gen;
select count(*) from hudi_mor_data_gen;
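Because the MOR tables are created with 'read.streaming.enabled' = 'true', a plain SELECT against them runs as a continuous streaming query that picks up new commits at the configured read.streaming.check-interval, for example:
select * from hudi_mor_data_gen;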
---4, Create the Hive table
Enter the Hive client:
hive
Load the dependency jar:
ADD JAR /wyyt/software/flink-1.12.2-2.11/flink-1.12.2/hudi-hadoop-mr-bundle-0.9.0-SNAPSHOT.jar;
COW mode:
CREATE EXTERNAL TABLE `hive_cow_data_gen`(
`_hoodie_commit_time` string,
`_hoodie_commit_seqno` string,
`_hoodie_record_key` string,
`_hoodie_partition_path` string,
`_hoodie_file_name` string,
`id` string,
`name` string,
`user_age` string,
`user_other` string,
`ts` string)
ROW FORMAT SERDE
  'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
  'org.apache.hudi.hadoop.HoodieParquetInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
  'hdfs://bi-524:8020/tmp/default/hudi_cow_data_gen';
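With the external table registered, the Hudi COW data can be queried from Hive directly; a quick sanity check (the count should eventually match the Flink-side query above):
select count(*) from hive_cow_data_gen;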