Hudi operation record backup and documentation

This article shows how to use Flink to generate data and write it into Hudi. It walks through creating the Kafka table, the Hudi tables, and the Hive tables, covering both the COW and MOR Hudi table types. It also touches on the data lake concept, the role Hudi plays in large-scale, near-real-time applications, and how Hudi integrates with Flink and Hive.


Flink DataGen job code:

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

public class TestDataKafka2doris {
    // Doris table for the downstream join test, exposed over the MySQL protocol (FE query port 9030).
    private static final String JDBC_SQL = "CREATE TABLE join_test (\n" +
            " id INT,\n" +
            " name STRING\n"+
            " ) WITH (\n"+
            "   'connector' = 'jdbc',\n"+
            "   'url' = 'jdbc:mysql://192.168.6.143:9030/example_db?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC',\n"+
            "   'driver' = 'com.mysql.jdbc.Driver',\n"+
            "   'table-name' = 'join_test',\n"+
            "   'username' = 'root',\n"+
            "   'password' = 'root'\n"+
            " )";



    // Bounded datagen source: sequential ids 1..10,000,000 with random name/age/other fields.
    private static final String DATA_GEN = "CREATE TABLE datagen (\n" +
            " id STRING,\n" +
            " name STRING,\n" +
            " user_age STRING,\n" +
            " user_other STRING,\n" +
            " ts AS localtimestamp\n" +
            ") WITH (\n" +
            " 'connector' = 'datagen',\n" +
            " 'rows-per-second'='10',\n" +
            " 'fields.id.kind'='sequence',\n" +
            " 'fields.id.start'='1',\n" +
            " 'fields.id.end'='10000000',\n" +
            " 'fields.user_age.min'='1',\n" +
            " 'fields.user_age.max'='1000',\n" +
            " 'fields.name.length'='2',\n" +
            " 'fields.user_other.length'='10'\n" +
            ")";



    // Kafka sink table; the Flink SQL examples below read this topic back as a source.
    private static final String KAFKA_SQL = "CREATE TABLE kafkaTable (\n" +
            " id STRING,\n" +
            " name STRING,\n" +
            " user_age STRING,\n" +
            " user_other STRING,\n" +
            " ts TIMESTAMP\n" +
            ") WITH (\n" +
            " 'connector' = 'kafka',\n" +
            " 'topic' = 'routine_load_test2',\n" +
            " 'properties.bootstrap.servers' = 'dev-ct6-dc-worker01:9092,dev-ct6-dc-worker02:9092,dev-ct6-dc-worker03:9092',\n" +
            " 'properties.group.id' = 'testGroup222',\n" +
            " 'format' = 'json',\n" +
            " 'scan.startup.mode' = 'earliest-offset'\n" +
            ")";

 

 

Enter the Flink SQL client:

$FLINK_HOME/bin/sql-client.sh embedded -j /wyyt/software/flink-1.12.2-2.11/flink-1.12.2/lib/hudi-flink-bundle_2.1?-*.*.*.jar shell
---0, set session parameters
set execution.result-mode=tableau;
SET table.exec.resource.default-parallelism = 4;
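Note: the Flink Hudi writer only commits data when a checkpoint completes, so checkpointing must be enabled on the cluster (for example via execution.checkpointing.interval in flink-conf.yaml); otherwise the streaming inserts below never become visible to readers.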
 


---1, create the Kafka source table


CREATE TABLE data_gen (
id STRING,
name STRING,
user_age STRING,
user_other STRING,
ts TIMESTAMP(3)
) WITH (
 'connector' = 'kafka',
 'topic' = 'routine_load_test2',
 'properties.bootstrap.servers' = 'dev-ct6-dc-worker01:9092,dev-ct6-dc-worker02:9092,dev-ct6-dc-worker03:9092',
 'properties.group.id' = 'testGroup3',
 'format' = 'json',
 'scan.startup.mode' = 'earliest-offset'
);
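
A quick sanity check that the source table actually reads from the topic (this runs as an unbounded streaming query; cancel it once rows show up):

select * from data_gen;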


---2, create the Hudi tables

COW (COPY_ON_WRITE) mode:

CREATE TABLE hudi_cow_data_gen(
id STRING,
name STRING,
user_age STRING,
user_other STRING,
ts TIMESTAMP(3),
PRIMARY KEY(id) NOT ENFORCED
)
WITH (
  'connector' = 'hudi',
  'path' = 'hdfs://bi-524:8020/tmp/default/hudi_cow_data_gen',
  'table.type' = 'COPY_ON_WRITE',
  'write.insert.drop.duplicates' = 'true'
); 
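Here 'write.insert.drop.duplicates' = 'true' tells the insert path to drop records whose primary key already exists, instead of appending duplicates.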


MOR (MERGE_ON_READ) mode:

CREATE TABLE hudi_mor_data_gen(
id STRING,
name STRING,
user_age STRING,
user_other STRING,
ts TIMESTAMP(3),
PRIMARY KEY(id) NOT ENFORCED
)
WITH (
  'connector' = 'hudi',
  'path' = 'hdfs://bi-524:8020/tmp/default/hudi_mor_data_gen',
  'table.type' = 'MERGE_ON_READ',
  'read.streaming.enabled' = 'true',
  'write.tasks'= '4',
  'compaction.tasks'= '4',
  'compaction.delta_seconds' = '5',
  'compaction.delta_commits' = '1',
  'read.streaming.check-interval' = '1'
);
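
Because read.streaming.enabled is 'true' in the DDL, a plain query on this table keeps running and picks up new commits roughly every read.streaming.check-interval second:

select * from hudi_mor_data_gen;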

-- Partitioned table (alternative definition of hudi_mor_data_gen with an sdt partition column; see the insert sketch after the DDL)
CREATE TABLE hudi_mor_data_gen(
id STRING,
name STRING,
user_age STRING,
user_other STRING,
ts TIMESTAMP(3),
sdt STRING,
PRIMARY KEY(id) NOT ENFORCED
)
PARTITIONED BY (sdt)
WITH (
  'connector' = 'hudi',
  'path' = 'hdfs://bi-524:8020/tmp/default/hudi_mor_data_gen',
  'table.type' = 'MERGE_ON_READ',
  'read.streaming.enabled' = 'true',
  'write.tasks'= '4',
  'compaction.tasks'= '4',
  'compaction.delta_seconds' = '5',
  'compaction.delta_commits' = '1',
  'read.streaming.check-interval' = '1'
);
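
The partitioned variant needs sdt filled in when writing; a minimal sketch that derives the partition value from ts with the built-in DATE_FORMAT function (assuming day-level partitions):

insert into hudi_mor_data_gen
select id, name, user_age, user_other, ts, DATE_FORMAT(ts, 'yyyy-MM-dd') as sdt
from data_gen;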


-- A second MOR table with less aggressive compaction (every 3 delta commits / 180s) and a slower 10s streaming-read poll interval, for comparison:
CREATE TABLE hudi_mor_data_gen2(
id STRING,
name STRING,
user_age STRING,
user_other STRING,
ts TIMESTAMP(3),
PRIMARY KEY(id) NOT ENFORCED
)
WITH (
  'connector' = 'hudi',
  'path' = 'hdfs://bi-524:8020/tmp/default/hudi_mor_data_gen2',
  'table.type' = 'MERGE_ON_READ',
  'read.streaming.enabled' = 'true',
  'write.tasks'= '4',
  'compaction.tasks'= '4',
  'compaction.delta_seconds' = '180',
  'compaction.delta_commits' = '3',
  'read.streaming.check-interval' = '10'
);
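
To feed this table too, the same pattern applies:

insert into hudi_mor_data_gen2 select * from data_gen;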

 
 


 
---3, write to Hudi

insert into hudi_cow_data_gen select * from data_gen;
insert into hudi_mor_data_gen select * from data_gen;

Query:
select count(*) from hudi_cow_data_gen;

select count(*) from hudi_mor_data_gen;

 

 

---4, create the Hive tables

Enter the Hive client:

hive


Load the Hudi Hive bundle:
ADD JAR /wyyt/software/flink-1.12.2-2.11/flink-1.12.2/hudi-hadoop-mr-bundle-0.9.0-SNAPSHOT.jar;
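
When querying Hudi MOR real-time views from Hive, the Hudi docs also recommend switching the input format first (likely unnecessary for plain COW reads):

set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;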

COW mode:

CREATE EXTERNAL TABLE `hive_cow_data_gen`(               
   `_hoodie_commit_time` string,                    
   `_hoodie_commit_seqno` string,                   
   `_hoodie_record_key` string,                     
   `_hoodie_partition_path` string,                 
   `_hoodie_file_name` string,                      
   `id` string,
   `name` string,
   `user_age` string,
   `user_other` string,
   `ts` string)                                                                
 ROW FORMAT SERDE
   'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
 STORED AS INPUTFORMAT
   'org.apache.hudi.hadoop.HoodieParquetInputFormat'
 OUTPUTFORMAT
   'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
 LOCATION
   'hdfs://bi-524:8020/tmp/default/hudi_cow_data_gen';  -- assumed: same path the Flink COW table writes to
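
If the LOCATION above matches where the Flink job wrote, a quick check from Hive:

select count(*) from hive_cow_data_gen;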
