【Clickhouse和Byconity对比】使用角度评测Clickhouse和Byconity

1. 建表

Clickhouse

-- clickhouse建表
-- 01 本地表样例
CREATE TABLE label_db.workload_local_v1 on cluster cowa_cluster
(
    `timestamp`               DateTime COMMENT '时间戳',
    `action`                  String COMMENT '操作类型',
    `user_id`                 UInt32 COMMENT '用户人id',
    `uid`                     UInt32 COMMENT '操作人id',
    `task_id`                 int COMMENT '任务id',
    `project_id`              int COMMENT '项目id',
    `work_time`               int COMMENT '工作时间',
    `object_size`             int COMMENT '新增对象数',
    `confirmed_prelabel_size` int COMMENT '已确认或修改预标注对象数',
    `prelabel_object_size`    int COMMENT '预标注对象总数',
    `review_size`             int COMMENT '审核对象数',
    `comment_size`            int COMMENT '批注对象数',
    `reviewed_size`           int COMMENT '被审核对象数',
    `commented_size`          int COMMENT '被批注对象数',
    `commit_label_1`          int COMMENT '提交标注1',
    `commit_review_1`         int COMMENT '提交审核1',
    `commit_review_2`         int COMMENT '提交审核2',
    `reject_review_1`         int COMMENT '驳回审核1',
    `reject_review_2`         int COMMENT '驳回审核2',
    `commit_reviewed_1`       int COMMENT '被审核1审核',
    `commit_reviewed_2`       int COMMENT '被审核2审核',
    `rejected_label_1`        int COMMENT '标注1被驳回',
    `rejected_review_1`       int COMMENT '审核1倍驳回',
    `dynamic_size`            int COMMENT '新增动态对象数',
    `static_size`             int COMMENT '新增静态对象数',
    `review_dynamic_size`     int COMMENT '审核动态对象数',
    `review_static_size`      int COMMENT '审核静态对象数',
    `is_rejected`             int COMMENT '是否被驳回',
    `_sign`                   UInt8,
    `_ver`                    UInt32
) ENGINE = ReplicatedReplacingMergeTree(
           '/clickhouse/tables/{shard}/workload_local_v1',
           '{replica}',
           _ver
    ) PARTITION BY toYYYYMM(timestamp)
      ORDER BY (
                uid,
                project_id,
                task_id,
                timestamp
          );
          
          
-- 02 分布式表样例
create table label_db.workload_v1 on cluster cowa_cluster
    as label_db.workload_local_v1 ENGINE = Distributed('cowa_cluster', 'label_db', label_db.workload_local_v1,
                                           cityHash64(user_id));

ByConity

-- 注意没有本地表分布式表的概念,直接创建
CREATE TABLE label_db.workload_v1
(
    `timestamp`               DateTime COMMENT '时间戳',
    `action`                  String COMMENT '操作类型',
    `user_id`                 UInt32 COMMENT '用户人id',
    `uid`                     UInt32 COMMENT '操作人id',
    `task_id`                 int COMMENT '任务id',
    `project_id`              int COMMENT '项目id',
    `work_time`               int COMMENT '工作时间',
    `object_size`             int COMMENT '新增对象数',
    `confirmed_prelabel_size` int COMMENT '已确认或修改预标注对象数',
    `prelabel_object_size`    int COMMENT '预标注对象总数',
    `review_size`             int COMMENT '审核对象数',
    `comment_size`            int COMMENT '批注对象数',
    `reviewed_size`           int COMMENT '被审核对象数',
    `commented_size`          int COMMENT '被批注对象数',
    `commit_label_1`          int COMMENT '提交标注1',
    `commit_review_1`         int COMMENT '提交审核1',
    `commit_review_2`         int COMMENT '提交审核2',
    `reject_review_1`         int COMMENT '驳回审核1',
    `reject_review_2`         int COMMENT '驳回审核2',
    `commit_reviewed_1`       int COMMENT '被审核1审核',
    `commit_reviewed_2`       int COMMENT '被审核2审核',
    `rejected_label_1`        int COMMENT '标注1被驳回',
    `rejected_review_1`       int COMMENT '审核1倍驳回',
    `dynamic_size`            int COMMENT '新增动态对象数',
    `static_size`             int COMMENT '新增静态对象数',
    `review_dynamic_size`     int COMMENT '审核动态对象数',
    `review_static_size`      int COMMENT '审核静态对象数',
    `is_rejected`             int COMMENT '是否被驳回',
    `_sign`                   UInt8
) ENGINE = CnchMergeTree(_ver) 
      PARTITION BY toDate(timestamp)
      UNIQUE KEY (
                uid,
                project_id,
                task_id,
                timestamp
          );

小结

  • ByConity不需要在每个节点建本地表和分布式表,更加简洁和易维护
  • ByConity的UNIQUE KEY(实现去重功能)相当于Clickhouse的ORDER BY ,语义更加合理。

2. 典型表引擎

Byconity的CnchMergeTree(唯一键引擎)对标Clickhouse最常用的ReplacingMergeTree
Byconity的数据按分区键Partition by进行分区,然后排序键Order by进行有序存储
Byconity同样通过Version的方式保留最新版本,CnchMergeTree后面直接指定version字段

2.1 Byconity版本管理

  • 不使用版本管理
-- 01
CREATE TABLE UniqTest_v1
(
    `key`       Int64,
    `val`       String,
    `eventTime` DateTime
)
    ENGINE = CnchMergeTree()-- 未加入版本管理
        ORDER BY `key`
        PRIMARY KEY `key` UNIQUE KEY `key`;
        
insert into UniqTest_v1 VALUES (1, 'first',  '2020-01-01 01:01:01');
insert into UniqTest_v1 VALUES (1, 'second', '2020-01-01 00:00:00');

在这里插入图片描述
结论:可以实现Upsert,但是不能保证时序

  • 使用版本管理
CREATE TABLE UniqTest_v2
(
    `key`       Int64,
    `val`       String,
    `eventTime` DateTime
)
    ENGINE = CnchMergeTree(eventTime)
        ORDER BY `key`
        PRIMARY KEY `key` UNIQUE KEY `key`;

insert into UniqTest_v2 VALUES (1, 'first', '2020-01-01 01:01:01');
insert into UniqTest_v2 VALUES (1, 'second', '2020-01-01 00:00:00');

在这里插入图片描述
结论:可以实现Upsert,且保证了时序

3. Byconity高级特性

3.1 投影

注意投影不能和UNIQUE KEY同时使用,下面是错误示例:

CREATE TABLE bigdata.iot_drive_base
(
  `create_time` DateTime,
  `local_time`  DateTime64(3, 'Asia/Shanghai'),
  `time`        String,
  `carNo`       String,
  `drivingMode` String,
  `latitude`    String,
  `longitude`   String,
  `mode`        String,
  `sequence`    int,
  `speed`       Float64,
  PROJECTION    prj_car (
      SELECT *
      ORDER BY
      carNo,
      `time`
      )
)
  ENGINE = CnchMergeTree()
      PARTITION BY toYYYYMMDD(create_time)
      PRIMARY KEY `time`
      UNIQUE KEY (`time`, carNo)
      TTL create_time + toIntervalDay(7);
      
      
-- DB::Exception: `Projection` cannot be used together with `UNIQUE KEY` SQLSTATE: 22000 (version 21.8.7.1)
  • 因为投影里面的Order By已经实现了UNIQUE KEY的功能,下面是正确示例:
CREATE TABLE bigdata.iot_drive_base
(
    `create_time` DateTime,
    `local_time`  DateTime64(3, 'Asia/Shanghai'),
    `time`        String,
    `carNo`       String,
    `drivingMode` String,
    `latitude`    String,
    `longitude`   String,
    `mode`        String,
    `sequence`    int,
    `speed`       Float64,
    PROJECTION    prj_car (
        SELECT *
        ORDER BY
        carNo,
        `time`
        )
)
    ENGINE = CnchMergeTree()
        PARTITION BY toYYYYMMDD(create_time)
        PRIMARY KEY `time`
        TTL create_time + toIntervalDay(7);

3.2 物化视图

注意目前Byconity官方显示还支持CnchMergeTree()引擎使用物化视图,但是聚合视图可以使用,示例如下:

CREATE MATERIALIZED VIEW bigdata.mv_iot_agg
            (
             `create_date` Date,
             `carNo` String,
             `latitude_nan_cnt` AggregateFunction(sum, UInt64),
             `latitude_null_cnt` AggregateFunction(sum, UInt64)
                )
            ENGINE = CnchAggregatingMergeTree
                PARTITION BY toDate(create_date)
                ORDER BY (create_date, carNo)
AS
SELECT toDate(create_time)                                 AS create_date,
       carNo,
       sumState(toUInt64(multiIf(latitude = 'NaN', 1, 0))) AS latitude_nan_cnt,
       sumState(toUInt64(multiIf(latitude = '', 1, 0)))    AS latitude_null_cnt
FROM bigdata.iot_drive_base
GROUP BY create_time,
         carNo;
set optimize_move_to_prewhere = 0; -- 目前的小bug,需要关闭优化器才可以正常查询

4. 函数

4.1 开窗

  • 区别于Clickhouse,Byconity实现了开窗,示例如下可以直接使用:
-- a1
select carNo,
       sum(diff_ts) / 3600 as mile
from (
         select carNo,
                toUnixTimestamp(lead_local_time) - toUnixTimestamp(local_time) as diff_ts
         from (
                  SELECT carNo,
                         local_time,
                         lead(local_time, 1) over(PARTITION BY carNo ORDER BY local_time) AS lead_local_time
                  FROM bigdata.iot_drive_base
                  where toDate(toUnixTimestamp(create_time)) = toDate(yesterday())
                  ) l1
         ) l2
where diff_ts < 60
  and diff_ts >= 0
group by carNo;

-- a2
select carNo,
       local_time,
       sequence,
       latitude,
       longitude,
       speed
from (
         SELECT carNo,
                local_time,
                sequence,
                latitude,
                longitude,
                speed,
                row_number() over(PARTITION BY carNo,local_time ORDER BY create_time desc) as rn
         FROM bigdata.iot_drive_base
         where toDate(toUnixTimestamp(create_time)) = toDate(yesterday())
         ) l1
where rn = 1;

4.2 其他普通函数

-- l1
SET optimize_move_to_prewhere = 1;
select l1.carNo,
       round(nvl(today_mile, 0), 0)                                                                   today_mile,
       round(nvl(yesterday_mile, 0), 0)                                                               yesterday_mile,
       round(case when today_mile > yesterday_mile then today_mile - yesterday_mile else 0 end, 2) as up_mile
from (
         select carNo,
                sum(speed) * 0.5 / 1000 today_mile
         from bigdata.iot_drive_base
         where toDate(toUnixTimestamp(create_time)) = toDate(today())
         group by carNo
         ) l1
         left join(
    select carNo,
           sum(speed) * 0.5 / 1000 yesterday_mile
    from bigdata.iot_drive_base
    where toDate(toUnixTimestamp(create_time)) = toDate(yesterday())
    group by carNo
    ) l2
                  on l1.carNo = l2.carNo;


-- l2
select toDate(toUnixTimestamp(create_time)) as dt,
       groupUniqArray(carNo)                as car_list,
       length(groupUniqArray(carNo))        as cars
from bigdata.iot_drive_base
group by toDate(toUnixTimestamp(create_time));

5. 总结

ByConityClickhouse整体进行了封装,实现了远程挂载HDFS,实现了类似Doris开窗能力的开窗函数,实现了简易建表易于维护。总体上还不错,一些Bug的解决观望后续社区的进展。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值