【数仓重构】基于StarRocks的敏捷数仓

在这里插入图片描述

1. StarRocks建表

1.1 背景

在特定的业务场景下,使用了StarRocks构建数仓越来越流行,尤其在云原生的场景下,StarRocks的BE/CN可以远程挂载分布式文件系统,使得存储计算分离,更加弹性,本文将介绍StarRocks在数仓方面的应用。

1.2 实现

这里注意date_trunc指定dt,类似Hive中的分区,如果不指定容易数据不均衡以及查询效率折扣。

CREATE TABLE bigdata.kafka_tb_mqtt_carinfo
(
    `time`   bigint,
    deviceName string,
    carno string,
    sequence bigint,
    drive string,
    now string,
    total string,
    power string,
    ioStatus string,
    productName string,
    dt       date,
    car_time datetime,
    obj_time string
)
    PARTITION BY date_trunc ('day',dt)
DISTRIBUTED BY HASH (deviceName)
    PROPERTIES
(
    "storage_volume" =
    "starrocks",
    "datacache.enable" =
    "true",
    "datacache.partition_duration" =
    "1 MONTH",
    "enable_persistent_index" =
    "true"
);

2. 增量数据导入

2.1 方式

这里采用了官方推荐的Routine Load方式,这种方式提供了比较快捷的ETL能力和比较好的健壮性,直接指定Kafka主题(这里指定mqtt_carinfo主题),直接路由到StarRocks表。

2.2 实现

CREATE
ROUTINE LOAD
bigdata.kafka_tb_mqtt_carinfo_load ON kafka_tb_mqtt_carinfo
COLUMNS (`time`,deviceName,carno=split_part(deviceName,'-',2),`sequence`,drive,now,total,power,ioStatus,productName,dt=from_unixtime(`time`/1000,'yyyy-MM-dd'),car_time=from_unixtime(substring(`time`,1,10),'yyyy-MM-dd HH:mm:ss'),obj_time=concat(from_unixtime(substring(`time`,1,10),'yyyy-MM-dd HH:mm:ss'),':',substring(`time`,11,13)))
PROPERTIES
(
    "desired_concurrent_number"="3",
    "max_batch_interval"="20",
    "max_batch_rows"="400000",
    "max_batch_size"="209715200",
    "strict_mode"="false",
    "format"="json"
)
FROM KAFKA
(
    "kafka_broker_list" = "172.16.xxx.40:9092,172.16.104.xxx:9092,172.16.xxx.42:9092",
    "kafka_topic" = "mqtt_carinfo",
    "kafka_partitions" = "0,1,2"
);

3. 存量数据从Hdfs导入

3.1 背景

我们的离线业务会采集Kafka数据到HDFS,因此在上一步从Kafka实时导入StarRocks后,为了兼顾以后统一数据查询,这里利用StarRocks查询远程文件的能力,查询HDFS,写入StarRocks。

3.2 实现

INSERT INTO kafka_tb_mqtt_carinfo
SELECT cast(concat(unix_timestamp(car_time), split_part(car_time, '.', 2)) as bigint) `time`,
       deviceName,
       split_part(deviceName, '-', 2)                                                 carno,
       `sequence`,
       drive,
       now,
       total,
       power,
       ioStatus,
       productName,
       cast(split_part(car_time, ' ', 1) as date)     as                              dt,
       cast(split_part(car_time, '.', 1) as datetime) as                              car_time,
       car_time                                       as                              obj_time
FROM FILES
     (
        "path" =
        "hdfs://172.16.xxx.51:8020/user/hive/warehouse/xxx_dwd.db/dwd_mqtt_carinfo_1d_d/dt=2024-09-12/000000_0",
        "format" = "orc"
     );

4. 构建业务所需物化视图

4.1 背景

StarRocks提供的物化视图比较好用,而且可以指定分区以及触发物化视图的时间间隔(这个能力很重要,省去了传统数仓上调度的步骤)

4.2 实现

CREATE
MATERIALIZED VIEW now_acc_fom_mqtt_carinfo_materialized
PARTITION BY dt
DISTRIBUTED BY HASH(carno)
REFRESH ASYNC START
('2022-09-01 10:00:00') EVERY (interval 1 minute) -- 类似于调度任务,这里设定每1分钟更新一次
AS
select obj_time,
       carno,
       manual,
       SUM(IF(diff_manual >= 0, diff_manual, 0)) over (partition by carno order by obj_time )        AS acc_manual,
        autonomous,
       SUM(IF(diff_autonomous >= 0, diff_autonomous, 0)) OVER (PARTITION BY carno ORDER BY obj_time) AS acc_autonomous,
        workon,
       SUM(IF(diff_workon >= 0, diff_workon, 0)) OVER (PARTITION BY carno ORDER BY obj_time)         AS acc_workon,
        workoff,
       SUM(IF(diff_workoff >= 0, diff_workoff, 0))
           OVER (PARTITION BY carno ORDER BY obj_time)                                               AS acc_workoff,
        remotecontrol,
       SUM(IF(diff_remotecontrol >= 0, diff_remotecontrol, 0))
           OVER (PARTITION BY carno ORDER BY obj_time)                                               AS acc_remotecontrol,
        manual_hours,
       SUM(IF(diff_manual_hours >= 0, diff_manual_hours, 0))
           OVER (PARTITION BY carno ORDER BY obj_time)                                               AS acc_manual_hours,
        autonomous_hours,
       SUM(IF(diff_autonomous_hours >= 0, diff_autonomous_hours, 0))
           OVER (PARTITION BY carno ORDER BY obj_time)                                               AS acc_autonomous_hours,
        workon_hours,
       SUM(IF(diff_workon_hours >= 0, diff_workon_hours, 0))
           OVER (PARTITION BY carno ORDER BY obj_time)                                               AS acc_workon_hours,
        workoff_hours,
       SUM(IF(diff_workoff_hours >= 0, diff_workoff_hours, 0))
           OVER (PARTITION BY carno ORDER BY obj_time)                                               AS acc_workoff_hours,
        remoteDrive_hours,
       SUM(IF(diff_remoteDrive_hours >= 0, diff_remoteDrive_hours, 0))
           OVER (PARTITION BY carno ORDER BY obj_time)                                               AS acc_remoteDrive_hours,
        statics_hours,
       SUM(IF(diff_statics_hours >= 0, diff_statics_hours, 0))
           OVER (PARTITION BY carno ORDER BY obj_time)                                               AS acc_statics_hours,
        dt
from (
         select obj_time,
                carno,
                manual,
                lead_manual - manual                       AS diff_manual,
                autonomous,
                lead_autonomous - autonomous               AS diff_autonomous,
                workon,
                lead_workon - workon                       AS diff_workon,
                workoff,
                lead_workoff - workoff                     AS diff_workoff,
                remotecontrol,
                lead_remotecontrol - remotecontrol         AS diff_remotecontrol,
                manual_hours,
                lead_manual_hours - manual_hours           AS diff_manual_hours,
                autonomous_hours,
                lead_autonomous_hours - autonomous_hours   AS diff_autonomous_hours,
                workon_hours,
                lead_workon_hours - workon_hours           AS diff_workon_hours,
                workoff_hours,
                lead_workoff_hours - workoff_hours         AS diff_workoff_hours,
                remoteDrive_hours,
                lead_remoteDrive_hours - remoteDrive_hours AS diff_remoteDrive_hours,
                statics_hours,
                lead_statics_hours - statics_hours         AS diff_statics_hours,
                dt
         from (
                  select obj_time,
                         carno,
                         manual,
                         lead(ifnull(manual, 0), 1) over (partition by carno order by obj_time)       lead_manual,
                          autonomous,
                         lead(ifnull(autonomous, 0), 1) over (partition by carno order by obj_time)   lead_autonomous,
                          workon,
                         lead(ifnull(workon, 0), 1) over (partition by carno order by obj_time)       lead_workon,
                          workoff,
                         lead(ifnull(workoff, 0), 1) over (partition by carno order by obj_time)      lead_workoff,
                          remotecontrol,
                         lead(ifnull(remotecontrol, 0), 1)
                             over (partition by carno order by obj_time)                             lead_remotecontrol,
                          manual_hours,
                         lead(ifnull(manual_hours, 0), 1) over (partition by carno order by obj_time) lead_manual_hours,
                          autonomous_hours,
                         lead(ifnull(autonomous_hours, 0), 1)
                             over (partition by carno order by obj_time)                             lead_autonomous_hours,
                          workon_hours,
                         lead(ifnull(workon_hours, 0), 1) over (partition by carno order by obj_time) lead_workon_hours,
                          workoff_hours,
                         lead(ifnull(workoff_hours, 0), 1)
                             over (partition by carno order by obj_time)                             lead_workoff_hours,
                          remoteDrive_hours,
                         lead(ifnull(remoteDrive_hours, 0), 1)
                             over (partition by carno order by obj_time)                             lead_remoteDrive_hours,
                          statics_hours,
                         lead(ifnull(statics_hours, 0), 1)
                             over (partition by carno order by obj_time)                             lead_statics_hours,
                          dt
                  from (
                           select obj_time,
                                  carno,
                                  cast(json_query(json_query(now, "$.mileage"), '$.manual') as double)         manual,
                                  cast(
                                          json_query(json_query(now, "$.mileage"), '$.autonomous') as double)  autonomous,
                                  cast(json_query(json_query(now, "$.mileage"), '$.workon') as double)         workon,
                                  cast(json_query(json_query(now, "$.mileage"), '$.workoff') as double)        workoff,
                                  cast(
                                          json_query(json_query(now, "$.mileage"), '$.remoteDrive') as double) remotecontrol,
                                  cast(json_query(json_query(now, "$.hours"), '$.manual') as double)           manual_hours,
                                  cast(
                                          json_query(json_query(now, "$.hours"), '$.autonomous') as double)    autonomous_hours,
                                  cast(json_query(json_query(now, "$.hours"), '$.workon') as double)           workon_hours,
                                  cast(json_query(json_query(now, "$.hours"), '$.workoff') as double)          workoff_hours,
                                  cast(
                                          json_query(json_query(now, "$.hours"), '$.remoteDrive') as double)   remoteDrive_hours,
                                  cast(json_query(json_query(now, "$.hours"), '$.statics') as double)          statics_hours,
                                  dt
                           from kafka_tb_mqtt_carinfo
                           where productName = 'ADC'
                       ) l1
              ) l2
     ) l3;

5. 应用层API查询物化视图

// 定义 CarData 结构体
#[derive(Debug, Deserialize, Serialize, Clone, sqlx::FromRow)]
pub struct CarData {
    pub carno: String,
    pub manual: f64,
    pub autonomous: f64,
    pub remotecontrol: f64,
    pub workon: f64,
    pub workoff: f64,
    pub manual_hours: f64,
    pub autonomous_hours: f64,
    pub remotecontrol_hours: f64,
    pub workon_hours: f64,
    pub workoff_hours: f64,
    pub statics_hours: f64,
}

pub async fn query_mile_and_time(
    db: StartRocksPool<MySql>,
    req: DurationsCarnoRequest,
) -> RecloudResult<Vec<CarData>> {
    info!("inside query_starrocks_duration_mileage");
    if req.car_no.is_empty() {
        error!("car_no is none or empty");
        return Err(RecloudError::RequestError);
    }

    let sql = format!(
        "
        select  carno,
                ifnull(round(max(acc_manual) - min(acc_manual), 2), 0)                       as manual,
                ifnull(round(max(acc_autonomous) - min(acc_autonomous), 2), 0)               as autonomous,
                ifnull(round(max(acc_remotecontrol) - min(acc_remotecontrol), 2), 0)         as remotecontrol,
                ifnull(round(max(acc_workon) - min(acc_workon), 2), 0)                       as workon,
                ifnull(round(max(acc_workoff) - min(acc_workoff), 2), 0)                     as workoff,
                ifnull(round(max(acc_manual_hours) - min(acc_manual_hours), 2), 0)           as manual_hours,
                ifnull(round(max(acc_autonomous_hours) - min(acc_autonomous_hours), 2), 0)   as autonomous_hours,
                ifnull(round(max(acc_remoteDrive_hours) - min(acc_remoteDrive_hours), 2), 0) as remotecontrol_hours,
                ifnull(round(max(acc_workon_hours) - min(acc_workon_hours), 2), 0)           as workon_hours,
                ifnull(round(max(acc_workoff_hours) - min(acc_workoff_hours), 2), 0)         as workoff_hours,
                ifnull(round(max(acc_statics_hours) - min(acc_statics_hours), 2), 0)         as statics_hours
        from bigdata.{}
        where dt = '{}'
        and carno = '{}'
        and obj_time >= '{}'
        and obj_time <= '{}'
        group by carno
        ",
        req.table,get_target_date(req.start_time.clone()),req.car_no, req.start_time, req.stop_time
    );
    info!("sql: {:?}", sql);
    let rows = match db.fetch_all(&*sql).await {
        Ok(it) => it,
        Err(err) => return Err(RecloudError::ConnectionError),
    };
    let mut data: Vec<CarData> = vec![];
    for row in rows.iter() {
        let carno = row.get(0);
        let manual: f64 = row.get(1);
        let autonomous: f64 = row.get(2);
        let remotecontrol: f64 = row.get(3);
        let workon: f64 = row.get(4);
        let workoff: f64 = row.get(5);
        let manual_hours: f64 = row.get(6);
        let autonomous_hours: f64 = row.get(7);
        let remotecontrol_hours: f64 = row.get(8);
        let workon_hours: f64 = row.get(9);
        let workoff_hours: f64 = row.get(10);
        let statics_hours: f64 = row.get(11);

        data.push(CarData {
            carno,
            manual,
            autonomous,
            remotecontrol,
            workon,
            workoff,
            manual_hours,
            autonomous_hours,
            remotecontrol_hours,
            workon_hours,
            workoff_hours,
            statics_hours,
        });
    }

    info!("{:?}", data);
    Ok(data)
}

  • 接口查询展示
    在这里插入图片描述
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值