1. Creating the Table in StarRocks
1.1 Background
Building a data warehouse on StarRocks has become increasingly popular for this kind of workload, especially in cloud-native scenarios: StarRocks BE/CN nodes can mount a remote distributed file system, separating storage from compute and making the cluster more elastic. This article walks through applying StarRocks to our warehouse.
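Under the shared-data architecture, the storage volume referenced by the table DDL below ("storage_volume" = "starrocks") must be created first. A minimal sketch, assuming an S3-compatible bucket; the bucket path, region, endpoint, and credentials are placeholders:

CREATE STORAGE VOLUME starrocks
TYPE = S3
LOCATIONS = ("s3://my-bucket/starrocks")  -- placeholder bucket/path
PROPERTIES
(
    "aws.s3.region" = "us-east-1",
    "aws.s3.endpoint" = "https://s3.us-east-1.amazonaws.com",
    "aws.s3.access_key" = "<access_key>",
    "aws.s3.secret_key" = "<secret_key>"
);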
1.2 Implementation
Note the date_trunc expression that partitions on dt, analogous to a partition column in Hive. Without it, data easily becomes unbalanced and query efficiency suffers.
CREATE TABLE bigdata.kafka_tb_mqtt_carinfo
(
    `time`      bigint,
    deviceName  string,
    carno       string,
    `sequence`  bigint,
    drive       string,
    now         string,
    total       string,
    power       string,
    ioStatus    string,
    productName string,
    dt          date,
    car_time    datetime,
    obj_time    string
)
PARTITION BY date_trunc('day', dt)
DISTRIBUTED BY HASH (deviceName)
PROPERTIES
(
    "storage_volume" = "starrocks",
    "datacache.enable" = "true",
    "datacache.partition_duration" = "1 MONTH",
    "enable_persistent_index" = "true"
);
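Partitions are created automatically as data for new days arrives. You can verify the partition layout with:

SHOW PARTITIONS FROM bigdata.kafka_tb_mqtt_carinfo;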
2. Incremental Data Ingestion
2.1 Approach
Here we use the officially recommended Routine Load approach, which provides convenient in-flight ETL and good robustness: you point the job at a Kafka topic (here, mqtt_carinfo) and it routes records straight into the StarRocks table.
2.2 Implementation
CREATE ROUTINE LOAD bigdata.kafka_tb_mqtt_carinfo_load ON kafka_tb_mqtt_carinfo
COLUMNS
(
    `time`,
    deviceName,
    carno = split_part(deviceName, '-', 2),
    `sequence`,
    drive,
    now,
    total,
    power,
    ioStatus,
    productName,
    dt = from_unixtime(`time` / 1000, 'yyyy-MM-dd'),
    car_time = from_unixtime(substring(`time`, 1, 10), 'yyyy-MM-dd HH:mm:ss'),
    obj_time = concat(from_unixtime(substring(`time`, 1, 10), 'yyyy-MM-dd HH:mm:ss'), ':', substring(`time`, 11, 13))
)
PROPERTIES
(
"desired_concurrent_number"="3",
"max_batch_interval"="20",
"max_batch_rows"="400000",
"max_batch_size"="209715200",
"strict_mode"="false",
"format"="json"
)
FROM KAFKA
(
"kafka_broker_list" = "172.16.xxx.40:9092,172.16.104.xxx:9092,172.16.xxx.42:9092",
"kafka_topic" = "mqtt_carinfo",
"kafka_partitions" = "0,1,2"
);
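The job then runs continuously in the background. Its state, consumed offsets, and error-row statistics can be inspected, and the job can be paused or resumed, with the standard commands:

SHOW ROUTINE LOAD FOR bigdata.kafka_tb_mqtt_carinfo_load;
PAUSE ROUTINE LOAD FOR bigdata.kafka_tb_mqtt_carinfo_load;
RESUME ROUTINE LOAD FOR bigdata.kafka_tb_mqtt_carinfo_load;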
3. Backfilling Historical Data from HDFS
3.1 Background
Our offline pipeline also lands the Kafka data on HDFS. Having switched real-time ingestion to StarRocks in the previous step, we still want one place to query all the data going forward, so we use StarRocks' ability to query remote files to read the HDFS files and write them into StarRocks.
3.2 Implementation
INSERT INTO kafka_tb_mqtt_carinfo
SELECT cast(concat(unix_timestamp(car_time), split_part(car_time, '.', 2)) as bigint) `time`,
deviceName,
split_part(deviceName, '-', 2) carno,
`sequence`,
drive,
now,
total,
power,
ioStatus,
productName,
cast(split_part(car_time, ' ', 1) as date) as dt,
cast(split_part(car_time, '.', 1) as datetime) as car_time,
car_time as obj_time
FROM FILES
(
    "path" = "hdfs://172.16.xxx.51:8020/user/hive/warehouse/xxx_dwd.db/dwd_mqtt_carinfo_1d_d/dt=2024-09-12/000000_0",
    "format" = "orc"
);
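A quick sanity check after the backfill is to count what actually landed in the target partition (the dt value matches the directory loaded above):

SELECT dt, count(*) AS row_cnt
FROM kafka_tb_mqtt_carinfo
WHERE dt = '2024-09-12'
GROUP BY dt;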
4. Building the Materialized View the Business Needs
4.1 Background
The asynchronous materialized views StarRocks provides are quite convenient: you can partition them and set the interval at which the refresh is triggered. This capability matters because it removes the separate scheduling step a traditional warehouse would need.
4.2 Implementation
CREATE MATERIALIZED VIEW now_acc_fom_mqtt_carinfo_materialized
PARTITION BY dt
DISTRIBUTED BY HASH (carno)
REFRESH ASYNC START ('2022-09-01 10:00:00') EVERY (interval 1 minute) -- acts like a scheduled job: here, refresh once per minute
AS
select obj_time,
       carno,
       manual,
       SUM(IF(diff_manual >= 0, diff_manual, 0)) OVER (PARTITION BY carno ORDER BY obj_time) AS acc_manual,
       autonomous,
       SUM(IF(diff_autonomous >= 0, diff_autonomous, 0)) OVER (PARTITION BY carno ORDER BY obj_time) AS acc_autonomous,
       workon,
       SUM(IF(diff_workon >= 0, diff_workon, 0)) OVER (PARTITION BY carno ORDER BY obj_time) AS acc_workon,
       workoff,
       SUM(IF(diff_workoff >= 0, diff_workoff, 0)) OVER (PARTITION BY carno ORDER BY obj_time) AS acc_workoff,
       remotecontrol,
       SUM(IF(diff_remotecontrol >= 0, diff_remotecontrol, 0)) OVER (PARTITION BY carno ORDER BY obj_time) AS acc_remotecontrol,
       manual_hours,
       SUM(IF(diff_manual_hours >= 0, diff_manual_hours, 0)) OVER (PARTITION BY carno ORDER BY obj_time) AS acc_manual_hours,
       autonomous_hours,
       SUM(IF(diff_autonomous_hours >= 0, diff_autonomous_hours, 0)) OVER (PARTITION BY carno ORDER BY obj_time) AS acc_autonomous_hours,
       workon_hours,
       SUM(IF(diff_workon_hours >= 0, diff_workon_hours, 0)) OVER (PARTITION BY carno ORDER BY obj_time) AS acc_workon_hours,
       workoff_hours,
       SUM(IF(diff_workoff_hours >= 0, diff_workoff_hours, 0)) OVER (PARTITION BY carno ORDER BY obj_time) AS acc_workoff_hours,
       remoteDrive_hours,
       SUM(IF(diff_remoteDrive_hours >= 0, diff_remoteDrive_hours, 0)) OVER (PARTITION BY carno ORDER BY obj_time) AS acc_remoteDrive_hours,
       statics_hours,
       SUM(IF(diff_statics_hours >= 0, diff_statics_hours, 0)) OVER (PARTITION BY carno ORDER BY obj_time) AS acc_statics_hours,
       dt
from (
    -- l3: delta between each reading and the next reading of the same car
    select obj_time,
           carno,
           manual,
           lead_manual - manual AS diff_manual,
           autonomous,
           lead_autonomous - autonomous AS diff_autonomous,
           workon,
           lead_workon - workon AS diff_workon,
           workoff,
           lead_workoff - workoff AS diff_workoff,
           remotecontrol,
           lead_remotecontrol - remotecontrol AS diff_remotecontrol,
           manual_hours,
           lead_manual_hours - manual_hours AS diff_manual_hours,
           autonomous_hours,
           lead_autonomous_hours - autonomous_hours AS diff_autonomous_hours,
           workon_hours,
           lead_workon_hours - workon_hours AS diff_workon_hours,
           workoff_hours,
           lead_workoff_hours - workoff_hours AS diff_workoff_hours,
           remoteDrive_hours,
           lead_remoteDrive_hours - remoteDrive_hours AS diff_remoteDrive_hours,
           statics_hours,
           lead_statics_hours - statics_hours AS diff_statics_hours,
           dt
    from (
        -- l2: fetch the next reading per car via LEAD
        select obj_time,
               carno,
               manual,
               lead(ifnull(manual, 0), 1) over (partition by carno order by obj_time) lead_manual,
               autonomous,
               lead(ifnull(autonomous, 0), 1) over (partition by carno order by obj_time) lead_autonomous,
               workon,
               lead(ifnull(workon, 0), 1) over (partition by carno order by obj_time) lead_workon,
               workoff,
               lead(ifnull(workoff, 0), 1) over (partition by carno order by obj_time) lead_workoff,
               remotecontrol,
               lead(ifnull(remotecontrol, 0), 1) over (partition by carno order by obj_time) lead_remotecontrol,
               manual_hours,
               lead(ifnull(manual_hours, 0), 1) over (partition by carno order by obj_time) lead_manual_hours,
               autonomous_hours,
               lead(ifnull(autonomous_hours, 0), 1) over (partition by carno order by obj_time) lead_autonomous_hours,
               workon_hours,
               lead(ifnull(workon_hours, 0), 1) over (partition by carno order by obj_time) lead_workon_hours,
               workoff_hours,
               lead(ifnull(workoff_hours, 0), 1) over (partition by carno order by obj_time) lead_workoff_hours,
               remoteDrive_hours,
               lead(ifnull(remoteDrive_hours, 0), 1) over (partition by carno order by obj_time) lead_remoteDrive_hours,
               statics_hours,
               lead(ifnull(statics_hours, 0), 1) over (partition by carno order by obj_time) lead_statics_hours,
               dt
        from (
            -- l1: flatten the nested JSON in `now` into typed columns
            select obj_time,
                   carno,
                   cast(json_query(json_query(now, '$.mileage'), '$.manual') as double)      manual,
                   cast(json_query(json_query(now, '$.mileage'), '$.autonomous') as double)  autonomous,
                   cast(json_query(json_query(now, '$.mileage'), '$.workon') as double)      workon,
                   cast(json_query(json_query(now, '$.mileage'), '$.workoff') as double)     workoff,
                   cast(json_query(json_query(now, '$.mileage'), '$.remoteDrive') as double) remotecontrol,
                   cast(json_query(json_query(now, '$.hours'), '$.manual') as double)        manual_hours,
                   cast(json_query(json_query(now, '$.hours'), '$.autonomous') as double)    autonomous_hours,
                   cast(json_query(json_query(now, '$.hours'), '$.workon') as double)        workon_hours,
                   cast(json_query(json_query(now, '$.hours'), '$.workoff') as double)       workoff_hours,
                   cast(json_query(json_query(now, '$.hours'), '$.remoteDrive') as double)   remoteDrive_hours,
                   cast(json_query(json_query(now, '$.hours'), '$.statics') as double)       statics_hours,
                   dt
            from kafka_tb_mqtt_carinfo
            where productName = 'ADC'
        ) l1
    ) l2
) l3;
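Refresh history and errors for the asynchronous view can be checked, and an out-of-cycle refresh triggered manually, with:

SHOW MATERIALIZED VIEWS WHERE NAME = 'now_acc_fom_mqtt_carinfo_materialized';
REFRESH MATERIALIZED VIEW now_acc_fom_mqtt_carinfo_materialized;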
5. Querying the Materialized View from the Application-Layer API
// Define the CarData struct that maps one result row
#[derive(Debug, Deserialize, Serialize, Clone, sqlx::FromRow)]
pub struct CarData {
pub carno: String,
pub manual: f64,
pub autonomous: f64,
pub remotecontrol: f64,
pub workon: f64,
pub workoff: f64,
pub manual_hours: f64,
pub autonomous_hours: f64,
pub remotecontrol_hours: f64,
pub workon_hours: f64,
pub workoff_hours: f64,
pub statics_hours: f64,
}
use sqlx::Row; // brings Row::get(index) into scope for the manual column reads below

pub async fn query_mile_and_time(
    db: StartRocksPool<MySql>,
    req: DurationsCarnoRequest,
) -> RecloudResult<Vec<CarData>> {
info!("inside query_starrocks_duration_mileage");
if req.car_no.is_empty() {
error!("car_no is none or empty");
return Err(RecloudError::RequestError);
}
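    // NOTE: request values are interpolated into the SQL string directly. Since
    // CarData already derives sqlx::FromRow, sqlx::query_as::<_, CarData>(...)
    // with .bind() parameters would be a safer alternative to format!().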
let sql = format!(
"
select carno,
ifnull(round(max(acc_manual) - min(acc_manual), 2), 0) as manual,
ifnull(round(max(acc_autonomous) - min(acc_autonomous), 2), 0) as autonomous,
ifnull(round(max(acc_remotecontrol) - min(acc_remotecontrol), 2), 0) as remotecontrol,
ifnull(round(max(acc_workon) - min(acc_workon), 2), 0) as workon,
ifnull(round(max(acc_workoff) - min(acc_workoff), 2), 0) as workoff,
ifnull(round(max(acc_manual_hours) - min(acc_manual_hours), 2), 0) as manual_hours,
ifnull(round(max(acc_autonomous_hours) - min(acc_autonomous_hours), 2), 0) as autonomous_hours,
ifnull(round(max(acc_remoteDrive_hours) - min(acc_remoteDrive_hours), 2), 0) as remotecontrol_hours,
ifnull(round(max(acc_workon_hours) - min(acc_workon_hours), 2), 0) as workon_hours,
ifnull(round(max(acc_workoff_hours) - min(acc_workoff_hours), 2), 0) as workoff_hours,
ifnull(round(max(acc_statics_hours) - min(acc_statics_hours), 2), 0) as statics_hours
from bigdata.{}
where dt = '{}'
and carno = '{}'
and obj_time >= '{}'
and obj_time <= '{}'
group by carno
",
req.table,get_target_date(req.start_time.clone()),req.car_no, req.start_time, req.stop_time
);
info!("sql: {:?}", sql);
    let rows = match db.fetch_all(&*sql).await {
        Ok(it) => it,
        Err(err) => {
            error!("StarRocks query failed: {:?}", err);
            return Err(RecloudError::ConnectionError);
        }
    };
let mut data: Vec<CarData> = vec![];
for row in rows.iter() {
let carno = row.get(0);
let manual: f64 = row.get(1);
let autonomous: f64 = row.get(2);
let remotecontrol: f64 = row.get(3);
let workon: f64 = row.get(4);
let workoff: f64 = row.get(5);
let manual_hours: f64 = row.get(6);
let autonomous_hours: f64 = row.get(7);
let remotecontrol_hours: f64 = row.get(8);
let workon_hours: f64 = row.get(9);
let workoff_hours: f64 = row.get(10);
let statics_hours: f64 = row.get(11);
data.push(CarData {
carno,
manual,
autonomous,
remotecontrol,
workon,
workoff,
manual_hours,
autonomous_hours,
remotecontrol_hours,
workon_hours,
workoff_hours,
statics_hours,
});
}
info!("{:?}", data);
Ok(data)
}
- API query result display