调整前跑完需要两个小时,调整后5分钟跑完
调整前HSQL:
insert overwrite table dwd.game_cid_new partition(thisdate,ckey)
select
b.cookie_key,
b.cookie_value,
count(*) views,
a.thisdate,
b.cookie_key
from dwd.d_op_behavior_host_new a
join (select qy_id,cookie_key,cookie_value from dim.dim_cookie_mapping where ckey in ('_cid','__ysuid')) b on a.id=b.qy_id
left semi join (select host from dim.dim_op_site where cateid in (
118,
150,
151,
152,
153,
154,
155,
42710,
44904,
44934,
44935,
44938,
44940,
44941,
44942,
44943
) ) c on a.host=c.host
where a.thisdate='$thisdate'
group by
b.cookie_key,
b.cookie_value,
a.thisdate
EXPLAIN查看执划,看到 dwd.d_op_behavior_host_new 与dim.dim_cookie_mapping先进行内连接,由于是一对多连接,数据成倍增长,帮调整为先进行left semi join过滤
调整后HSQL
insert overwrite table dwd.game_cid_new partition(thisdate,ckey)
select
b.cookie_key,
b.cookie_value,
count(*) views,
a.thisdate,
b.cookie_key
from dwd.d_op_behavior_host_new a
left semi join (select host from dim.dim_op_site where cateid in (
118,
150,
151,
152,
153,
154,
155,
42710,
44904,
44934,
44935,
44938,
44940,
44941,
44942,
44943
) ) c on a.host=c.host
join (select qy_id,cookie_key,cookie_value from dim.dim_cookie_mapping where ckey in ('_cid','__ysuid')) b on a.id=b.qy_id
where a.thisdate='$thisdate'
group by
b.cookie_key,
b.cookie_value,
a.thisdate