1 建议用left join代替 ,主表要为大表,然后附表中的字段为null晒选出去这样就可以查出not in 的数据
SELECT a.court_id,
sum(if(a.id_count >= 1, 1, 0)) house_count,
a.day_time
from (
select
n.court_id,
n.house_no,
count(distinct n.id ) id_count,
${end_day} day_time
from
(
SELECT r.court_id,
h.house_no,
r.id
FROM dwd_smart_community.dwd_house_info h
JOIN dwd_smart_community.dwd_person_resident_info r
ON h.court_id = r.court_id
AND h.building_no = r.building_no
AND h.unit_no = r.unit_no
AND h.house_no = r.house_no
where concat(substr(r.create_time, 1, 4), substr(r.create_time, 6, 2),
substr(r.create_time, 9, 2)) <= ${end_day}
) n
left join
(
SELECT t2.court_id,
t1.house_no,
t2.id
FROM dwd_smart_community.dwd_house_info t1
JOIN dwd_smart_community.dwd_person_resident_info t2
ON t1.court_id = t2.court_id
AND t1.building_no = t2.building_no
AND t1.unit_no = t2.unit_no
AND t1.house_no = t2.house_no
where concat(substr(t2.create_time, 1, 4), substr(t2.create_time, 6, 2),
substr(t2.create_time, 9, 2)) < ${end_day}
) m
on n.court_id = m.court_id and n.house_no = m.house_no
where m.house_no is null and m.court_id is null and m.id is null
GROUP BY n.court_id,
n.house_no
) a
GROUP BY a.court_id,
a.day_time
2 如果使用分区表后可以使用shell脚本
#!/usr/bin/env bash
startdate="$1"
enddate="$2"
echo 'startdate: '$startdate
echo 'enddate: '$enddate
echo "-----------------------------------"
#序列1-300,表示遍历300次,因为有结束时间的限制,所以实际上不会循环300次
for i in `seq 1 300`; do
#当开始时间大于结束时间时,直接结束脚本
if [[ $enddate -lt $startdate ]]; then
break
fi
echo $startdate
#执行hiveSQL脚本,我是需要按日期执行hiveSQL,这里可以无视
hive -e "
-- 防止中文乱码
set spark.driver.extraJavaOptions=-Dfile.encoding=utf-8;
set spark.executor.extraJavaOptions=-Dfile.encoding=utf-8;
set yarn.app.mapreduce.am.admin-command-opts=-Dfile.encoding=UTF-8;
SET hive.strict.checks.type.safety = false;
set hive.exec.dynamic.partition = true;
set hive.exec.dynamic.partition.mode=nonstrict;
SET hive.exec.max.dynamic.partitions = 10000;
SET hive.exec.max.dynamic.partitions.pernode = 10000;
INSERT overwrite TABLE dws_smart_community.dws_house_info partition (day_time)
select c.region_code,
c.region_name,
c.street_code,
c.street_name,
c.community_code,
c.community_name,
c.court_code,
c.court_name ,
t.house_count house_count,
t.toauditnum toauditnum,
'${startdate}' day_time
from (
select nvl(x.court_id, y.court_id) court_code,
nvl(y.house_count, 0) house_count,
nvl(x.toauditnum, 0) toauditnum,
nvl(x.day_time, y.day_time) day_time
from (
select t.court_id,
t.toauditnum,
t.day_time
from (
SELECT court_id,
sum(if(states = 1, 1, 0)) toauditnum,
concat(substr(create_time, 1, 4), substr(create_time, 6, 2),
substr(create_time, 9, 2)) day_time
FROM dwd_smart_community.dwd_person_resident_info
where length(concat(substr(create_time, 1, 4), substr(create_time, 6, 2),
substr(create_time, 9, 2))) = 8
and
concat(substr(create_time, 1, 4), substr(create_time, 6, 2),
substr(create_time, 9, 2)) = '${startdate}'
GROUP BY court_id,
concat(substr(create_time, 1, 4), substr(create_time, 6, 2),
substr(create_time, 9, 2))
) t
) x
full outer join
(
SELECT a.court_id,
sum(if(a.id_count >= 1, 1, 0)) house_count,
a.day_time
from (
select
n.court_id,
n.house_no,
count(distinct n.id ) id_count,
${startdate} day_time
from
(
SELECT r.court_id,
h.house_no,
r.id
FROM dwd_smart_community.dwd_house_info h
JOIN dwd_smart_community.dwd_person_resident_info r
ON h.court_id = r.court_id
AND h.building_no = r.building_no
AND h.unit_no = r.unit_no
AND h.house_no = r.house_no
where concat(substr(r.create_time, 1, 4), substr(r.create_time, 6, 2),
substr(r.create_time, 9, 2)) <= ${startdate}
) n
left join
(
SELECT t2.court_id,
t1.house_no,
t2.id
FROM dwd_smart_community.dwd_house_info t1
JOIN dwd_smart_community.dwd_person_resident_info t2
ON t1.court_id = t2.court_id
AND t1.building_no = t2.building_no
AND t1.unit_no = t2.unit_no
AND t1.house_no = t2.house_no
where concat(substr(t2.create_time, 1, 4), substr(t2.create_time, 6, 2),
substr(t2.create_time, 9, 2)) < ${startdate}
) m
on n.court_id = m.court_id and n.house_no = m.house_no
where m.house_no is null and m.court_id is null and m.id is null
GROUP BY n.court_id,
n.house_no
) a
GROUP BY a.court_id,
a.day_time
) y
on x.court_id = y.court_id and x.day_time = y.day_time
) t
join
(
select region_code
, region_name
, street_code
, street_name
, community_code
, community_name
, court_code
, court_name
from dim_smart_community.dim_area_info
where court_code <> ''
) c
on t.court_code = c.court_code
;
"
#每次执行后,使开始日期减一天,如果要正序,将下面-1换成+1即可,当然开始时间和结束时间也要换一下
startdate=$(date -d "$startdate +1 day" +%Y%m%d)
done