通过消除笛卡尔积将求解Advent of Code 2024第9题磁盘碎片整理的SQL提速百倍-优快云博客

前文给出了一个解答，它处理不了大字符串的输入，而正式测试数据有2万字节，每个字节表示长度，总长度=这些数之和，每个数0-9之间，要处理的字符数应该是10万左右

--正确但是很慢
with recursive t as
(select array_to_string([ repeat(case i%2 when 0 then chr((48+i//2)::int) else '.' end, substr(a, i+1, 1)::int) for i in range(0, length(a)-1)], '')s  
from(select trim(content) a from read_text('2409-input0.txt') )),
--from (select '2333133121414131402' a )),
v as(select s, replace(s, '.', '') n, reverse(n) r, length(s)-length(n) va , [ i  for i in range(length(s), 0, -1) if s[i:i]<>'.'] pos from t),
a using key(o)as (select 0 o, 1 lv, s, instr(s, '.') i, r, va,pos from v
union 
select 0, lv+1 , rpad(substr(s, 1, i-1) ||substr(r, lv, 1)  || s[ i+1:pos[lv]-1 ]  ,length(s)::int,'.') s , instr(s[i+1:pos[lv]], '.')+i i ,r,va,pos from a where lv<va and 
i=instr(s, '.')
)
select any_value(s), sum(n*(ascii(substr(s,n+1,1))-48))chksum from a, range(0, instr(s, '.')-1)b(n) ;
--select n , ascii(substr(s,n+1,1))-48 from a, range(0, instr(s, '.')-2)b(n) ;

一个字符串要20K，在递归CTE里面instr substr再拼接几万次迭代，而duckdb无法高效处理每次小迭代，其实应该把它转成行，然后表关联替换，这样就能充分利用duckdb的批量优势。
如下用关联代替递归CTE的代码正确，但是他能算出来10K的字符串，而15K就OOM错误。

with t as
(select array_to_string([ repeat(case i%2 when 0 then chr((48+i//2)::int) else '.' end, substr(a, i+1, 1)::int) for i in range(0, length(a)-1)], '')s  
from(select trim(content) a from read_text('2409-input3.txt') )),
--from (select '2333133121414131402' a )),
dzb as(select i, substr(s,i, 1)c from t,range(1, length(s)+1)t(i) ), --对照表
ept as(select row_number()over(order by i)rn, i, c from dzb where c='.'),   --空块位置, 正序
fid as(select row_number()over(order by i desc)rn,i, c from dzb where c<>'.'), --文件id位置，倒序
--交换位置，再拼回来, 只有空块i<文件i的才交换
ap as (select ept.rn, ept.i, fid.c from ept, fid where ept.rn=fid.rn and ept.i<fid.i
union all
select ept.rn, fid.i, ept.c from ept, fid where ept.rn=fid.rn and ept.i<fid.i
), 
a2 as(select i,c from ap
union all
select * from dzb where i not in (select i from ap)
), 
a as (select listagg(c,'' order by i)s from a2)
select any_value(s), sum(n*(ascii(substr(s,n+1,1))-48))chksum from a, range(0, instr(s, '.')-1)b(n) ;

把最后一步取子字符串的笛卡尔积去掉，还是同样的OOM错误，

with t as
(select array_to_string([ repeat(case i%2 when 0 then chr((48+i//2)::int) else '.' end, substr(a, i+1, 1)::int) for i in range(0, length(a)-1)], '')s  
from(select trim(content) a from read_text('2409-input3.txt') )),
--from (select '2333133121414131402' a )),
dzb as(select i, substr(s,i, 1)c from t,range(1, length(s)+1)t(i) ), --对照表
ept as(select row_number()over(order by i)rn, i, c from dzb where c='.'),   --空块位置, 正序
fid as(select row_number()over(order by i desc)rn,i, c from dzb where c<>'.'), --文件id位置，倒序
--交换位置，再拼回来, 只有空块i<文件i的才交换
ap as (select ept.rn, ept.i, fid.c from ept, fid where ept.rn=fid.rn and ept.i<fid.i
union all
select ept.rn, fid.i, ept.c from ept, fid where ept.rn=fid.rn and ept.i<fid.i
), 
a2 as(select i,c from ap
union all
select * from dzb where i not in (select i from ap)
)--, 
--a as (select listagg(c,'' order by i)s from a2)
select sum((i-1)*(ascii(c)-48))chksum from a2 ;

经过调试发现，早在生成对照表阶段，OOM已经发生，这里也有个笛卡尔积，20K的字符串复制了20K行，400M个字符，难怪会内存溢出。将其改成用unnest 列表操作转成行。再用row_number添加序号，这里不能用generate_subscripts，因为它产生的是最后一次展开前原列表元素的下标。
如下所示

D select  generate_subscripts(l, 1),unnest(l) from (select [[ 'a' for i in range(x)]for x in range(1,3)] l);
┌───────────────────────────┬───────────┐
│ generate_subscripts(l, 1) │ unnest(l) │
│           int64           │ varchar[] │
├───────────────────────────┼───────────┤
│                         1 │ [a]       │
│                         2 │ [a, a]    │
└───────────────────────────┴───────────┘
D select generate_subscripts(l, 1),unnest(l) from (select unnest([[ 'a' for i in range(x)]for x in range(1,3)])l);
┌───────────────────────────┬───────────┐
│ generate_subscripts(l, 1) │ unnest(l) │
│           int64           │  varchar  │
├───────────────────────────┼───────────┤
│                         1 │ a         │
│                         1 │ a         │
│                         2 │ a         │
└───────────────────────────┴───────────┘

改后的代码如下

with t as
(select unnest([[ case i%2 when 0 then chr((48+i//2)::int) else '.' end for x in range( substr(a, i+1, 1)::int)] for i in range(0, length(a)-1)]) l  
from(select trim(content) a from read_text('2409-input.txt') )),
--from (select '2333133121414131402' a )),
dzb as(select row_number()over()i, c from(select unnest(l) c from t )), --对照表
ept as(select row_number()over(order by i)rn, i, c from dzb where c='.'),   --空块位置, 正序
fid as(select row_number()over(order by i desc)rn,i, c from dzb where c<>'.'), --文件id位置，倒序
--交换位置，再拼回来, 只有空块i<文件i的才交换
ap as (select ept.rn, ept.i, fid.c from ept, fid where ept.rn=fid.rn and ept.i<fid.i
union all
select ept.rn, fid.i, ept.c from ept, fid where ept.rn=fid.rn and ept.i<fid.i
), 
a2 as(select i,c from ap
union all
select * from dzb where i not in (select i from ap)
)--, 
--a as (select listagg(c,'' order by i)s from a2)
, a as (select listagg(c,'' order by i)s from a2)
--select any_value(s), sum(n*(ascii(substr(s,n+1,1))-48))chksum from a, range(0, instr(s, '.')-1)b(n) ;
select sum((i-1)*(ascii(c)-48))chksum from a2 where c<>'.';

D .read 2409code.txt
┌─────────────────┐
│     chksum      │
│     int128      │
├─────────────────┤
│  6333571086074  │
│ (6.33 trillion) │
└─────────────────┘
Run Time (s): real 0.121 user 0.125000 sys 0.000000
D .read 2409code.txt
┌────────┐
│ chksum │
│ int128 │
├────────┤
│  962   │
└────────┘
Run Time (s): real 0.013 user 0.000000 sys 0.000000

速度令人满意，但结果又不对了。
将最后一步改回旧版本，注意取子字符串这里又引入了笛卡尔积，完整输入文件一下慢了很多，但计算出了答案，提交以后，是正确的。

D .read 2409code.txt
┌────────────────────────────────────────────┬────────┐
│                any_value(s)                │ chksum │
│                  varchar                   │ int128 │
├────────────────────────────────────────────┼────────┤
│ 0099811188827773336446555566.............. │  1928  │
└────────────────────────────────────────────┴────────┘
Run Time (s): real 0.019 user 0.000000 sys 0.000000
D .read 2409code.txt
100% ▕██████████████████████████████████████▏ (00:00:27.25 elapsed)
┌──────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────┐
│                                             any_value(s)                                             │    chksum     │
│                                               varchar                                                │    int128     │
├──────────────────────────────────────────────────────────────────────────────────────────────────────┼───────────────┤
│ 0000???11?2?3333?4??????5555?666??7788888????9?????:::::::;;;;;???????<<<<<<<??????===???????>>???…  │ 6340197768906 │
└──────────────────────────────────────────────────────────────────────────────────────────────────────┴───────────────┘
Run Time (s): real 27.302 user 6.265625 sys 16.765625

考虑到错误和正确的结果相差的比例不大，一定是遗漏了某个条件，经过检查，在慢速版本中，只对空块之前的位置计算，而快速版本未加此条件，导致将.字符当作数字计算了，而它的ascii码小于0的ascii码，减48成为负数，导致总和减少，加上此条件，结果就对了，处理完整输入文件时间从27秒降到0.12秒。

D .read 2409code.txt
┌────────┐
│ chksum │
│ int128 │
├────────┤
│  1928  │
└────────┘
Run Time (s): real 0.039 user 0.000000 sys 0.000000
D .read 2409code.txt
┌─────────────────┐
│     chksum      │
│     int128      │
├─────────────────┤
│  6340197768906  │
│ (6.34 trillion) │
└─────────────────┘
Run Time (s): real 0.123 user 0.109375 sys 0.015625