GPDB的数据倾斜问题

GPDB在运行中可能出现数据倾斜问题,导致不同节点数据分布不均,影响查询性能。通常,通过特定查询手段检查数据分布,但面对多个表时,此方法效率低下。有开发者提供了函数来简化这一过程,通过执行该函数可以便捷地检查数据倾斜程度。当数据差距(gap)过大时,调整分布键成为必要的优化策略。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

DB运行了一段时间后,数据发生了变化,在不同节点的数据分布会发生问题,有的节点分配的数据较多有的较少,这样在查询的时候会导致性能的降低,我们常规的查询手段是通过

Select gp_segment_id,count(*) from  tablename  group by 1 ;

但是如果库中的表比较多,这样查询很费事,有人编写了函数,拿来主义

CREATE OR REPLACE FUNCTION public.fn_get_skew(out schema_name      varchar,
                                              out table_name       varchar,
                                              out pTableName       varchar,
                                              out total_size_GB    numeric(15,2),
                                              out seg_min_size_GB  numeric(15,2),
                                              out seg_max_size_GB  numeric(15,2),
                                              out seg_avg_size_GB  numeric(15,2),
                                              out seg_gap_min_max_percent numeric(6,2),
                                              out seg_gap_min_max_GB      numeric(15,2),
                                              out nb_empty_seg     int) RETURNS SETOF record AS
$$
DECLARE
    v_function_name text := 'fn_get_skew';
    v_location int;
    v_sql text;
    v_db_oid text;
    v_num_segments numeric;
    v_skew_amount numeric;
    v_res record;
BEGIN
    v_location := 1000;
    SELECT oid INTO v_db_oid
    FROM pg_database
    WHERE datname = current_database();

    v_location := 2200;
    v_sql := 'DROP EXTERNAL TABLE IF EXISTS public.db_files_ext';

    v_location := 2300;
    EXECUTE v_sql;

    v_location := 3000;
    v_sql := 'CREATE EXTERNAL WEB TABLE public.db_files_ext ' ||
            '(segment_id int, relfilenode text, filename text, ' ||
            'size numeric) ' ||
            'execute E''ls -l $GP_SEG_DATADIR/base/' || v_db_oid ||
            ' | ' ||
            'grep gpadmin | ' ||
            E'awk {''''print ENVIRON["GP_SEGMENT_ID"] "\\t" $9 "\\t" ' ||
            'ENVIRON["GP_SEG_DATADIR"] "/' || v_db_oid ||
            E'/" $9 "\\t" $5''''}'' on all ' || 'format ''text''';

    v_location := 3100;
    EXECUTE v_sql;

    v_location := 4000;
    for v_res in (
                select  sub.vschema_name,
                        sub.vtable_name,
                        (sum(sub.size)/(1024^3))::numeric(15,2) AS vtotal_size_GB,
                        --Size on segments
                        (min(sub.size)/(1024^3))::numeric(15,2) as vseg_min_size_GB,
                        (max(sub.size)/(1024^3))::numeric(15,2) as vseg_max_size_GB,
                        (avg(sub.size)/(1024^3))::numeric(15,2) as vseg_avg_size_GB,
                        --Percentage of gap between smaller segment and bigger segment
                        (100*(max(sub.size) - min(sub.size))/greatest(max(sub.size),1))::numeric(6,2) as vseg_gap_min_max_percent,
                        ((max(sub.size) - min(sub.size))/(1024^3))::numeric(15,2) as vseg_gap_min_max_GB,
                        count(sub.size) filter (where sub.size = 0) as vnb_empty_seg
                    from (
                        SELECT  n.nspname AS vschema_name,
                                c.relname AS vtable_name,
                                db.segment_id,
                                sum(db.size) AS size
                            FROM ONLY public.db_files_ext db
                                JOIN pg_class c ON split_part(db.relfilenode, '.'::text, 1) = c.relfilenode::text
                                JOIN pg_namespace n ON c.relnamespace = n.oid
                            WHERE c.relkind = 'r'::"char"
                                and n.nspname not in ('pg_catalog','information_schema','gp_toolkit')
                                and not n.nspname like 'pg_temp%'
                            GROUP BY n.nspname, c.relname, db.segment_id
                        ) sub
                    group by 1,2
                    --Extract only table bigger than 1 GB
                    --   and with a skew greater than 20%
                    /*having sum(sub.size)/(1024^3) > 1
                        and (100*(max(sub.size) - min(sub.size))/greatest(max(sub.size),1))::numeric(6,2) > 20
                    order by 1,2,3
                    limit 100*/ ) loop
        schema_name         = v_res.vschema_name;
        table_name          = v_res.vtable_name;
        total_size_GB       = v_res.vtotal_size_GB;
        seg_min_size_GB     = v_res.vseg_min_size_GB;
        seg_max_size_GB     = v_res.vseg_max_size_GB;
        seg_avg_size_GB     = v_res.vseg_avg_size_GB;
        seg_gap_min_max_percent = v_res.vseg_gap_min_max_percent;
        seg_gap_min_max_GB  = v_res.vseg_gap_min_max_GB;
        nb_empty_seg        = v_res.vnb_empty_seg;
        return next;
    end loop;

    v_location := 4100;
    v_sql := 'DROP EXTERNAL TABLE IF EXISTS public.db_files_ext';

    v_location := 4200;
    EXECUTE v_sql;

    return;
EXCEPTION
        WHEN OTHERS THEN
                RAISE EXCEPTION '(%:%:%)', v_function_name, v_location, sqlerrm;
END;
$$
language plpgsql;

执行方式:

select * from public.fn_get_skew();

通过查询我们可以发现
这里写图片描述
如果gap数据过大,我们就需要调整分布键来调整分布情况。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值