1、背景
某某大数据系统接入很多接口。现需要将这部分接口任务修改为适配国产梧桐云原生分析型数据库;有部分接口,源端表提供的数据量大且会有几条、十几条内容不符合规范,导致梧桐云原生分析型数据库无法成功入库,为确保不因很少比例的数据影响整体接口成功接入,从而影响整体业务,需要对梧桐云原生分析型数据库设置入库最大抛弃条数。
2、设置抛弃条数的可行性
一般可以设置抛弃条数的接口数据都有数据量大的特点,对端传的文件有问题导致无法入库,对端又无法解决,数据量大且错误数据又不多的情况下,可以设置抛弃几条;
3、设置入库抛弃条数
3.1封装函数名
test_xxx
3.2 底层实现
3.2.1创建函数
CREATE FUNCTION test_xxx( dir text, filename text, loadtype text, schemaname text, tablename text, filedelimiter text, rej_max text)
RETURNS text AS
$$
DECLARE
V_MVIEWS RECORD;
V_COLUMNSTR TEXT := ‘’;
V_INDEX SMALLINT := 0;
V_SNAME TEXT;
V_TNAME TEXT;
str_sql TEXT := ‘’;
ETL_DATE DATE;
load_run_start_time TIMESTAMP;
load_run_end_time TIMESTAMP;
load_total_time TEXT;
load_step_start_time TIMESTAMP;
load_step_end_time TIMESTAMP;
load_step_total_time TEXT;
external_table_name TEXT := ‘’;
external_err_table_name TEXT := ‘’;
external_web_table_name TEXT := ‘’;
external_newline TEXT := ‘’;
md5_suffix TEXT := ‘8765c258776a38049c97416bad678b0f5c6c0f892b12f6518b5cc’;
load_type_lower TEXT := ‘’;
LOCATION TEXT := ‘’;
err_num BIGINT := 0;
ret TEXT=‘’;
BEGIN
SET CLIENT_MIN_MESSAGES = WARNING;
–新增20220824
SET NEW_EXECUTOR = OFF;
V_SNAME := lower(schemaname);
V_TNAME := lower(tablename);
load_type_lower := lower(loadtype);
–判断是否符合加载条件
IF load_type_lower NOT IN (‘replace’, ‘insert’) THEN
RETURN ‘Please enter the correct value for load type: replace or insert’;
END IF;
–生成外表名称
SELECT substr(md5(random()::varchar)||md5(random()::varchar), 0, 54) INTO md5_suffix;
external_table_name :=‘EXT_’ || md5_suffix;
external_err_table_name := ‘ERR_’ || md5_suffix;
–判断是否为DOS格式
external_web_table_name :=‘WEB_’ || md5_suffix;
SELECT timeofday()::timestamp INTO load_step_start_time;
str_sql:=‘drop EXTERNAL TABLE if exists ‘||V_SNAME||’.’||external_web_table_name;
execute str_sql;
RAISE info ‘%’,str_sql;
str_sql:=’
CREATE readable EXTERNAL WEB TABLE ‘||V_SNAME||’.‘||external_web_table_name||’
(rt text)
execute ‘’/usr/local/oushu/ext_script/GetFileType.sh ‘||dir||’ ‘||filename||’ ‘’ on master format ‘‘text’’ (delimiter ‘’|‘’)
‘;
RAISE info ‘%’,str_sql;
execute str_sql;
str_sql=‘select rt from ‘||V_SNAME||’.’||external_web_table_name;
RAISE info ‘%’,str_sql;
execute str_sql into ret;
IF ret = 0 THEN
external_newline :=‘LF’;
ELSE
external_newline :=‘CRLF’;
END IF;
str_sql:=‘drop EXTERNAL TABLE if exists ‘||V_SNAME||’.’||external_web_table_name;
execute str_sql;
RAISE info ‘%’,str_sql;
RAISE INFO ‘step % succeeded’, 0;
SELECT timeofday()::timestamp INTO load_step_end_time;
load_step_total_time := load_step_end_time - load_step_start_time;
RAISE INFO ‘step % time %’, 0, load_step_total_time ;
SELECT timeofday()::timestamp INTO load_run_start_time;
RAISE INFO ‘Start load: %’, load_run_start_time;
SELECT timeofday()::timestamp INTO load_step_start_time;
LOCATION := ‘LOCATION (’’’ || dir || ‘/’ || filename || E’‘’) \n FORMAT ‘‘text’’ (DELIMITER ‘’’ || filedelimiter || ‘’’ NULL ‘’‘’ ESCAPE ‘‘off’’ NEWLINE ‘’‘||external_newline||’‘’ )‘;
RAISE INFO ‘%’, LOCATION;
str_sql = ‘CREATE READABLE EXTERNAL TABLE ‘|| V_SNAME ||’.’|| external_table_name || E’\n(\n LIKE ’ || V_SNAME || ‘.’ || V_TNAME || E’\n) \n’ || LOCATION || ‘LOG ERRORS INTO ‘||V_SNAME||’.’||external_err_table_name||’ SEGMENT REJECT LIMIT 2147483647’;
RAISE INFO ‘%’, str_sql;
EXECUTE str_sql ;
RAISE INFO ‘step % succeeded’, 1;
SELECT timeofday()::timestamp INTO load_step_end_time;
load_step_total_time := load_step_end_time - load_step_start_time;
RAISE INFO ‘step % time %’, 1, load_step_total_time ;
–插入数据,若是为replace则先清空表
SELECT timeofday()::timestamp INTO load_step_start_time;
IF load_type_lower = ‘replace’ THEN
str_sql = ‘TRUNCATE TABLE ’ || V_SNAME || ‘.’ || V_TNAME || ‘;’;
RAISE INFO ‘%’, str_sql;
EXECUTE str_sql ;
END IF;
str_sql = ‘INSERT INTO ’ || schemaname || ‘.’ || tablename || ’ SELECT * FROM ‘|| V_SNAME ||’.’ || external_table_name;
RAISE INFO ‘%’, str_sql;
EXECUTE str_sql ;
RAISE INFO ‘step % succeeded’, 2;
SELECT timeofday()::timestamp INTO load_step_end_time;
load_step_total_time := load_step_end_time - load_step_start_time;
RAISE INFO ‘step % time %’, 2, load_step_total_time ;
–删掉外表
SELECT timeofday()::timestamp INTO load_step_start_time;
str_sql = ‘DROP EXTERNAL TABLE ‘|| V_SNAME ||’.’|| external_table_name;
RAISE INFO ‘%’, str_sql;
EXECUTE str_sql ;
RAISE INFO ‘step % succeeded’, 3;
SELECT timeofday()::timestamp INTO load_step_end_time;
load_step_total_time := load_step_end_time - load_step_start_time;
RAISE INFO ‘step % time %’, 3, load_step_total_time ;
–统计err表记录数量
str_sql = ‘SELECT count(*) FROM ‘||V_SNAME||’.’||external_err_table_name;
RAISE INFO ‘%’, str_sql;
EXECUTE str_sql into err_num;
IF err_num > cast(rej_max as bigint) THEN
RAISE INFO ‘The number of error records exceeds the limit %’, rej_max;
str_sql = ‘insert into MASADW.DATA_LOAD_ERRORS SELECT * FROM ‘||V_SNAME||’.’||external_err_table_name ||’ LIMIT '||rej_max;
EXECUTE str_sql;
exit;
END IF;
–删掉err表
SELECT timeofday()::timestamp INTO load_step_start_time;
str_sql = ‘INSERT INTO MASADW.DATA_LOAD_ERRORS SELECT * FROM ‘||V_SNAME||’.’||external_err_table_name;
RAISE INFO ‘%’, str_sql;
EXECUTE str_sql ;
str_sql = ‘DROP TABLE IF EXISTS ‘||V_SNAME||’.’||external_err_table_name;
RAISE INFO ‘%’, str_sql;
EXECUTE str_sql ;
RAISE INFO ‘step % succeeded’, 4;
SELECT timeofday()::timestamp INTO load_step_end_time;
load_step_total_time := load_step_end_time - load_step_start_time;
RAISE INFO ‘step % time %’, 4, load_step_total_time ;
RAISE INFO ‘%’, ‘LOAD DONE!!!’;
SELECT timeofday()::timestamp INTO load_run_end_time;
load_total_time := load_run_end_time - load_run_start_time;
RAISE INFO ‘total time %’, load_total_time;
RETURN 0;
EXCEPTION WHEN others THEN
RAISE WARNING ‘%’, sqlerrm;
RAISE WARNING ‘%’, ‘LOAD FAILED!!!’;
SELECT timeofday()::timestamp INTO load_run_end_time;
load_total_time := load_run_end_time - load_run_start_time;
RAISE INFO ‘total time %’, load_total_time;
return 1;
END;
$$ LANGUAGE plpgsql ;
3.2.2 生成入库文件
hadoop fs -cat /path/filename.txt:
111|aaa
222|bbb
333|ccc
444|ddd
3.2.3创建入库表
create table table1
(
col1 bigint,
clo2 char(6)
);
3.2.4 入库测试
情景一:抛弃条数设置为0,装载失败。
select test_xxx(‘hdfs://path’,‘filename.txt’,‘insert’,‘’,‘table1’,‘|’,0);
情景一:抛弃条数设置为1,装载成功。
select test_xxx(‘hdfs://path’,‘filename.txt’,‘insert’,‘’,‘table1’,‘|’,1);
3.2.5 验证入库数据
数据库查询结果:
select * from table1 ;
clo1 | col2
-----±-----
111 | aaa
222 | bbb
333 | ccc
444 | ddd
(4 rows)
2572

被折叠的 条评论
为什么被折叠?



