实时同步抽数ETL---Shell脚本

最新推荐文章于 2025-10-19 11:27:53 发布

原创

最新推荐文章于 2025-10-19 11:27:53 发布 · 1.2k 阅读

12 ·

CC 4.0 BY-SA版权

文章标签：

#sqoop #shell

方式一（推荐）：

sh  /data/dfp/etl/etl.sh  jdbc:postgresql://192.168.0.36:5432/postgres  test  'test'  /user/hive/warehouse/s66_postgres.db/test_miaodazhuang1/dt=20210107  "SELECT CAST(EXTRACT(EPOCH FROM COALESCE(update_time,create_time)::TIMESTAMP WITH TIME ZONE)* 1000 AS int8)||'|9' AS event_timestamp,CAST('INSERT' AS VARCHAR) AS event_type,tt.* FROM test_miaodazhuang1 AS tt WHERE ((create_time >= '2021-01-07 00:00:00' and create_time<= '2021-01-08') OR (update_time >= '2021-01-07 00:00:00' and update_time<= '2021-01-08'))"  create_time  test_miaodazhuang1  root.etl

etl.sh 详细代码如下：

#!/bin/bash
export LNAG=zh_CN.UTF-8

# 补数逻辑，pg => hdfs
# -- --schema <name>

connect=$1
username=$2
password=$3
hdfs_path=$4
query_condition=$5
alias_time=$6
tablename=$7
queue=$8

hdfs_path_tmp=${hdfs_path}_tmp
query_condition="${query_condition}"

echo "query_condition==> ${query_condition}"

sqoop_sql="sqoop import -D  mapred.job.queue.name=${queue}\
    --connect ${connect}?connectTimeout=300  \
    --username ${username}  \
    --password ${password}  \
    --target-dir ${hdfs_path_tmp}   \
    --fields-terminated-by '\t'  \
    --hive-delims-replacement ' '   \
    --null-string '\\\\N'  \
    --null-non-string '\\\\N' \
    --query \"${query_condition} AND \\\$CONDITIONS\" \
    --split-by ${alias_time} \
     "
echo "执行的sqoop脚本==> ${sqoop_sql}"
eval $sqoop_sql

if [ $? -eq 0 ] ;then
    echo "`date "+%Y-%m-%d %H:%M:%S"` ==> 提交抽数任务到 hdfs ${hdfs_path_tmp} 成功"

    echo "==>准备删除_SUCCESS文件"
    rm_sucess_file=" hdfs dfs -rm ${hdfs_path_tmp}/_SUCCESS "
    echo "rm_sucess_file == ${rm_sucess_file}"
    eval $rm_sucess_file
    if [ $? -eq 0 ] ;then
        echo "删除${rm_sucess_file}文件成功"
    else
        echo "删除${rm_sucess_file}文件失败"
    fi

    echo "==>判断${hdfs_path}路径是否存在"
    hdfs dfs -test -e ${hdfs_path}
    if [ $? -eq 0 ] ;then
        echo "${hdfs_path} exist"
    else
        echo "${hdfs_path} is not exist! create it"
        hdfs dfs -mkdir -p ${hdfs_path}
    fi
    
    echo "==>数据从临时路径${hdfs_path_tmp} 移动到${hdfs_path}"
        files=`hadoop fs -ls ${hdfs_path_tmp} | awk -F " " '{print $8}'`
        for file in ${files}
              do        
                        time_stamp=`date +%s`
                        filename=`echo ${file} | awk -F "part-m-" '{print $2}'`
                        mv_sql=" hdfs dfs -mv ${file}