DataX 安装、使用
1、下载安装包
# 创建安装包目录
[bigdata@tsp3dev01 ~]# mkdir -p /software/datax && cd /software/datax
# 下载安装包
[bigdata@tsp3dev01 datax]# wget http://datax-opensource.oss-cn-hangzhou.aliyuncs.com/datax.tar.gz
2、解压到目标目录
[bigdata@tsp3dev01 datax]# sudo tar xzvf /software/datax/datax.tar.gz -C /usr/local/
3、修改权限
# 修改权限
[bigdata@tsp3dev01 local]# sudo chown -R bigdata:bigdata /usr/local/datax
4、测试
1、postgreSQL 读取任务打印到控制台
{
  "job": {
    "setting": {
      "speed": {
        "byte": 1048576
      }
    },
    "content": [
      {
        "reader": {
          "name": "postgresqlreader",
          "parameter": {
            "username": "postgres",
            "password": "postgres",
            "connection": [
              {
                "querySql": [
                  "select id, message, datetime, data, group_id, message_id, project_id, time_spent, platform from sentry_message;"
                ],
                "jdbcUrl": [
                  "jdbc:postgresql://10.6.215.45:5432/postgres"
                ]
              }
            ]
          }
        },
        "writer": {
          "name": "streamwriter",
          "parameter": {
            "print": true,
            "encoding": "UTF-8"
          }
        }
      }
    ]
  }
}
2、postgreSQL 读,HDFS 写 测试
{
  "job": {
    "setting": {
      "speed": {
        "byte": 1048576
      }
    },
    "content": [
      {
        "reader": {
          "name": "postgresqlreader",
          "parameter": {
            "username": "postgres",
            "password": "postgres",
            "connection": [
              {
                "querySql": [
                  "select id, message, datetime, data, group_id, message_id, project_id, time_spent, platform from sentry_message;"
                ],
                "jdbcUrl": [
                  "jdbc:postgresql://10.6.215.45:5432/postgres"
                ]
              }
            ]
          }
        },
        "writer": {
          "name": "hdfswriter",
          "parameter": {
            "defaultFS": "hdfs://tsp3dev01.para.com:8020",
            "fileType": "text",
            "path": "/data/datax/sentry",
            "fileName": "sentry_message",
            "column": [
              {
                "name": "id",
                "type": "bigint"
              },
              {
                "name": "message",
                "type": "string"
              },
              {
                "name": "datetime",
                "type": "string"
              },
              {
                "name": "data",
                "type": "string"
              },
              {
                "name": "group_id",
                "type": "int"
              },
              {
                "name": "message_id",
                "type": "string"
              },
              {
                "name": "project_id",
                "type": "int"
              },
              {
                "name": "time_spent",
                "type": "string"
              },
              {
                "name": "platform",
                "type": "string"
              }
            ],
            "writeMode": "append",
            "fieldDelimiter": "\t"
          }
        }
      }
    ]
  }
}
5、执行任务
[bigdata@tsp3dev01 datax]$ python /usr/local/datax/bin/datax.py /usr/local/datax/job/postgres2hdfs.job
6、查看结果
hdfs 地址:http://10.6.215.39:9870/explorer.html#/data/datax/sentry
7、创建hive表
-- ODS landing table for the sentry_message export produced by the DataX hdfswriter job.
-- The files are tab-delimited text, so fields are mapped to columns by position:
-- the column order here must match the writer's column list
-- (id, message, datetime, data, group_id, message_id, project_id, time_spent, platform).
CREATE TABLE IF NOT EXISTS ods.ods_sentry_message_da (
    id bigint COMMENT '主键id'
    ,message string COMMENT '消息内容'
    ,`datetime` string COMMENT 'datetime'
    ,`data` string COMMENT 'data'
    ,group_id int COMMENT 'group_id'
    ,message_id string COMMENT 'message_id'
    ,project_id int COMMENT 'project_id'
    ,time_spent string COMMENT 'time_spent'
    ,platform string COMMENT 'platform'
)
PARTITIONED BY (pt string COMMENT '日期分区')
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
STORED AS TEXTFILE;
8、同步数据到hive表(也可以地址直接写hive的存储目录)
在hive shell中执行:
-- Print the table definition (storage location and partition column) before loading.
show create table ods.ods_sentry_message_da;
-- Move the DataX output files into one partition of the table.
-- The table is partitioned by `pt`, so the partition spec must name pt; `overwrite` replaces
-- whatever that partition already holds.
load data inpath '/data/datax/sentry/*' overwrite into table ods.ods_sentry_message_da partition ( pt = '2021-06-01');
参考:
- http://datax-opensource.oss-cn-hangzhou.aliyuncs.com/datax.tar.gz
- https://github.com/alibaba/DataX