数据迁移时,发现以前的Hive表都是没有压缩的,为了提高效率和节约资源,将Hive表添加压缩。
背景:原集群CDH5.16、Hive1.1,新集群CDH6.0.1、Hive2.1。
基本思路
1. 从其他集群把Hive数据discp到本地集群
2. 先将数据导入Hive, 只改动表名当做临时表
3. 建压缩表
4. 从临时表将数据导入压缩表
目录
原表建表语句与表结构
CREATE TABLE `virtual_payment_cp` (
...
) PARTITIONED BY(`DATE` STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
CREATE TABLE `virtual_payment_cp`(
...
PARTITIONED BY (
`date` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'='\t',
'serialization.format'='\t')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'hdfs://hadoop-master:8020/user/hive/warehouse/cp_data.db/virtual_payment_cp'
TBLPROPERTIES (
'transient_lastDdlTime'='1565767429')
最终表建表语句与表结构
CREATE TABLE `virtual_payment_cp` (
...
) PARTITIONED BY(`DATE` STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS orc tblproperties ("orc.compress"="SNAPPY");
CREATE TABLE `tb_sdk_user_payment_cp`(
...
PARTITIONED BY (
`date` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
WITH SERDEPROPERTIES (
'field.delim'='\t',
'serialization.format'='\t')
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
LOCATION
'hdfs://nameservice2/user/hive/warehouse/cp_data.db/tb_sdk_user_payment_cp'
TBLPROPE