DataX plugin development: HdfsReader support for parquet

This article describes how to modify the HdfsReader module of the open-source Alibaba DataX so that it can read the parquet format commonly used in HIVE. The changes include adding the parquet dependencies to the pom file, extending the Constant class and the HdfsFileType enum, and adding a parquet reading method to DFSUtil, which solves the problem that DataX does not support the parquet format.


Data warehouses built on HIVE usually store data in parquet format, but the open-source version of Alibaba DataX does not support parquet. Most of the write-ups I found online were incomplete, so this post records a complete working version for reference.

Steps

1. Pull the DataX source code from Gitee and modify the hdfsreader module; the main changes are in the classes below.

Add the parquet dependencies to the hdfsreader pom:

        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-avro</artifactId>
            <version>1.12.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-common</artifactId>
            <version>1.12.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-protobuf</artifactId>
            <version>1.12.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-hadoop</artifactId>
            <version>1.12.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.1</version>
        </dependency>

Constant, with the PARQUET constant added:

public class Constant {
    public static final String SOURCE_FILES = "sourceFiles";
    public static final String TEXT = "TEXT";
    public static final String ORC = "ORC";
    public static final String CSV = "CSV";
    public static final String SEQ = "SEQ";
    public static final String RC = "RC";
    public static final String PARQUET = "PARQUET";

}

HdfsFileType, with PARQUET added to the enum:

public enum HdfsFileType {
    ORC, SEQ, RC, CSV, TEXT, PARQUET,
}

Add a parquet reading method to DFSUtil, adapted from the existing ORC reading method:

    public void parquetFileStartRead(String sourceParquetFilePath, Configuration readerSliceConfig,
                                     RecordSender recordSender, TaskPluginCollector taskPluginCollector) {
        LOG.info(String.format("Start Read parquetfile [%s].", sourceParquetFilePath));
        List<ColumnEntry> column = UnstructuredStorageReaderUtil
                .getListColumnEntry(readerSliceConfig, com.alibaba.datax.plugin.unstructuredstorage.reader.Key.COLUMN);
        String nullFormat = readerSliceConfig.getString(NULL_FORMAT);
        boolean isReadAllColumns = false;
        Path parquetFilePath = new Path(sourceParquetFilePath);
        try {
            GroupReadSupport readSupport = new GroupReadSupport();
            ParquetReader.Builder<Group> reader = ParquetReader.builder(readSupport, parquetFilePath);
            ParquetReader<Group> build = reader.build();
            // read the first record up front so the schema (field list) is available
            Group line = build.read();
            List<org.apache.parquet.schema.Type> typeList = line.getType().getFields();
            int size = typeList.size();
            List<Object> recordFields;
            // the first read() above already consumed record 0; the flag k makes sure
            // that record is emitted instead of being skipped by the loop condition
            int k = 0;
            while (k == 0 || ((line = build.read()) != null)) {
                k++;
                recordFields = new ArrayList<Object>();
                for (int i = 0; i < size; i++) {
                    // handle INT96 (Hive timestamps) and INT32 (assumed here to be dates stored
                    // as days since epoch) specially; everything else is read as a string
                    String schemaType = typeList.get(i).asPrimitiveType().getPrimitiveTypeName().name();
                    if (schemaType.equalsIgnoreCase("int96")) {
                        Binary bin = line.getInt96(typeList.get(i).getName(), 0);
                        if (bin != null) {
                            Long longTime = ParquetTimestampUtils.getTimestampMillis(bin);
                            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                            recordFields.add(sdf.format(longTime));
                        } else {
                            // keep the column positions aligned when the value is missing
                            recordFields.add("");
                        }
                    } else if (schemaType.equalsIgnoreCase("int32")) {
                        Integer timeDay = line.getInteger(typeList.get(i).getName(), 0);
                        Long time = timeDay * 24 * 60 * 60 * 1000L;
                        SimpleDateFormat sdf1 = new SimpleDateFormat("yyyy-MM-dd");
                        recordFields.add(sdf1.format(time));
                    } else {
                        try {
                            recordFields.add(line.getValueToString(i, 0));
                        } catch (Exception e) {
                            // null or otherwise unreadable value: fall back to an empty string
                            recordFields.add("");
                        }
                    }
                }
                transportOneRecord(column, recordFields, recordSender,
                        taskPluginCollector, isReadAllColumns, nullFormat);
            }
            build.close();
        } catch (Exception e) {
            String message = String.format("Exception while reading data from parquet file [%s]: [%s]. Please contact the system administrator."
                    , sourceParquetFilePath, e);
            LOG.error(message);
            throw DataXException.asDataXException(HdfsReaderErrorCode.READ_FILE_ERROR, message);
        }
    }



    // probe whether a file is a parquet file by trying to read one record from it
    private static boolean isParquetFile(Path file) {
        try {
            GroupReadSupport readSupport = new GroupReadSupport();
            ParquetReader.Builder<Group> reader = ParquetReader.builder(readSupport, file);
            ParquetReader<Group> build = reader.build();
            if (build.read() != null) {
                return true;
            }
        } catch (IOException e) {
            // not readable as parquet; fall through and report false
        }
        return false;
    }
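Where this probe plugs in depends on how the surrounding DFSUtil code decides file types. As a minimal, hypothetical sketch (the method name resolveFileType is made up for illustration; in the open-source DFSUtil the equivalent decision lives in its existing file-type check), the idea is simply to map a configured fileType of PARQUET onto the new enum value when the probe succeeds:

    // Illustrative sketch only: resolveFileType is a hypothetical name, not part of
    // the stock DFSUtil. It shows where isParquetFile(...) fits into type detection.
    private HdfsFileType resolveFileType(Path file, String specifiedFileType) {
        if (Constant.PARQUET.equalsIgnoreCase(specifiedFileType) && isParquetFile(file)) {
            return HdfsFileType.PARQUET;
        }
        // other types (ORC/SEQ/RC/CSV/TEXT) keep going through the existing detection logic
        return HdfsFileType.valueOf(specifiedFileType.toUpperCase());
    }

The INT96 handling in parquetFileStartRead relies on a small helper class, ParquetTimestampUtils, shown in full below; it converts parquet's 12-byte Julian-day-plus-nanos encoding into epoch milliseconds.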

package com.alibaba.datax.plugin.reader.hdfsreader;

import com.google.common.primitives.Ints;
import com.google.common.primitives.Longs;
import org.apache.parquet.io.api.Binary;

import java.util.concurrent.TimeUnit;


public class ParquetTimestampUtils {
    private static final int JULIAN_EPOCH_OFFSET_DAYS = 2440588;
    private static final long MILLIS_IN_DAY = TimeUnit.DAYS.toMillis(1);
    private static final long NANOS_PER_MILLISECOND = TimeUnit.MILLISECONDS.toNanos(1);

    private ParquetTimestampUtils() {}

    /**
     * Returns GMT timestamp from binary encoded parquet timestamp (12 bytes - julian date + time of day nanos).
     *
     * @param timestampBinary INT96 parquet timestamp
     * @return timestamp in millis, GMT timezone
     */
    public static long getTimestampMillis(Binary timestampBinary)
    {
        if (timestampBinary.length() != 12) {
            return 0;
//            throw new PrestoException(HIVE_BAD_DATA, "Parquet timestamp must be 12 bytes, actual " + timestampBinary.length());
        }
        byte[] bytes = timestampBinary.getBytes();

        // little endian encoding - need to invert byte order
        long timeOfDayNanos = Longs.fromBytes(bytes[7], bytes[6], bytes[5], bytes[4], bytes[3], bytes[2], bytes[1], bytes[0]);
        int julianDay = Ints.fromBytes(bytes[11], bytes[10], bytes[9], bytes[8]);

        return julianDayToMillis(julianDay) + (timeOfDayNanos / NANOS_PER_MILLISECOND);
    }

    private static long julianDayToMillis(int julianDay)
    {
        return (julianDay - JULIAN_EPOCH_OFFSET_DAYS) * MILLIS_IN_DAY;
    }
}
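To sanity-check the byte layout, here is a small standalone usage example (not part of the plugin; the class name and values are only for illustration). An INT96 value whose last four little-endian bytes encode Julian day 2440589 and whose first eight bytes encode zero nanoseconds of day should decode to 86400000 ms, i.e. 1970-01-02 00:00:00 GMT, since Julian day 2440588 is the epoch:

import com.alibaba.datax.plugin.reader.hdfsreader.ParquetTimestampUtils;
import org.apache.parquet.io.api.Binary;

// Standalone illustration of ParquetTimestampUtils.getTimestampMillis; not part of the plugin.
public class ParquetTimestampUtilsExample {
    public static void main(String[] args) {
        // INT96 layout: bytes 0-7 = nanos of day (little endian), bytes 8-11 = Julian day (little endian)
        byte[] int96 = new byte[12];
        int julianDay = 2440589; // 1970-01-02 (Julian day 2440588 = 1970-01-01)
        int96[8] = (byte) (julianDay & 0xFF);
        int96[9] = (byte) ((julianDay >> 8) & 0xFF);
        int96[10] = (byte) ((julianDay >> 16) & 0xFF);
        int96[11] = (byte) ((julianDay >> 24) & 0xFF);
        // nanos-of-day bytes are left at 0, i.e. midnight

        long millis = ParquetTimestampUtils.getTimestampMillis(Binary.fromConstantByteArray(int96));
        System.out.println(millis); // prints 86400000
    }
}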


Finally, wire the new type into HdfsReader itself: files whose configured fileType is PARQUET need to be dispatched to the new parquetFileStartRead method, as shown in the sketch below.
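A minimal sketch of that change follows. It assumes the stock structure of HdfsReader.Task.startRead, where each source file is dispatched on the configured fileType to the matching DFSUtil method; field and method names may differ slightly between DataX versions, and the existing TEXT/CSV/SEQ/RC branches are abbreviated:

    @Override
    public void startRead(RecordSender recordSender) {
        for (String sourceFile : this.sourceFiles) {
            if (specifiedFileType.equalsIgnoreCase(Constant.ORC)) {
                dfsUtil.orcFileStartRead(sourceFile, this.taskConfig,
                        recordSender, this.getTaskPluginCollector());
            } else if (specifiedFileType.equalsIgnoreCase(Constant.PARQUET)) {
                // new branch: route parquet files to the method added in DFSUtil
                dfsUtil.parquetFileStartRead(sourceFile, this.taskConfig,
                        recordSender, this.getTaskPluginCollector());
            } else {
                // TEXT / CSV / SEQ / RC handling from the original reader, unchanged (omitted here)
            }
        }
        recordSender.flush();
    }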

Repackage the hdfsreader module and replace the corresponding plugin package in your DataX deployment; that is all that is needed.
