Java Code Examples for Writing Data to Parquet Files

Background

In our production environment the data volume reaches over a hundred million rows per day, and the data must be stored incrementally in the corresponding Hive partitions. Plain-text data takes up a lot of storage, so when storage is tight it pays to store the data in the Parquet format. Below are two examples: one writes data to a local Parquet file, and the other writes data to a Parquet file on HDFS.

Example: Writing Data to a Local Parquet File


package com.htsc;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Types;


import java.io.IOException;


public class WriteParquetFileToLocal {
    private static MessageType FILE_SCHEMA;
    private static Configuration conf;


    public static void main(String[] args) throws IOException {
        String localPathFile = "F:\\works\\tmp\\testparquetfile"; // Parquet file path
        Path path = new Path(localPathFile);
       
        // Define the table schema
        Types.MessageTypeBuilder messageTypeBuilder = Types.buildMessage();
        defineTbStru(messageTypeBuilder);
        SimpleGroupFactory simpleGroupFactory = new SimpleGroupFactory(FILE_SCHEMA);


        conf = new Configuration();
        // Build a ParquetWriter for writing records to the Parquet file
        ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
                .withConf(conf)
                .withType(FILE_SCHEMA)
                .build();


        // Sample data
        String id="1001";
        String name="tom";
        String address="nanjin";
        
        writeParquetFile(simpleGroupFactory, writer, id, name, address);


        // Close the writer to flush buffered data and release resources
        closeWriter(writer);
    }
    
    /**
     * Define the table schema.
     *
     * @param messageTypeBuilder builder used to assemble the message type
     * @return the assembled MessageType
     */
    public static MessageType defineTbStru(Types.MessageTypeBuilder messageTypeBuilder) {
        FILE_SCHEMA = messageTypeBuilder
                .optional(PrimitiveType.PrimitiveTypeName.BINARY)
                .named("id")
                .optional(PrimitiveType.PrimitiveTypeName.BINARY)
                .named("name")
                .optional(PrimitiveType.PrimitiveTypeName.BINARY)
                .named("address")
                .named("testparquet");


        return FILE_SCHEMA;
    }


    /**
     * Write one record to the Parquet file.
     *
     * @param simpleGroupFactory factory that creates Group records matching the schema
     * @param writer             ParquetWriter used to write the record
     */
    public static void writeParquetFile(SimpleGroupFactory simpleGroupFactory, ParquetWriter<Group> writer, String id, String name, String address) {
        Group group = simpleGroupFactory.newGroup();
        group.append("id", id);
        group.append("name", name);
        group.append("address", address);
        try {
            writer.write(group);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }


    /**
     * Close the writer and release resources.
     *
     * @param writer the ParquetWriter to close
     */
    public static void closeWriter(ParquetWriter<Group> writer) {
        try {
            writer.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
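
To verify the output, you can read the file back with the example read path that ships with parquet-hadoop. The following is a minimal sketch, not from the original post: it assumes the same local file path as above and uses GroupReadSupport, the read-side counterpart of ExampleParquetWriter; the class name ReadParquetFileFromLocal is illustrative.

package com.htsc;

import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;

import java.io.IOException;

public class ReadParquetFileFromLocal {
    public static void main(String[] args) throws IOException {
        // Hypothetical: the same path the writer example used
        Path path = new Path("F:\\works\\tmp\\testparquetfile");
        // GroupReadSupport materializes each row as a Group, mirroring the writer side
        ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), path).build();
        Group group;
        while ((group = reader.read()) != null) { // read() returns null at end of file
            System.out.println(group.getString("id", 0) + ","
                    + group.getString("name", 0) + ","
                    + group.getString("address", 0));
        }
        reader.close();
    }
}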

Example: Writing Data to a Parquet File on HDFS


package com.htsc;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.security.AnnotatedSecurityInfo;
import org.apache.hadoop.security.SecurityUtil;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Types;


import java.io.IOException;




/**
 * Write data to a Parquet-format file on HDFS.
 */
public class WriteParquetFileToHdfs {
    private static Configuration conf;
    private static FileSystem fs;
    private static String krb5File = "F:\\works\\config\\cm111\\krb5.conf";
    private static String keyPath = "F:\\works\\config\\cm111\\hdfs.keytab";
    private static String user = "hdfs/cm111@WMM.COM";
    private static String core_path = "F:\\works\\config\\cm111\\core-site.xml";
    private static String hdfs_path = "F:\\works\\config\\cm111\\hdfs-site.xml";


    private static MessageType FILE_SCHEMA;
    // Kerberos authentication setup
    static {
        conf = new Configuration();
        SecurityUtil.setSecurityInfoProviders(new AnnotatedSecurityInfo());
        System.setProperty("java.security.krb5.conf", krb5File);
        conf.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
        conf.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem");
        conf.addResource(new Path(hdfs_path));
        conf.addResource(new Path(core_path));
        conf.set("hadoop.security.authentication", "kerberos"); //配置认证方式
//        conf.set("dfs.client.use.datanode.hostname", "false");
        UserGroupInformation.setConfiguration(conf);
        try {
            UserGroupInformation.loginUserFromKeytab(user, keyPath);
            System.out.println("Kerberos认证成功,当前用户为:" + UserGroupInformation.getCurrentUser());
            //获取FileSystem 对象用于操作HDFS
            fs = FileSystem.get(conf);
        } catch (IOException e) {
            System.out.println("Kerberos 认证失败");
            e.printStackTrace();
        }
    }
    public static void main(String[] args) throws IOException {
        // 1. Define the target HDFS path
        String hdfsPathFile = "/tmp/testparquetfile";
        Path path = new Path(hdfsPathFile);
        // 2. Define the table schema used when writing to the Hive Parquet file
        Types.MessageTypeBuilder messageTypeBuilder = Types.buildMessage();
        defineTbStru(messageTypeBuilder);
        // 3. Build a ParquetWriter for writing records to the Parquet file
        ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
                .withConf(conf)
                .withType(FILE_SCHEMA)
                .build();
        SimpleGroupFactory simpleGroupFactory = new SimpleGroupFactory(FILE_SCHEMA);
        String id="10001";
        String name="tom";
        String address="nanjin";
        
        writeHdfsFile(simpleGroupFactory, writer, id,name,address);
        // Close the writer to flush buffered data and release resources
        closeWriter(writer);
    }


    /**
     * Define the table schema.
     *
     * @param messageTypeBuilder builder used to assemble the message type
     * @return the assembled MessageType
     */
    public static MessageType defineTbStru(Types.MessageTypeBuilder messageTypeBuilder) {
        messageTypeBuilder
                .optional(PrimitiveType.PrimitiveTypeName.BINARY)
                .named("id")
                .optional(PrimitiveType.PrimitiveTypeName.BINARY)
                .named("name")
                .optional(PrimitiveType.PrimitiveTypeName.BINARY)
                .named("address");
        FILE_SCHEMA = messageTypeBuilder.named("testwriteparquet");


        return FILE_SCHEMA;
    }
    /**
     * Write one record to the Parquet file on HDFS.
     *
     * @param simpleGroupFactory factory that creates Group records matching the schema
     * @param writer             ParquetWriter used to write the record
     */
    public static void writeHdfsFile(SimpleGroupFactory simpleGroupFactory, ParquetWriter<Group> writer, String id, String name, String address) {
        Group group = simpleGroupFactory.newGroup();
        group.append("id", id);
        group.append("name", name);
        group.append("address", address);
        try {
            writer.write(group);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Close the writer and release resources.
     *
     * @param writer the ParquetWriter to close
     */
    public static void closeWriter(ParquetWriter<Group> writer) {
        try {
            writer.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
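
At the daily volumes mentioned in the background, the default writer settings are worth reviewing. ParquetWriter's builder exposes compression codec, row-group size, page size, and write mode. Below is a hedged sketch of the builder call: the codec choice and sizes are illustrative assumptions, not settings from the original post, and it needs two extra imports (org.apache.parquet.hadoop.metadata.CompressionCodecName and org.apache.parquet.hadoop.ParquetFileWriter).

        // Sketch: tuning the writer; the codec and sizes below are illustrative assumptions
        ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
                .withConf(conf)
                .withType(FILE_SCHEMA)
                .withCompressionCodec(CompressionCodecName.SNAPPY) // compress pages with Snappy
                .withRowGroupSize(128 * 1024 * 1024)               // 128 MB row groups
                .withPageSize(1024 * 1024)                         // 1 MB data pages
                .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)   // replace the file if it exists
                .build();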

Welcome to follow the WeChat official account: Wbigdata
