Background
In our production environment the daily data volume exceeds 100 million records, and the data has to be loaded incrementally into the corresponding Hive partitions. Storing it as plain text takes up quite a lot of space, so when storage is tight the data should be written in the Parquet format instead. Two examples follow: one writes data to a local Parquet file, the other writes data to a Parquet file on HDFS.
Example: writing data to a local Parquet file
package com.htsc;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Types;
import java.io.IOException;
public class WriteParquetFileToLocal {
private static MessageType FILE_SCHEMA;
private static Configuration conf;
public static void main(String[] args) throws IOException {
String localPathFile = "F:\\works\\tmp\\testparquetfile"; //local Parquet file path
Path path = new Path(localPathFile);
//define the table schema
Types.MessageTypeBuilder messageTypeBuilder = Types.buildMessage();
defineTbStru(messageTypeBuilder);
SimpleGroupFactory simpleGroupFactory = new SimpleGroupFactory(FILE_SCHEMA);
conf=new Configuration();
//build a ParquetWriter used to write data into the Parquet file
ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
.withConf(conf)
.withType(FILE_SCHEMA)
.build();
//sample data
String id="1001";
String name="tom";
String address="nanjin";
writeRow(simpleGroupFactory, writer, id, name, address);
//close the writer
closeWriter(writer);
}
/**
 * Define the table schema
 *
 * @param messageTypeBuilder builder used to assemble the message type
 * @return the resulting MessageType
 */
public static MessageType defineTbStru(Types.MessageTypeBuilder messageTypeBuilder) {
FILE_SCHEMA = messageTypeBuilder
.optional(PrimitiveType.PrimitiveTypeName.BINARY)
.named("id")
.optional(PrimitiveType.PrimitiveTypeName.BINARY)
.named("name")
.optional(PrimitiveType.PrimitiveTypeName.BINARY)
.named("address")
.named("testparquet"); //the final named() call on the message builder builds the MessageType itself
return FILE_SCHEMA;
}
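//Note (not in the original code): if these columns are meant to be read as Hive/Spark STRING,
//the BINARY fields are usually annotated as UTF8 when the schema is built, e.g.
//  .optional(PrimitiveType.PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("id")
//(OriginalType is org.apache.parquet.schema.OriginalType; newer Parquet versions use
//LogicalTypeAnnotation.stringType() instead). Without the annotation some engines
//read such a column back as raw binary rather than string.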
/**
 * Write one row to the Parquet file
 *
 * @param simpleGroupFactory factory used to create Group records
 * @param writer Parquet writer
 */
public static void writeRow(SimpleGroupFactory simpleGroupFactory, ParquetWriter<Group> writer, String id, String name, String address) {
Group group = simpleGroupFactory.newGroup();
group.append("id", id);
group.append("name", name);
group.append("address", address);
try {
writer.write(group);
} catch (IOException e) {
e.printStackTrace();
}
}
/**
 * Close the writer and release resources
 *
 * @param writer Parquet writer
 */
public static void closeWriter(ParquetWriter<Group> writer) {
try {
writer.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
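To check that the rows were written correctly, the file can be read back with the same Group-based example API. The snippet below is a minimal sketch, not part of the original code: it reuses the path and conf defined above and additionally needs org.apache.parquet.hadoop.ParquetReader and org.apache.parquet.hadoop.example.GroupReadSupport (ParquetReader.builder(ReadSupport, Path) is marked deprecated in newer Parquet releases but still works).
//read the rows back from the Parquet file written above
ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), path)
.withConf(conf)
.build();
Group row;
while ((row = reader.read()) != null) {
//getString(field, index) decodes the BINARY value as a UTF-8 string
System.out.println(row.getString("id", 0) + "," + row.getString("name", 0) + "," + row.getString("address", 0));
}
reader.close();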
Example: writing data to a Parquet file on HDFS
package com.htsc;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.security.AnnotatedSecurityInfo;
import org.apache.hadoop.security.SecurityUtil;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Types;
import java.io.IOException;
/**
* Write data to a Parquet file on HDFS
*/
public class WriteParquetFileToHdfs {
private static Configuration conf;
private static FileSystem fs;
private static String krb5File = "F:\\works\\config\\cm111\\krb5.conf";
private static String keyPath = "F:\\works\\config\\cm111\\hdfs.keytab";
private static String user = "hdfs/cm111@WMM.COM";
private static String core_path = "F:\\works\\config\\cm111\\core-site.xml";
private static String hdfs_path = "F:\\works\\config\\cm111\\hdfs-site.xml";
private static MessageType FILE_SCHEMA;
//Kerberos authentication configuration
static {
conf = new Configuration();
SecurityUtil.setSecurityInfoProviders(new AnnotatedSecurityInfo());
System.setProperty("java.security.krb5.conf", krb5File);
conf.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
conf.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem");
conf.addResource(new Path(hdfs_path));
conf.addResource(new Path(core_path));
conf.set("hadoop.security.authentication", "kerberos"); //配置认证方式
// conf.set("dfs.client.use.datanode.hostname", "false");
UserGroupInformation.setConfiguration(conf);
try {
UserGroupInformation.loginUserFromKeytab(user, keyPath);
System.out.println("Kerberos认证成功,当前用户为:" + UserGroupInformation.getCurrentUser());
//obtain a FileSystem object for operating on HDFS
fs = FileSystem.get(conf);
} catch (IOException e) {
System.out.println("Kerberos 认证失败");
e.printStackTrace();
}
}
public static void main(String[] args) throws IOException {
//1. define the output path
String hdfsPathFile= "/tmp/testparquetfile";
Path path = new Path(hdfsPathFile);
//2. define the table schema used when writing to the Hive Parquet file
Types.MessageTypeBuilder messageTypeBuilder = Types.buildMessage();
defineTbStru(messageTypeBuilder);
//3. build a ParquetWriter used to write data into the Parquet file
ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
.withConf(conf)
.withType(FILE_SCHEMA)
.build();
SimpleGroupFactory simpleGroupFactory = new SimpleGroupFactory(FILE_SCHEMA);
String id="10001";
String name="tom";
String address="nanjin";
writeHdfsFile(simpleGroupFactory, writer, id, name, address);
//close the writer
closeWriter(writer);
}
/**
 * Define the table schema
 * @param messageTypeBuilder builder used to assemble the message type
 * @return the resulting MessageType
 */
public static MessageType defineTbStru(Types.MessageTypeBuilder messageTypeBuilder) {
messageTypeBuilder
.optional(PrimitiveType.PrimitiveTypeName.BINARY)
.named("id")
.optional(PrimitiveType.PrimitiveTypeName.BINARY)
.named("name")
.optional(PrimitiveType.PrimitiveTypeName.BINARY)
.named("address");
FILE_SCHEMA = messageTypeBuilder.named("testwriteparquet");
return FILE_SCHEMA;
}
/**
 * Write one row to the HDFS Parquet file
 *
 * @param simpleGroupFactory factory used to create Group records
 * @param writer Parquet writer
 */
public static void writeHdfsFile(SimpleGroupFactory simpleGroupFactory, ParquetWriter<Group> writer, String id, String name, String address) {
Group group = simpleGroupFactory.newGroup();
group.append("id", id);
group.append("name", name);
group.append("address", address);
try {
writer.write(group);
} catch (IOException e) {
e.printStackTrace();
}
}
/**
 * Close the writer and release resources
 *
 * @param writer Parquet writer
 */
public static void closeWriter(ParquetWriter<Group> writer) {
try {
writer.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
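Since the motivation is saving storage, it is usually worth enabling compression on the writer as well. The snippet below is a small sketch rather than part of the original code: the Snappy codec, the overwrite mode and the 1000-row loop are only illustrative choices, and it reuses path, conf and FILE_SCHEMA from the example above. It additionally needs org.apache.parquet.hadoop.ParquetFileWriter and org.apache.parquet.hadoop.metadata.CompressionCodecName.
//a writer with Snappy-compressed pages; overwrites the target file if it already exists
ParquetWriter<Group> compressedWriter = ExampleParquetWriter.builder(path)
.withConf(conf)
.withType(FILE_SCHEMA)
.withCompressionCodec(CompressionCodecName.SNAPPY)
.withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
.build();
SimpleGroupFactory factory = new SimpleGroupFactory(FILE_SCHEMA);
for (int i = 0; i < 1000; i++) {
Group group = factory.newGroup();
group.append("id", String.valueOf(i));
group.append("name", "name_" + i);
group.append("address", "addr_" + i);
compressedWriter.write(group);
}
compressedWriter.close();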