A Complete Guide to the HDFS Java API
I. Core API Architecture Overview
The HDFS Java API is built around the FileSystem abstract class and provides a complete set of operations for a distributed file system.
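The key classes used throughout this guide are Configuration (cluster settings), FileSystem (the abstract entry point), Path (file and directory names), FSDataInputStream / FSDataOutputStream (read/write streams), and FileStatus (metadata). A minimal orientation sketch, using the same placeholder address hdfs://namenode:8020 as the configuration section below:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://namenode:8020"); // placeholder; FileSystem.get() returns DistributedFileSystem for hdfs:// URIs
try (FileSystem fs = FileSystem.get(conf)) {
    FileStatus root = fs.getFileStatus(new Path("/")); // FileStatus carries size, owner, permissions, etc.
    System.out.println("Root owner: " + root.getOwner());
}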
II. Environment Setup and Initialization
1. Maven Dependency
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>3.3.4</version>
</dependency>
2. Core Configuration Approaches
// Option 1: initialize from configuration files
Configuration conf = new Configuration();
conf.addResource(new Path("/path/to/core-site.xml"));
conf.addResource(new Path("/path/to/hdfs-site.xml"));
// Option 2: set properties directly in code
conf.set("fs.defaultFS", "hdfs://namenode:8020");
conf.set("dfs.replication", "2");
// Obtain a FileSystem instance
FileSystem fs = FileSystem.get(conf);
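FileSystem.get also has an overload that takes an explicit user name, which is handy on unsecured clusters; a minimal sketch, where the URI and the user "hadoop" are placeholder assumptions:
// Obtain a FileSystem as a specific user (throws InterruptedException in addition to IOException)
FileSystem userFs = FileSystem.get(new URI("hdfs://namenode:8020"), conf, "hadoop");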
III. File and Directory Operations
1. Directory Operations
// Create a directory (parent directories are created as needed)
Path dirPath = new Path("/user/data/raw");
boolean success = fs.mkdirs(dirPath);
// Check whether the directory exists
boolean exists = fs.exists(dirPath);
// List directory contents
FileStatus[] statuses = fs.listStatus(new Path("/user"));
for (FileStatus status : statuses) {
    System.out.println(
        (status.isDirectory() ? "DIR" : "FILE") + ": " +
        status.getPath().getName()
    );
}
// Delete a directory (recursively)
boolean recursive = true;
fs.delete(new Path("/tmp/old_data"), recursive);
2. File Operations
// Create a new file
Path filePath = new Path("/data/sample.txt");
FSDataOutputStream out = fs.create(filePath, (short) 2); // set the replication factor
out.close(); // close the stream before renaming (or use try-with-resources)
// Rename a file
Path src = new Path("/data/sample.txt");
Path dst = new Path("/data/renamed.txt");
fs.rename(src, dst);
// Get file metadata
FileStatus fileStatus = fs.getFileStatus(dst);
System.out.println("Size: " + fileStatus.getLen());
System.out.println("Block Size: " + fileStatus.getBlockSize());
System.out.println("Replication: " + fileStatus.getReplication());
// Check whether the path is a file (deprecated; prefer fs.getFileStatus(dst).isFile())
boolean isFile = fs.isFile(dst);
IV. Data Read/Write API
1. Writing Files
Path writePath = new Path("/data/write_demo.txt");
// Overwrite write
try (FSDataOutputStream out = fs.create(writePath)) {
    out.writeUTF("Hello HDFS!\n"); // note: writeUTF prepends a 2-byte length (DataOutput format)
    out.writeBytes("Second line\n");
    out.hflush(); // flush to the DataNodes so new readers can see the data
}
// Append write (enabled by default on modern HDFS; the old dfs.support.append flag is obsolete)
try (FSDataOutputStream out = fs.append(writePath)) {
    out.writeBytes("Appended content\n");
    out.hsync(); // like hflush, but also forces the DataNodes to sync to disk
}
2. Reading Files
Path readPath = new Path("/data/write_demo.txt");
// Sequential read
try (FSDataInputStream in = fs.open(readPath)) {
    byte[] buffer = new byte[1024];
    int bytesRead = in.read(buffer);
    System.out.println(new String(buffer, 0, bytesRead));
}
// Random access
try (FSDataInputStream in = fs.open(readPath)) {
    in.seek(10); // jump to byte offset 10
    System.out.println(in.readByte());
    // Read the whole file into a byte array
    byte[] fullData = new byte[(int) fs.getFileStatus(readPath).getLen()];
    in.readFully(0, fullData); // positioned read of the full contents starting at offset 0
}
3. Efficient Read/Write Techniques
// Use buffering to improve throughput
try (FSDataOutputStream out = fs.create(new Path("/data/buffered"))) {
    BufferedOutputStream bufferedOut = new BufferedOutputStream(out);
    DataOutputStream dataOut = new DataOutputStream(bufferedOut);
    for (int i = 0; i < 10000; i++) {
        dataOut.writeInt(i);
    }
    dataOut.flush();
}
// Use SequenceFile to store key-value pairs
Path seqPath = new Path("/data/seqfile");
SequenceFile.Writer writer = SequenceFile.createWriter(
    conf,
    SequenceFile.Writer.file(seqPath),
    SequenceFile.Writer.keyClass(IntWritable.class),
    SequenceFile.Writer.valueClass(Text.class)
);
writer.append(new IntWritable(1), new Text("Value1"));
writer.append(new IntWritable(2), new Text("Value2"));
writer.close();
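For completeness, the pairs can be read back with SequenceFile.Reader; a minimal sketch against the /data/seqfile path written above:
SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(seqPath));
IntWritable key = new IntWritable();
Text value = new Text();
while (reader.next(key, value)) { // next() fills the reusable key/value objects
    System.out.println(key.get() + " -> " + value);
}
reader.close();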
V. Advanced File Operations
1. Block Operations
// Get the block locations of a file
FileStatus fileStatus = fs.getFileStatus(new Path("/data/largefile"));
BlockLocation[] blocks = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
for (BlockLocation block : blocks) {
    System.out.println("Block offset: " + block.getOffset());
    System.out.println("Length: " + block.getLength());
    System.out.println("Hosts: " + Arrays.toString(block.getHosts()));
}
// Change the replication factor of an existing file
fs.setReplication(new Path("/data/important"), (short) 3);
2. Snapshot Management
// Allow and create a snapshot (allowSnapshot is an admin call exposed on DistributedFileSystem)
Path snapDir = new Path("/user/snapshot_dir");
DistributedFileSystem dfs = (DistributedFileSystem) fs;
dfs.allowSnapshot(snapDir);
fs.createSnapshot(snapDir, "v1");
// Browse the snapshot
Path snapPath = new Path("/user/snapshot_dir/.snapshot/v1");
FileStatus[] snapFiles = fs.listStatus(snapPath);
// Restore a file by copying it out of the read-only snapshot back into the live tree
FileUtil.copy(fs, new Path(snapPath, "file.txt"),
    fs, new Path("/user/restored"),
    false, conf
);
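If a second snapshot is taken later, DistributedFileSystem can also report what changed between the two; a minimal sketch, assuming a snapshot "v2" exists alongside "v1":
SnapshotDiffReport diff = dfs.getSnapshotDiffReport(snapDir, "v1", "v2");
for (SnapshotDiffReport.DiffReportEntry entry : diff.getDiffList()) {
    System.out.println(entry); // modified/created/deleted paths between the snapshots
}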
3. Hadoop Archive (HAR) Operations
// HAR archives are created with the `hadoop archive` tool, not through the Java write API, e.g.:
//   hadoop archive -archiveName 2023.har -p /input /archive
// HarFileSystem is a read-only overlay for browsing and reading an existing archive.
Path harPath = new Path("/archive/2023.har");
HarFileSystem harFs = new HarFileSystem(fs);
// The har:// URI wraps the underlying authority; "hdfs-namenode:8020" is a placeholder
harFs.initialize(new URI("har://hdfs-namenode:8020" + harPath.toUri().getPath()), conf);
// List the contents of the archive
FileStatus[] archived = harFs.listStatus(harPath);
// Read a file stored inside the archive
try (FSDataInputStream in = harFs.open(new Path(harPath, "doc.txt"))) {
    System.out.println(org.apache.commons.io.IOUtils.toString(in, StandardCharsets.UTF_8));
}
VI. Permission and Quota Management
1. Permission Control
Path securePath = new Path("/secure/data");
// Set permissions (rwxr-x---)
FsPermission permission = new FsPermission(FsAction.ALL, FsAction.READ_EXECUTE, FsAction.NONE);
fs.setPermission(securePath, permission);
// Set the owner and group
fs.setOwner(securePath, "admin", "supergroup");
// Inspect permissions
FileStatus status = fs.getFileStatus(securePath);
System.out.println("Owner: " + status.getOwner());
System.out.println("Permission: " + status.getPermission());
2. Quota Management
// Quotas are set through DistributedFileSystem (or hdfs dfsadmin)
// Namespace quota (maximum number of files and directories); leave the space quota untouched
((DistributedFileSystem) fs).setQuota(new Path("/limited"), 1000, HdfsConstants.QUOTA_DONT_SET);
// Space quota (bytes); leave the namespace quota untouched
((DistributedFileSystem) fs).setQuota(new Path("/storage"), HdfsConstants.QUOTA_DONT_SET, 1024L * 1024 * 1024); // 1 GB
// Check quota usage
ContentSummary summary = fs.getContentSummary(new Path("/limited"));
System.out.println("File count: " + summary.getFileCount() + "/" + summary.getQuota());
System.out.println("Space used: " + summary.getSpaceConsumed() + "/" + summary.getSpaceQuota());
VII. Advanced Feature APIs
1. Erasure Coding
Path ecPath = new Path("/erasurecoded");
// Set the erasure coding policy (an HDFS-specific call on DistributedFileSystem)
((DistributedFileSystem) fs).setErasureCodingPolicy(ecPath, "RS-6-3-1024k");
// Write a file (it is erasure coded automatically)
try (FSDataOutputStream out = fs.create(new Path(ecPath, "ecfile"))) {
    out.writeBytes("Data protected by erasure coding");
}
// Verify the EC policy
ErasureCodingPolicy policy = ((DistributedFileSystem) fs).getErasureCodingPolicy(ecPath);
System.out.println("EC Policy: " + policy.getName());
2. Transparent Encryption
Path encryptedPath = new Path("/encrypted_zone");
// Creating an encryption zone requires a running KMS and an existing key ("my_key" here);
// the target directory must exist and be empty. The admin call lives in HdfsAdmin.
fs.mkdirs(encryptedPath);
HdfsAdmin admin = new HdfsAdmin(fs.getUri(), conf);
admin.createEncryptionZone(encryptedPath, "my_key");
// Write a file into the zone (encrypted transparently)
try (FSDataOutputStream out = fs.create(new Path(encryptedPath, "secret.txt"))) {
    out.writeBytes("Sensitive data");
}
// Read it back (decrypted transparently)
try (FSDataInputStream in = fs.open(new Path(encryptedPath, "secret.txt"))) {
    System.out.println(org.apache.commons.io.IOUtils.toString(in, StandardCharsets.UTF_8));
}
VIII. Best Practices and Performance Tuning
1. Connection Management
// Close resources properly (try-with-resources is recommended)
try (FileSystem fs = FileSystem.get(conf)) {
    // all operations
} // the connection is closed automatically
// Avoid creating FileSystem instances repeatedly (it is expensive); note that FileSystem.get()
// returns a cached, shared instance unless fs.hdfs.impl.disable.cache is set
private static FileSystem fs; // keep a single instance, e.g. as a static member
@BeforeClass
public static void setup() throws Exception {
    fs = FileSystem.get(conf);
}
@AfterClass
public static void teardown() throws Exception {
    fs.close();
}
2. Buffer Tuning
// Increase the I/O buffer size (io.file.buffer.size, default 4 KB)
conf.setInt("io.file.buffer.size", 65536); // 64 KB
// Disable the FileSystem instance cache if you need independently configured instances
conf.setBoolean("fs.hdfs.impl.disable.cache", true);
// Read into a direct ByteBuffer to reduce copies (FSDataInputStream implements ByteBufferReadable)
try (FSDataInputStream in = fs.open(path)) {
    ByteBuffer directBuffer = ByteBuffer.allocateDirect(8192);
    in.read(directBuffer);
}
3. Exception Handling Template
try {
    fs.copyFromLocalFile(false, true,
        new Path(localFile),
        new Path(hdfsPath)
    );
} catch (FileAlreadyExistsException e) {
    log.warn("File already exists: " + hdfsPath);
} catch (AccessControlException e) {
    log.error("Permission denied for user: " + user);
} catch (IOException e) {
    log.error("HDFS operation failed", e);
    if (e instanceof RemoteException) {
        RemoteException re = (RemoteException) e;
        log.error("Remote exception class: " + re.getClassName());
    }
} finally {
    IOUtils.closeStream(fs); // org.apache.hadoop.io.IOUtils; only close here if fs is not shared
}
九、API 使用注意事项
-
路径处理:
- 使用
Path
对象而非字符串路径 - 绝对路径以
/
开头
- 使用
-
配置继承:
// 创建新实例时继承配置 FileSystem newFs = FileSystem.newInstance(fs.getUri(), fs.getConf());
-
跨集群访问:
// 访问不同集群 Configuration cluster2Conf = new Configuration(); cluster2Conf.set("fs.defaultFS", "hdfs://cluster2-nn:8020"); FileSystem cluster2Fs = FileSystem.get(cluster2Conf);
-
兼容性处理:
// 处理旧版本Hadoop if (fs instanceof DistributedFileSystem) { DistributedFileSystem dfs = (DistributedFileSystem) fs; // 调用新版特有API }
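A small sketch of the path-handling point above, with placeholder paths:
Path base = new Path("/user/data"); // absolute path, starts with /
Path child = new Path(base, "2023/file.txt"); // resolve a child path against a parent
Path qualified = fs.makeQualified(child); // adds the scheme and authority, e.g. hdfs://namenode:8020/...
System.out.println(qualified);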
X. Utility Classes
1. IOUtils Helpers
// Copy between streams efficiently (org.apache.hadoop.io.IOUtils)
try (InputStream in = fs.open(src);
     OutputStream out = fs.create(dst)) {
    IOUtils.copyBytes(in, out, conf, false); // buffer size comes from io.file.buffer.size (default 4096 bytes)
}
// Read a whole file into a String (this is Apache Commons IO's IOUtils, not Hadoop's)
String content = org.apache.commons.io.IOUtils.toString(
    fs.open(new Path("/data/config.json")),
    StandardCharsets.UTF_8
);
2. FileUtil Helpers
// Merge the files of an HDFS directory into a single file
// (FileUtil.copyMerge existed in Hadoop 2.x but was removed in 3.x, so concatenate manually)
try (FSDataOutputStream merged = fs.create(new Path("/merged"))) {
    for (FileStatus part : fs.listStatus(new Path("/parts"))) {
        try (FSDataInputStream in = fs.open(part.getPath())) {
            IOUtils.copyBytes(in, merged, conf, false);
        }
    }
}
// Copy between the local filesystem and HDFS
FileUtil.copy(
    new File("local.txt"),
    fs, new Path("/remote.txt"),
    false, conf
);
XI. Debugging and Monitoring
1. Logging Configuration
// Enable verbose HDFS client logging (log4j 1.x API)
org.apache.log4j.Logger.getLogger("org.apache.hadoop.hdfs")
    .setLevel(org.apache.log4j.Level.DEBUG);
2. Metrics Monitoring
// Client-side statistics, aggregated per FileSystem scheme
FileSystem.Statistics stats = FileSystem.getStatistics("hdfs", DistributedFileSystem.class);
System.out.println("Bytes written: " + stats.getBytesWritten());
System.out.println("Read operations: " + stats.getReadOps());
// Query the NameNode over JMX (assumes a JMX RMI port has been configured, 9988 here)
JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://localhost:9988/jmxrmi");
JMXConnector connector = JMXConnectorFactory.connect(url);
MBeanServerConnection mbsc = connector.getMBeanServerConnection();
ObjectName name = new ObjectName("Hadoop:service=NameNode,name=NameNodeInfo");
String clusterId = (String) mbsc.getAttribute(name, "ClusterId");
Best-practice recommendations:
- Manage FileSystem instances centrally (shared or pooled) in production
- Use buffering and a progress callback for large file reads and writes (see the sketch below)
- Check file integrity periodically
- Set timeouts for long-running operations:
conf.setLong("dfs.client.socket-timeout", 60000); // 60-second socket timeout
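A minimal sketch of the progress-callback and integrity-check points above; the path is a placeholder:
// Report progress while writing a large file (the Progressable callback fires periodically during the upload)
try (FSDataOutputStream out = fs.create(new Path("/data/big.bin"), () -> System.out.print("."))) {
    // write the data here
}
// Spot-check integrity by comparing HDFS checksums, e.g. before and after a copy
FileChecksum checksum = fs.getFileChecksum(new Path("/data/big.bin"));
System.out.println("Checksum: " + checksum);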