A Complete Guide to the HDFS Java API
I. Core API Architecture Overview
The HDFS Java API is built around the FileSystem abstract class and provides a complete set of operations for a distributed file system.
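The key classes used throughout this guide are Configuration (cluster settings), FileSystem (the abstract entry point), Path (file and directory names), FSDataInputStream / FSDataOutputStream (read/write streams), and FileStatus (metadata). A minimal orientation sketch, using the same placeholder address hdfs://namenode:8020 as the configuration section below:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://namenode:8020"); // placeholder; FileSystem.get() returns DistributedFileSystem for hdfs:// URIs
try (FileSystem fs = FileSystem.get(conf)) {
    FileStatus root = fs.getFileStatus(new Path("/")); // FileStatus carries size, owner, permissions, etc.
    System.out.println("Root owner: " + root.getOwner());
}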
II. Environment Setup and Initialization
1. Maven Dependency
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>3.3.4</version>
</dependency>
2. Core Configuration Approaches
// Option 1: initialize from configuration files
Configuration conf = new Configuration();
conf.addResource(new Path("/path/to/core-site.xml"));
conf.addResource(new Path("/path/to/hdfs-site.xml"));
// Option 2: set properties directly in code
conf.set("fs.defaultFS", "hdfs://namenode:8020");
conf.set("dfs.replication", "2");
// Obtain a FileSystem instance
FileSystem fs = FileSystem.get(conf);
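FileSystem.get also has an overload that takes an explicit user name, which is handy on unsecured clusters; a minimal sketch, where the URI and the user "hadoop" are placeholder assumptions:
// Obtain a FileSystem as a specific user (throws InterruptedException in addition to IOException)
FileSystem userFs = FileSystem.get(new URI("hdfs://namenode:8020"), conf, "hadoop");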
III. File and Directory Operations
1. Directory Operations
// Create a directory (parent directories are created as needed)
Path dirPath = new Path("/user/data/raw");
boolean success = fs.mkdirs(dirPath);
// Check whether the directory exists
boolean exists = fs.exists(dirPath);
// List directory contents
FileStatus[] statuses = fs.listStatus(new Path("/user"));
for (FileStatus status : statuses) {
    System.out.println(
        (status.isDirectory() ? "DIR" : "FILE") + ": " +
        status.getPath().getName()
    );
}
// Delete a directory (recursively)
boolean recursive = true;
fs.delete(new Path("/tmp/old_data"), recursive);
2. File Operations
// Create a new file
Path filePath = new Path("/data/sample.txt");
FSDataOutputStream out = fs.create(filePath, (short) 2); // set the replication factor
out.close(); // close the stream before renaming (or use try-with-resources)
// Rename a file
Path src = new Path("/data/sample.txt");
Path dst = new Path("/data/renamed.txt");
fs.rename(src, dst);
// Get file metadata
FileStatus fileStatus = fs.getFileStatus(dst);
System.out.println("Size: " + fileStatus.getLen());
System.out.println("Block Size: " + fileStatus.getBlockSize());
System.out.println("Replication: " + fileStatus.getReplication());
// Check whether the path is a file (deprecated; prefer fs.getFileStatus(dst).isFile())
boolean isFile = fs.isFile(dst);
IV. Data Read/Write API
1. Writing Files
Path writePath = new Path("/data/write_demo.txt");
// Overwrite write
try (FSDataOutputStream out = fs.create(writePath)) {
    out.writeUTF("Hello HDFS!\n"); // note: writeUTF prepends a 2-byte length (DataOutput format)
    out.writeBytes("Second line\n");
    out.hflush(); // flush to the DataNodes so new readers can see the data
}
// Append write (enabled by default on modern HDFS; the old dfs.support.append flag is obsolete)
try (FSDataOutputStream out = fs.append(writePath)) {
    out.writeBytes("Appended content\n");
    out.hsync(); // like hflush, but also forces the DataNodes to sync to disk
}
2. Reading Files
Path readPath = new Path("/data/write_demo.txt");
// Sequential read
try (FSDataInputStream in = fs.open(readPath)) {
    byte[] buffer = new byte[1024];
    int bytesRead = in.read(buffer);
    System.out.println(new String(buffer, 0, bytesRead));
}
// Random access
try (FSDataInputStream in = fs.open(readPath)) {
    in.seek(10); // jump to byte offset 10
    System.out.println(in.readByte());
    // Read the whole file into a byte array
    byte[] fullData = new byte[(int) fs.getFileStatus(readPath).getLen()];
    in.readFully(0, fullData); // positioned read of the full contents starting at offset 0
}
3. Efficient Read/Write Techniques
// Use buffering to improve throughput
try (FSDataOutputStream out = fs.create(new Path("/data/buffered"))) {
    BufferedOutputStream bufferedOut = new BufferedOutputStream(out);
    DataOutputStream dataOut = new DataOutputStream(bufferedOut);
    for (int i = 0; i < 10000; i++) {
        dataOut.writeInt(i);
    }
    dataOut.flush();
}
// Use SequenceFile to store key-value pairs
Path seqPath = new Path("/data/seqfile");
SequenceFile.Writer writer = SequenceFile.createWriter(
    conf,
    SequenceFile.Writer.file(seqPath),
    SequenceFile.Writer.keyClass(IntWritable.class),
    SequenceFile.Writer.valueClass(Text.class)
);
writer.append(new IntWritable(1), new Text("Value1"));
writer.append(new IntWritable(2), new Text("Value2"));
writer.close();
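For completeness, the pairs can be read back with SequenceFile.Reader; a minimal sketch against the /data/seqfile path written above:
SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(seqPath));
IntWritable key = new IntWritable();
Text value = new Text();
while (reader.next(key, value)) { // next() fills the reusable key/value objects
    System.out.println(key.get() + " -> " + value);
}
reader.close();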
V. Advanced File Operations
1. Block Operations
// Get the block locations of a file
FileStatus fileStatus = fs.getFileStatus(new Path("/data/largefile"));
BlockLocation[] blocks = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
for (BlockLocation block : blocks) {
    System.out.println("Block offset: " + block.getOffset());
    System.out.println("Length: " + block.getLength());
    System.out.println("Hosts: " + Arrays.toString(block.getHosts()));
}
// Change the replication factor of an existing file
fs.setReplication(new Path("/data/important"), (short) 3);
2. Snapshot Management
// Allow and create a snapshot (allowSnapshot is an admin call exposed on DistributedFileSystem)
Path snapDir = new Path("/user/snapshot_dir");
DistributedFileSystem dfs = (DistributedFileSystem) fs;
dfs.allowSnapshot(snapDir);
fs.createSnapshot(snapDir, "v1");
// Browse the snapshot
Path snapPath = new Path("/user/snapshot_dir/.snapshot/v1");
FileStatus[] snapFiles = fs.listStatus(snapPath);
// Restore a file by copying it out of the read-only snapshot back into the live tree
FileUtil.copy(fs, new Path(snapPath, "file.txt"),
    fs, new Path("/user/restored"),
    false, conf
);
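If a second snapshot is taken later, DistributedFileSystem can also report what changed between the two; a minimal sketch, assuming a snapshot "v2" exists alongside "v1":
SnapshotDiffReport diff = dfs.getSnapshotDiffReport(snapDir, "v1", "v2");
for (SnapshotDiffReport.DiffReportEntry entry : diff.getDiffList()) {
    System.out.println(entry); // modified/created/deleted paths between the snapshots
}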
3. Hadoop Archive (HAR) Operations
// HAR archives are created with the `hadoop archive` tool, not through the Java write API, e.g.:
//   hadoop archive -archiveName 2023.har -p /input /archive
// HarFileSystem is a read-only overlay for browsing and reading an existing archive.
Path harPath = new Path("/archive/2023.har");
HarFileSystem harFs = new HarFileSystem(fs);
// The har:// URI wraps the underlying authority; "hdfs-namenode:8020" is a placeholder
harFs.initialize(new URI("har://hdfs-namenode:8020" + harPath.toUri().getPath()), conf);
// List the contents of the archive
FileStatus[] archived = harFs.listStatus(harPath);
// Read a file stored inside the archive
try (FSDataInputStream in = harFs.open(new Path(harPath, "doc.txt"))) {
    System.out.println(org.apache.commons.io.IOUtils.toString(in, StandardCharsets.UTF_8));
}
VI. Permission and Quota Management
1. Permission Control
Path securePath = new Path("/secure/data");
// Set permissions (rwxr-x---)
FsPermission permission = new FsPermission(FsAction.ALL, FsAction.READ_EXECUTE, FsAction.NONE);
fs.setPermission(securePath, permission);
// Set the owner and group
fs.setOwner(securePath, "admin", "supergroup");
// Inspect permissions
FileStatus status = fs.getFileStatus(securePath);
System.out.println("Owner: " + status.getOwner());
System.out.println("Permission: " + status.getPermission());
2. Quota Management
// Quotas are set through DistributedFileSystem (or hdfs dfsadmin)
// Namespace quota (maximum number of files and directories); leave the space quota untouched
((DistributedFileSystem) fs).setQuota(new Path("/limited"), 1000, HdfsConstants.QUOTA_DONT_SET);
// Space quota (bytes); leave the namespace quota untouched
((DistributedFileSystem) fs).setQuota(new Path("/storage"), HdfsConstants.QUOTA_DONT_SET, 1024L * 1024 * 1024); // 1 GB
// Check quota usage
ContentSummary summary = fs.getContentSummary(new Path("/limited"));
System.out.println("File count: " + summary.getFileCount() + "/" + summary.getQuota());
System.out.println("Space used: " + summary.getSpaceConsumed() + "/" + summary.getSpaceQuota());
VII. Advanced Feature APIs
1. Erasure Coding
Path ecPath = new Path("/erasurecoded");
// Set the erasure coding policy (an HDFS-specific call on DistributedFileSystem)
((DistributedFileSystem) fs).setErasureCodingPolicy(ecPath, "RS-6-3-1024k");
// Write a file (it is erasure coded automatically)
try (FSDataOutputStream out = fs.create(new Path(ecPath, "ecfile"))) {
    out.writeBytes("Data protected by erasure coding");
}
// Verify the EC policy
ErasureCodingPolicy policy = ((DistributedFileSystem) fs).getErasureCodingPolicy(ecPath);
System.out.println("EC Policy: " + policy.getName());
2. Transparent Encryption
Path encryptedPath = new Path("/encrypted_zone");
// Creating an encryption zone requires a running KMS and an existing key ("my_key" here);
// the target directory must exist and be empty. The admin call lives in HdfsAdmin.
fs.mkdirs(encryptedPath);
HdfsAdmin admin = new HdfsAdmin(fs.getUri(), conf);
admin.createEncryptionZone(encryptedPath, "my_key");
// Write a file into the zone (encrypted transparently)
try (FSDataOutputStream out = fs.create(new Path(encryptedPath, "secret.txt"))) {
    out.writeBytes("Sensitive data");
}
// Read it back (decrypted transparently)
try (FSDataInputStream in = fs.open(new Path(encryptedPath, "secret.txt"))) {
    System.out.println(org.apache.commons.io.IOUtils.toString(in, StandardCharsets.UTF_8));
}
VIII. Best Practices and Performance Tuning
1. Connection Management
// Close resources properly (try-with-resources is recommended)
try (FileSystem fs = FileSystem.get(conf)) {
    // all operations
} // the connection is closed automatically
// Avoid creating FileSystem instances repeatedly (it is expensive); note that FileSystem.get()
// returns a cached, shared instance unless fs.hdfs.impl.disable.cache is set
private static FileSystem fs; // keep a single instance, e.g. as a static member
@BeforeClass
public static void setup() throws Exception {
    fs = FileSystem.get(conf);
}
@AfterClass
public static void teardown() throws Exception {
    fs.close();
}
2. Buffer Tuning
// Increase the I/O buffer size (io.file.buffer.size, default 4 KB)
conf.setInt("io.file.buffer.size", 65536); // 64 KB
// Disable the FileSystem instance cache if you need independently configured instances
conf.setBoolean("fs.hdfs.impl.disable.cache", true);
// Read into a direct ByteBuffer to reduce copies (FSDataInputStream implements ByteBufferReadable)
try (FSDataInputStream in = fs.open(path)) {
    ByteBuffer directBuffer = ByteBuffer.allocateDirect(8192);
    in.read(directBuffer);
}
3. Exception Handling Template
try {
    fs.copyFromLocalFile(false, true,
        new Path(localFile),
        new Path(hdfsPath)
    );
} catch (FileAlreadyExistsException e) {
    log.warn("File already exists: " + hdfsPath);
} catch (AccessControlException e) {
    log.error("Permission denied for user: " + user);
} catch (IOException e) {
    log.error("HDFS operation failed", e);
    if (e instanceof RemoteException) {
        RemoteException re = (RemoteException) e;
        log.error("Remote exception class: " + re.getClassName());
    }
} finally {
    IOUtils.closeStream(fs); // org.apache.hadoop.io.IOUtils; only close here if fs is not shared
}
九、API 使用注意事项
-
路径处理:
- 使用
Path
对象而非字符串路径 - 绝对路径以
/
开头
- 使用
-
配置继承:
// 创建新实例时继承配置 FileSystem newFs = FileSystem.newInstance(fs.getUri(), fs.getConf());
-
跨集群访问:
// 访问不同集群 Configuration cluster2Conf = new Configuration(); cluster2Conf.set("fs.defaultFS", "hdfs://cluster2-nn:8020"); FileSystem cluster2Fs = FileSystem.get(cluster2Conf);
-
兼容性处理:
// 处理旧版本Hadoop if (fs instanceof DistributedFileSystem) { DistributedFileSystem dfs = (DistributedFileSystem) fs; // 调用新版特有API }
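A small sketch of the path-handling point above, with placeholder paths:
Path base = new Path("/user/data"); // absolute path, starts with /
Path child = new Path(base, "2023/file.txt"); // resolve a child path against a parent
Path qualified = fs.makeQualified(child); // adds the scheme and authority, e.g. hdfs://namenode:8020/...
System.out.println(qualified);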
X. Utility Classes
1. IOUtils Helpers
// Copy between streams efficiently (org.apache.hadoop.io.IOUtils)
try (InputStream in = fs.open(src);
     OutputStream out = fs.create(dst)) {
    IOUtils.copyBytes(in, out, conf, false); // buffer size comes from io.file.buffer.size (default 4096 bytes)
}
// Read a whole file into a String (this is Apache Commons IO's IOUtils, not Hadoop's)
String content = org.apache.commons.io.IOUtils.toString(
    fs.open(new Path("/data/config.json")),
    StandardCharsets.UTF_8
);
2. FileUtil Helpers
// Merge the files of an HDFS directory into a single file
// (FileUtil.copyMerge existed in Hadoop 2.x but was removed in 3.x, so concatenate manually)
try (FSDataOutputStream merged = fs.create(new Path("/merged"))) {
    for (FileStatus part : fs.listStatus(new Path("/parts"))) {
        try (FSDataInputStream in = fs.open(part.getPath())) {
            IOUtils.copyBytes(in, merged, conf, false);
        }
    }
}
// Copy between the local filesystem and HDFS
FileUtil.copy(
    new File("local.txt"),
    fs, new Path("/remote.txt"),
    false, conf
);
XI. Debugging and Monitoring
1. Logging Configuration
// Enable verbose HDFS client logging (log4j 1.x API)
org.apache.log4j.Logger.getLogger("org.apache.hadoop.hdfs")
    .setLevel(org.apache.log4j.Level.DEBUG);
2. Metrics Monitoring
// Client-side statistics, aggregated per FileSystem scheme
FileSystem.Statistics stats = FileSystem.getStatistics("hdfs", DistributedFileSystem.class);
System.out.println("Bytes written: " + stats.getBytesWritten());
System.out.println("Read operations: " + stats.getReadOps());
// Query the NameNode over JMX (assumes a JMX RMI port has been configured, 9988 here)
JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://localhost:9988/jmxrmi");
JMXConnector connector = JMXConnectorFactory.connect(url);
MBeanServerConnection mbsc = connector.getMBeanServerConnection();
ObjectName name = new ObjectName("Hadoop:service=NameNode,name=NameNodeInfo");
String clusterId = (String) mbsc.getAttribute(name, "ClusterId");
Best-practice recommendations:
- Manage FileSystem instances centrally (shared or pooled) in production
- Use buffering and a progress callback for large file reads and writes (see the sketch below)
- Check file integrity periodically
- Set timeouts for long-running operations:
conf.setLong("dfs.client.socket-timeout", 60000); // 60-second socket timeout
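A minimal sketch of the progress-callback and integrity-check points above; the path is a placeholder:
// Report progress while writing a large file (the Progressable callback fires periodically during the upload)
try (FSDataOutputStream out = fs.create(new Path("/data/big.bin"), () -> System.out.print("."))) {
    // write the data here
}
// Spot-check integrity by comparing HDFS checksums, e.g. before and after a copy
FileChecksum checksum = fs.getFileChecksum(new Path("/data/big.bin"));
System.out.println("Checksum: " + checksum);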