Hadoop Ecosystem: A Complete Guide to the HDFS Java API

I. Core API Architecture Overview

The HDFS Java API is built around the abstract FileSystem class, which provides the full set of distributed file system operations:

«abstract» FileSystem
    +open(Path) : FSDataInputStream
    +create(Path) : FSDataOutputStream
    +mkdirs(Path) : boolean
    +rename(Path, Path) : boolean
    +delete(Path, boolean) : boolean
    +listStatus(Path) : FileStatus[]
    +getFileStatus(Path) : FileStatus
    +setPermission(Path, FsPermission) : void
    +setReplication(Path, short) : boolean

FSDataInputStream
    +read() : int
    +seek(long) : void
    +getPos() : long
    +readFully(long, byte[]) : void

FSDataOutputStream
    +write(byte[], int, int) : void
    +hflush() : void
    +hsync() : void
    +getPos() : long

FileStatus
    +getPath() : Path
    +getLen() : long
    +getModificationTime() : long
    +getReplication() : short
    +getBlockSize() : long
    +getPermission() : FsPermission

HDFS-specific implementations: DistributedFileSystem, HdfsDataInputStream, HdfsDataOutputStream
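
When fs.defaultFS points at an hdfs:// URI, FileSystem.get returns the HDFS implementation; a quick sanity check (assuming a configured Configuration named conf):

FileSystem fs = FileSystem.get(conf);
// Prints org.apache.hadoop.hdfs.DistributedFileSystem for an hdfs:// default FS
System.out.println(fs.getClass().getName());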

II. Environment Setup and Initialization

1. Maven Dependency

<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>3.3.4</version>
</dependency>

2. Core Configuration Options

// Option 1: initialize from configuration files
Configuration conf = new Configuration();
conf.addResource(new Path("/path/to/core-site.xml"));
conf.addResource(new Path("/path/to/hdfs-site.xml"));

// Option 2: set properties directly in code
conf.set("fs.defaultFS", "hdfs://namenode:8020");
conf.set("dfs.replication", "2");

// Obtain a FileSystem instance
FileSystem fs = FileSystem.get(conf);
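
When the client process runs as a different OS user than the intended HDFS user, there is an overload of FileSystem.get that takes the remote user name (simple authentication only; Kerberos-secured clusters authenticate via UserGroupInformation instead):

// Connect to HDFS as a specific user ("hdfs_user" is a placeholder)
FileSystem fsAsUser = FileSystem.get(
    new URI("hdfs://namenode:8020"), conf, "hdfs_user");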

III. File and Directory Operations

1. Directory Operations

// Create a directory (parent directories are created recursively)
Path dirPath = new Path("/user/data/raw");
boolean success = fs.mkdirs(dirPath);

// Check whether the directory exists
boolean exists = fs.exists(dirPath);

// List directory contents
FileStatus[] statuses = fs.listStatus(new Path("/user"));
for (FileStatus status : statuses) {
    System.out.println(
        (status.isDirectory() ? "DIR" : "FILE") + ": " + 
        status.getPath().getName()
    );
}

// Delete a directory (recursively)
boolean recursive = true;
fs.delete(new Path("/tmp/old_data"), recursive);
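
listStatus returns a single directory level; for deep trees, listFiles walks files recursively through a RemoteIterator without loading the whole listing into memory:

// Recursively iterate over every file under /user
RemoteIterator<LocatedFileStatus> it = fs.listFiles(new Path("/user"), true);
while (it.hasNext()) {
    LocatedFileStatus f = it.next();
    System.out.println(f.getPath() + " (" + f.getLen() + " bytes)");
}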

2. File Operations

// Create a new file with an explicit replication factor
Path filePath = new Path("/data/sample.txt");
FSDataOutputStream out = fs.create(filePath, (short) 2);
out.close(); // always close the stream, or the file stays open for write

// Rename a file
Path src = new Path("/data/sample.txt");
Path dst = new Path("/data/renamed.txt");
fs.rename(src, dst);

// Get file metadata
FileStatus fileStatus = fs.getFileStatus(dst);
System.out.println("Size: " + fileStatus.getLen());
System.out.println("Block Size: " + fileStatus.getBlockSize());
System.out.println("Replication: " + fileStatus.getReplication());

// Check whether the path is a file
// (FileSystem.isFile is deprecated; prefer FileStatus.isFile)
boolean isFile = fileStatus.isFile();
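
Moving data between the local file system and HDFS is usually done with the built-in copy helpers rather than hand-written streams (the local paths below are placeholders):

// Upload a local file (delSrc=false, overwrite=true)
fs.copyFromLocalFile(false, true,
    new Path("file:///tmp/local.txt"), new Path("/data/uploaded.txt"));

// Download back to the local file system
fs.copyToLocalFile(new Path("/data/uploaded.txt"), new Path("file:///tmp/copy.txt"));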

IV. Data Read/Write APIs

1. Writing Files

Path writePath = new Path("/data/write_demo.txt");

// Overwrite write
try (FSDataOutputStream out = fs.create(writePath)) {
    out.writeUTF("Hello HDFS!\n");
    out.writeBytes("Second line\n");
    out.hflush(); // flush buffered data out to the DataNodes
}

// Append write (supported by default since Hadoop 2.x; only very old
// versions required dfs.support.append=true)
try (FSDataOutputStream out = fs.append(writePath)) {
    out.writeBytes("Appended content\n");
    out.hsync(); // force data to disk on the DataNodes
}

2. Reading Files

Path readPath = new Path("/data/write_demo.txt");

// Sequential read
try (FSDataInputStream in = fs.open(readPath)) {
    byte[] buffer = new byte[1024];
    int bytesRead = in.read(buffer); // may read fewer bytes than requested
    System.out.println(new String(buffer, 0, bytesRead));
}

// Random access
try (FSDataInputStream in = fs.open(readPath)) {
    in.seek(10); // jump to byte offset 10
    System.out.println(in.readByte());
    
    // Read the whole file into a byte array
    byte[] fullData = new byte[(int) fs.getFileStatus(readPath).getLen()];
    in.readFully(0, fullData); // positioned read of the full contents from offset 0
}
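
Both readFully(long, byte[]) and read(long, byte[], int, int) are positioned reads: they do not move the stream's current offset, which makes them convenient for reading several regions of one open file:

// Positioned read: 100 bytes starting at offset 4096, without seeking
try (FSDataInputStream in = fs.open(readPath)) {
    byte[] chunk = new byte[100];
    int n = in.read(4096L, chunk, 0, 100); // returns bytes actually read, -1 at EOF
}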

3. Efficient Read/Write Techniques

// Use buffering to improve throughput for many small writes
try (FSDataOutputStream out = fs.create(new Path("/data/buffered"))) {
    BufferedOutputStream bufferedOut = new BufferedOutputStream(out);
    DataOutputStream dataOut = new DataOutputStream(bufferedOut);
    for (int i = 0; i < 10000; i++) {
        dataOut.writeInt(i);
    }
    dataOut.flush(); // drain the buffer before the underlying stream closes
}

// Use SequenceFile to store key-value pairs
Path seqPath = new Path("/data/seqfile");
SequenceFile.Writer writer = SequenceFile.createWriter(
    conf, 
    SequenceFile.Writer.file(seqPath),
    SequenceFile.Writer.keyClass(IntWritable.class),
    SequenceFile.Writer.valueClass(Text.class)
);

writer.append(new IntWritable(1), new Text("Value1"));
writer.append(new IntWritable(2), new Text("Value2"));
writer.close();
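
Reading the pairs back uses SequenceFile.Reader with the same Writable types:

// Iterate over the key-value pairs written above
try (SequenceFile.Reader reader = new SequenceFile.Reader(
        conf, SequenceFile.Reader.file(seqPath))) {
    IntWritable key = new IntWritable();
    Text value = new Text();
    while (reader.next(key, value)) {
        System.out.println(key.get() + " -> " + value);
    }
}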

V. Advanced File Operations

1. Block Operations

// Get the block locations of a file
FileStatus fileStatus = fs.getFileStatus(new Path("/data/largefile"));
BlockLocation[] blocks = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());

for (BlockLocation block : blocks) {
    System.out.println("Block offset: " + block.getOffset());
    System.out.println("Length: " + block.getLength());
    System.out.println("Hosts: " + Arrays.toString(block.getHosts()));
}

// Change the replication factor
fs.setReplication(new Path("/data/important"), (short) 3);

2. Snapshot Management

// Enable snapshots on a directory (allowSnapshot is an admin operation
// exposed on DistributedFileSystem), then create one
Path snapDir = new Path("/user/snapshot_dir");
((DistributedFileSystem) fs).allowSnapshot(snapDir);
fs.createSnapshot(snapDir, "v1");

// Browse a snapshot
Path snapPath = new Path("/user/snapshot_dir/.snapshot/v1");
FileStatus[] snapFiles = fs.listStatus(snapPath);

// Restore a file by copying it out of the snapshot; both paths are in HDFS,
// so use FileUtil.copy, not copyFromLocalFile
FileUtil.copy(fs, new Path(snapPath, "file.txt"),
    fs, new Path("/user/restored"),
    false, conf);
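
Snapshots are renamed and deleted through the same FileSystem API:

// Housekeeping: rename, then remove the snapshot when no longer needed
fs.renameSnapshot(snapDir, "v1", "v1-archived");
fs.deleteSnapshot(snapDir, "v1-archived");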

3. Hadoop Archive (HAR) Operations

// HAR archives are created with the command-line tool, not the Java API:
//   hadoop archive -archiveName 2023.har -p /input /archive
// HarFileSystem is read-only: create() and append() are not supported.

// Read a file from an existing archive; har:// paths embed the archive location
Path harPath = new Path("har:///archive/2023.har");
FileSystem harFs = harPath.getFileSystem(conf); // resolves to HarFileSystem

try (FSDataInputStream in = harFs.open(new Path(harPath, "doc.txt"))) {
    System.out.println(IOUtils.toString(in, StandardCharsets.UTF_8)); // commons-io IOUtils
}

VI. Permissions and Quota Management

1. Access Control

Path securePath = new Path("/secure/data");

// Set permissions (rwxr-x---)
FsPermission permission = new FsPermission(FsAction.ALL, FsAction.READ_EXECUTE, FsAction.NONE);
fs.setPermission(securePath, permission);

// Set owner and group (requires superuser privileges)
fs.setOwner(securePath, "admin", "supergroup");

// Inspect permissions
FileStatus status = fs.getFileStatus(securePath);
System.out.println("Owner: " + status.getOwner());
System.out.println("Permission: " + status.getPermission());

2. Quotas

// Quotas are an HDFS-specific feature exposed on DistributedFileSystem;
// both limits are set through setQuota(path, namespaceQuota, spaceQuota)
DistributedFileSystem dfs = (DistributedFileSystem) fs;

// Name quota: limit the number of files and directories under /limited
dfs.setQuota(new Path("/limited"), 1000, HdfsConstants.QUOTA_DONT_SET);

// Space quota: limit the raw bytes consumed under /storage (1 GB)
dfs.setQuota(new Path("/storage"), HdfsConstants.QUOTA_DONT_SET, 1024L * 1024 * 1024);

// Inspect quota usage
ContentSummary summary = fs.getContentSummary(new Path("/limited"));
System.out.println("File count: " + summary.getFileCount() + "/" + summary.getQuota());
System.out.println("Space used: " + summary.getSpaceConsumed() + "/" + summary.getSpaceQuota());

VII. Advanced Feature APIs

1. Erasure Coding

Path ecPath = new Path("/erasurecoded");
DistributedFileSystem dfs = (DistributedFileSystem) fs; // EC APIs live on the HDFS implementation

// Set the erasure coding policy on a directory
dfs.setErasureCodingPolicy(ecPath, "RS-6-3-1024k");

// Write a file (erasure coding is applied automatically)
try (FSDataOutputStream out = fs.create(new Path(ecPath, "ecfile"))) {
    out.writeBytes("Data protected by erasure coding");
}

// Verify the effective EC policy
ErasureCodingPolicy policy = dfs.getErasureCodingPolicy(ecPath);
System.out.println("EC Policy: " + policy.getName());

2. Transparent Encryption

Path encryptedPath = new Path("/encrypted_zone");

// Create an encryption zone; this is an admin operation on HdfsAdmin,
// and the key "my_key" must already exist in the configured KMS
HdfsAdmin admin = new HdfsAdmin(fs.getUri(), conf);
admin.createEncryptionZone(encryptedPath, "my_key");

// Write an encrypted file
try (FSDataOutputStream out = fs.create(new Path(encryptedPath, "secret.txt"))) {
    out.writeBytes("Sensitive data");
}

// Read it back (decryption is transparent)
try (FSDataInputStream in = fs.open(new Path(encryptedPath, "secret.txt"))) {
    System.out.println(IOUtils.toString(in, StandardCharsets.UTF_8)); // commons-io IOUtils
}

VIII. Best Practices and Performance Tuning

1. Connection Management

// Close resources properly (try-with-resources is recommended)
try (FileSystem fs = FileSystem.get(conf)) {
    // all operations here
} // the connection is closed automatically

// Avoid creating FileSystem instances repeatedly (construction is expensive);
// share one instance instead, e.g. in a test fixture:
private static FileSystem fs;

@BeforeClass
public static void setup() throws Exception {
    fs = FileSystem.get(conf);
}

@AfterClass
public static void teardown() throws Exception {
    fs.close();
}

2. Buffer Tuning

// Increase the I/O buffer size (default 4096 bytes)
conf.setInt("io.file.buffer.size", 65536); // 64KB

// Read into a direct ByteBuffer; FSDataInputStream supports this via
// ByteBufferReadable when the underlying stream does. Note that
// fs.hdfs.impl.disable.cache controls FileSystem instance caching, not
// buffering; leave it at the default unless you need independent instances.
try (FSDataInputStream in = fs.open(path)) {
    ByteBuffer directBuffer = ByteBuffer.allocateDirect(8192);
    in.read(directBuffer);
}

3. Exception Handling Template

try {
    fs.copyFromLocalFile(false, true, 
        new Path(localFile), 
        new Path(hdfsPath)
    );
} catch (FileAlreadyExistsException e) {
    log.warn("File already exists: " + hdfsPath);
} catch (AccessControlException e) {
    log.error("Permission denied for user: " + user);
} catch (RemoteException e) {
    // RemoteException extends IOException, so catch it first
    log.error("Remote error from " + e.getClassName(), e);
} catch (IOException e) {
    log.error("HDFS operation failed", e);
} finally {
    IOUtils.closeStream(fs); // only if this code owns the FileSystem instance
}

IX. API Usage Notes

  1. Path handling

    • Use Path objects rather than raw string paths
    • Absolute paths start with /
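
    A quick illustration of how the two forms resolve:

    // Relative paths resolve against the working directory
    // (the user's HDFS home directory by default)
    Path abs = new Path("/user/data/file.txt");
    Path rel = new Path("data/file.txt"); // resolves under /user/<current-user>/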
  2. Configuration inheritance

    // Inherit the configuration when creating a new instance
    FileSystem newFs = FileSystem.newInstance(fs.getUri(), fs.getConf());
    
  3. Cross-cluster access

    // Talk to a different cluster
    Configuration cluster2Conf = new Configuration();
    cluster2Conf.set("fs.defaultFS", "hdfs://cluster2-nn:8020");
    FileSystem cluster2Fs = FileSystem.get(cluster2Conf);
    
  4. Compatibility handling

    // Guard HDFS-specific APIs behind an instanceof check
    if (fs instanceof DistributedFileSystem) {
        DistributedFileSystem dfs = (DistributedFileSystem) fs;
        // call HDFS-specific APIs here
    }

X. Utility Classes

1. IOUtils

// Copy between streams efficiently; the buffer size comes from
// io.file.buffer.size in conf (org.apache.hadoop.io.IOUtils)
try (InputStream in = fs.open(src);
     OutputStream out = fs.create(dst)) {
    IOUtils.copyBytes(in, out, conf, false);
}

// Read a whole file into a String (org.apache.commons.io.IOUtils;
// close the stream yourself in real code)
String content = IOUtils.toString(
    fs.open(new Path("/data/config.json")), 
    StandardCharsets.UTF_8
);

2. FileUtil

// FileUtil.copyMerge existed up to Hadoop 2.x but was removed in 3.x;
// merge part files manually instead
try (FSDataOutputStream out = fs.create(new Path("/merged"))) {
    for (FileStatus part : fs.listStatus(new Path("/parts"))) {
        try (FSDataInputStream in = fs.open(part.getPath())) {
            IOUtils.copyBytes(in, out, conf, false);
        }
    }
}

// Copy between the local file system and HDFS
FileUtil.copy(
    new File("local.txt"), 
    fs, new Path("/remote.txt"), 
    false, conf
);

XI. Debugging and Monitoring

1. Logging Configuration

// Enable verbose client-side logging
org.apache.log4j.Logger.getLogger("org.apache.hadoop.hdfs")
    .setLevel(org.apache.log4j.Level.DEBUG);

2. Metrics

// Client-side I/O statistics are tracked per FileSystem scheme and class
FileSystem.Statistics stats =
    FileSystem.getStatistics(fs.getUri().getScheme(), fs.getClass());
System.out.println("Bytes written: " + stats.getBytesWritten());
System.out.println("Read operations: " + stats.getReadOps());

// Query the NameNode over JMX (port 9988 is a placeholder for your JMX port)
JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://localhost:9988/jmxrmi");
JMXConnector connector = JMXConnectorFactory.connect(url);
MBeanServerConnection mbsc = connector.getMBeanServerConnection();
ObjectName name = new ObjectName("Hadoop:service=NameNode,name=NameNodeInfo");
String clusterId = (String) mbsc.getAttribute(name, "ClusterId");

Closing Recommendations

  1. In production, manage FileSystem instances centrally (a shared, cached instance or a pool)
  2. Use buffering and a progress listener for large file transfers (see the sketch below)
  3. Periodically verify file integrity with checksums (see the sketch below)
  4. Set timeouts for long-running operations:

conf.setLong("dfs.client.socket-timeout", 60000); // 60-second timeout
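
A minimal sketch of items 2 and 3, using the Progressable hook on FileSystem.create and getFileChecksum; all paths are placeholders:

// Item 2: report progress while writing a large file
try (FSDataOutputStream out = fs.create(
        new Path("/data/bigfile"),
        () -> System.out.print("."))) { // Progressable callback, invoked periodically
    // ... write data here ...
}

// Item 3: verify integrity by comparing checksums
// (both files must use the same block size and checksum settings)
FileChecksum c1 = fs.getFileChecksum(new Path("/data/bigfile"));
FileChecksum c2 = fs.getFileChecksum(new Path("/backup/bigfile"));
System.out.println("Checksums match: " + c1.equals(c2));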