Sometimes a framework such as MapReduce needs a lower-level API that fetches only a portion of a given file rather than the whole thing. For one, pulling an entire file puts a lot of pressure on the network; for another, the file may be so large that the client cannot hold it. So in this post we try out HDFS's streaming API.
Test class code
package tech.mrbcy.bigdata.hdfs;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.net.URI;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Before;
import org.junit.Test;

public class HdfsClientDemo {

    private FileSystem fs;
    private Configuration conf;

    @Before
    public void init() throws Exception {
        conf = new Configuration();
        // Connect to the NameNode as user "root"
        fs = FileSystem.get(new URI("hdfs://amaster:9000"), conf, "root");
    }
}
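The class above opens a FileSystem but never releases it. A minimal teardown sketch (my own addition, assuming JUnit 4's @After is available; it also needs import org.junit.After) could look like this:
@After
public void cleanup() throws Exception {
    // Close the HDFS client connection after each test
    if (fs != null) {
        fs.close();
    }
}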
Uploading a file with a stream
@Test
public void testUploadWithStream() throws Exception {
    // Overwrite /access_stream.log if it already exists
    FSDataOutputStream out = fs.create(new Path("/access_stream.log"), true);
    FileInputStream fin = new FileInputStream("c:/access.log");
    IOUtils.copy(fin, out);
    fin.close();
    out.close();
}
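If copy() throws, the explicit close() calls above are skipped. A variant using try-with-resources (a sketch, not from the original post; the method name is my own, the paths are the same) releases both streams automatically:
@Test
public void testUploadWithStreamSafely() throws Exception {
    // Both streams are closed even if the copy fails halfway
    try (FileInputStream fin = new FileInputStream("c:/access.log");
         FSDataOutputStream out = fs.create(new Path("/access_stream.log"), true)) {
        IOUtils.copy(fin, out);
    }
}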
Downloading a file with a stream
@Test
public void testDownloadWithStream() throws Exception {
    FSDataInputStream in = fs.open(new Path("/access_stream.log"));
    FileOutputStream out = new FileOutputStream("d:/access_stream.log");
    IOUtils.copy(in, out);
    in.close();
    out.close();
}
Reading a specified byte range of a file with a stream
@Test
public void testRandomAccess() throws Exception {
    FSDataInputStream in = fs.open(new Path("/access_stream.log"));
    FileOutputStream out = new FileOutputStream("d:/access_stream.log");
    // Skip the first 1 MB, then copy exactly 1 MB
    IOUtils.copyLarge(in, out, 1 * 1024 * 1024, 1 * 1024 * 1024);
    in.close();
    out.close();
}
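copyLarge() reaches the 1 MB offset by skipping bytes on the stream. Since FSDataInputStream is seekable, the same read can also be positioned with seek(); a sketch of that approach (method name and output path are my own) looks like this:
@Test
public void testRandomAccessWithSeek() throws Exception {
    try (FSDataInputStream in = fs.open(new Path("/access_stream.log"));
         FileOutputStream out = new FileOutputStream("d:/access_stream_seek.log")) {
        // Jump directly to the 1 MB offset instead of skipping bytes
        in.seek(1 * 1024 * 1024);
        // Copy exactly 1 MB from the current position (offset 0 relative to it)
        IOUtils.copyLarge(in, out, 0, 1 * 1024 * 1024);
    }
}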
Printing HDFS file contents to the console
@Test
public void testCat() throws Exception {
    FSDataInputStream in = fs.open(new Path("/access_stream.log"));
    IOUtils.copy(in, System.out);
    in.close();
}
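Hadoop ships its own IOUtils as well. An equivalent cat using org.apache.hadoop.io.IOUtils.copyBytes is sketched below (the class name clashes with commons-io, so the call is fully qualified; the buffer size is my own choice):
@Test
public void testCatWithHadoopIOUtils() throws Exception {
    FSDataInputStream in = fs.open(new Path("/access_stream.log"));
    // 4096-byte buffer; the final 'false' leaves the streams open so we close them ourselves
    org.apache.hadoop.io.IOUtils.copyBytes(in, System.out, 4096, false);
    in.close();
}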
Getting file block information
@Test
public void testGetFileBlock() throws Exception {
    FileStatus fileStatus = fs.getFileStatus(new Path("/hadoop-2.7.3.tar.gz"));
    // Ask the NameNode for the block layout of the whole file
    BlockLocation[] blockLocations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
    for (BlockLocation bl : blockLocations) {
        System.out.println("block-length:" + bl.getLength() + "--" + "block-offset:" + bl.getOffset());
        // DataNodes that hold a replica of this block
        String[] hosts = bl.getHosts();
        for (String host : hosts) {
            System.out.println(host);
        }
    }
}
The output is:
block-length:134217728--block-offset:0
anode1.mrbcy.tech
anode2.mrbcy.tech
block-length:79874467--block-offset:134217728
anode2.mrbcy.tech
anode1.mrbcy.tech
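This block layout is exactly the kind of information a framework like MapReduce uses to assign splits. Combined with seek(), a client can pull just one block of the file instead of the whole thing; the sketch below (test name and local path are assumptions on my part) reads only the second block of hadoop-2.7.3.tar.gz:
@Test
public void testReadSingleBlock() throws Exception {
    FileStatus fileStatus = fs.getFileStatus(new Path("/hadoop-2.7.3.tar.gz"));
    BlockLocation[] blocks = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
    BlockLocation second = blocks[1];
    try (FSDataInputStream in = fs.open(new Path("/hadoop-2.7.3.tar.gz"));
         FileOutputStream out = new FileOutputStream("d:/block2.dat")) {
        // Position the stream at the block boundary and copy only that block's bytes
        in.seek(second.getOffset());
        IOUtils.copyLarge(in, out, 0, second.getLength());
    }
}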