一、前言
I/O 相关的包如下:org.apache.hadoop.io.*。以下介绍一些常用的 HDFS API 操作。
二、HDFS API
package hadoop.utils;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
/**
 * Static helper methods demonstrating common HDFS operations: writing, reading,
 * deleting, uploading and listing files through the Hadoop {@link FileSystem} API.
 *
 * @author : chenhaipeng
 * @date : 2015-08-21 01:02:26
 */
public class HDFSUtils {

    /** Buffer size, in bytes, used when copying a stream to the console. */
    private static final int COPY_BUFFER_SIZE = 4096;

    /**
     * Creates the given file and writes {@code words} into it, encoded as UTF-8.
     *
     * @param file  URI of the file to create, e.g. {@code hdfs://host:port/path}
     * @param words text to write into the file
     * @throws IOException        if the file cannot be created, written or closed
     * @throws URISyntaxException declared for backward compatibility with existing callers
     */
    public static void WriteToHDFS(String file, String words) throws IOException, URISyntaxException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(file), conf);
        Path path = new Path(file);
        FSDataOutputStream out = fs.create(path); // create the file
        try {
            // Both out.writeBytes(words) and out.write(byte[]) can write the text;
            // the latter is used here so the encoding is explicit.
            out.write(words.getBytes("UTF-8"));
            // Close explicitly on the success path so that a failure while closing
            // is reported to the caller instead of being swallowed.
            out.close();
            out = null;
        } finally {
            // Only does work if an exception was thrown before the explicit close().
            IOUtils.closeStream(out);
        }
        // To write from an input stream, or to copy one file into another (open the
        // existing file as an input stream), IOUtils.copyBytes can be used instead:
        // FSDataInputStream in = fs.open(new Path(args[0]));
        // IOUtils.copyBytes(in, out, 4096, true) // 4096 is the copy buffer size; true closes the streams when done
    }

    /**
     * Opens the given file and copies its content to standard output.
     *
     * @param file URI of the file to read
     * @throws IOException if the file cannot be opened or read
     */
    public static void ReadFromHDFS(String file) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(file), conf);
        Path path = new Path(file);
        FSDataInputStream in = null;
        try {
            in = fs.open(path);
            // Pass close=false: with close=true copyBytes would also close System.out,
            // silencing all later console output of the JVM.
            IOUtils.copyBytes(in, System.out, COPY_BUFFER_SIZE, false);
            System.out.flush();
        } finally {
            IOUtils.closeStream(in);
        }
        // Alternatively, the read methods of FSDataInputStream can load the file
        // content into a byte array and return it:
        /*
         * FileStatus stat = fs.getFileStatus(path); // create the buffer byte[]
         * buffer = new byte[Integer.parseInt(String.valueOf(stat.getLen()))];
         * is.readFully(0, buffer); is.close(); fs.close(); return buffer;
         */
    }

    /**
     * Deletes the given path; the delete is requested as recursive.
     *
     * @param file URI of the file or directory to delete
     * @throws IOException if the delete operation fails
     */
    public static void DeleteHDFSFile(String file) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(file), conf);
        Path path = new Path(file);
        // FileSystem also offers deleteOnExit, which removes the path when the JVM
        // exits. The call below passes recursive=true so a directory is removed
        // together with its content.
        fs.delete(path, true);
        // NOTE(review): FileSystem.get may hand out a shared, cached instance; closing
        // it here could affect other holders of the same instance - confirm with callers.
        fs.close();
    }

    /**
     * Copies a local file to the destination file system.
     *
     * @param src path of the local source file
     * @param dst URI of the destination
     * @throws IOException if the copy fails
     */
    public static void UploadLocalFileHDFS(String src, String dst) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(dst), conf);
        Path pathDst = new Path(dst);
        Path pathSrc = new Path(src);
        fs.copyFromLocalFile(pathSrc, pathDst);
        // NOTE(review): see DeleteHDFSFile - this may close a shared, cached instance.
        fs.close();
    }

    /**
     * Prints the entries of the given directory to standard output. The listing is
     * printed twice, once per demonstrated approach.
     *
     * @param DirFile URI of the directory to list
     * @throws IOException if the directory cannot be listed
     */
    public static void ListDirAll(String DirFile) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(DirFile), conf);
        Path path = new Path(DirFile);
        FileStatus[] status = fs.listStatus(path);
        // Approach 1: read the path from each FileStatus directly.
        for (FileStatus f : status) {
            System.out.println(f.getPath().toString());
        }
        // Approach 2: convert the FileStatus array to a Path array first.
        Path[] listedPaths = FileUtil.stat2Paths(status);
        for (Path p : listedPaths) {
            System.out.println(p.toString());
        }
    }

    /**
     * Demo entry point: lists a directory, writes a file, prints it, deletes it and
     * finally uploads a local file to the same location.
     *
     * @param args unused
     * @throws IOException        if any HDFS operation fails
     * @throws URISyntaxException propagated from {@link #WriteToHDFS(String, String)}
     */
    public static void main(String[] args) throws IOException, URISyntaxException {
        // List every entry under the directory.
        ListDirAll("hdfs://192.168.100.150:9000/user");
        String fileWrite = "hdfs://192.168.100.150:9000/user/readme2.txt";
        String words = "This words is to write into file!\n";
        WriteToHDFS(fileWrite, words);
        // Read the content of fileWrite back and show it on the terminal.
        ReadFromHDFS(fileWrite);
        // Delete the fileWrite file created above.
        DeleteHDFSFile(fileWrite);
        // Assuming the local file below exists, upload it to HDFS.
        String LocalFile = "c:/2015-04-10.txt";
        UploadLocalFileHDFS(LocalFile, fileWrite);
    }
}
三、Hadoop 压缩
数据压缩能带来相当大的好处。Hadoop 支持的压缩 codec 如下:
其中:bzip2 支持切分(splittable),其他不支持
/**
 * Decompresses a file, picking the codec from the file name via
 * {@link CompressionCodecFactory}. The output is written next to the input,
 * under the same name with the codec's default extension removed.
 *
 * @author : chenhaipeng
 * @date : 2015-09-20 19:48:15
 */
public class FileDecompressor {

    /**
     * Decompresses the file named by the first command-line argument.
     * Prints a message to standard error and exits with status 1 when no codec
     * is found for the file.
     *
     * @param args {@code args[0]} is the URI of the compressed file
     * @throws Exception if the file system cannot be reached or the copy fails
     */
    public static void main(String[] args) throws Exception {
        final String sourceUri = args[0];
        final Configuration configuration = new Configuration();
        final FileSystem fileSystem = FileSystem.get(URI.create(sourceUri), configuration);
        final Path compressedPath = new Path(sourceUri);

        final CompressionCodecFactory codecFactory = new CompressionCodecFactory(configuration);
        final CompressionCodec codec = codecFactory.getCodec(compressedPath);
        if (null == codec) {
            System.err.println("No codec found for " + sourceUri);
            System.exit(1);
        }

        // Target name: the source name without the codec's default extension.
        final String targetUri =
                CompressionCodecFactory.removeSuffix(sourceUri, codec.getDefaultExtension());

        InputStream decompressed = null;
        OutputStream target = null;
        try {
            decompressed = codec.createInputStream(fileSystem.open(compressedPath));
            target = fileSystem.create(new Path(targetUri));
            IOUtils.copyBytes(decompressed, target, configuration);
        } finally {
            IOUtils.closeStream(decompressed);
            IOUtils.closeStream(target);
        }
    }
}
提示:例如,WordCount 的输出需要使用压缩时:
FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
四、MapReduce 的格式与类型
自定义数据类型
import java.io.*;
import org.apache.hadoop.io.*;
/**
 * A pair of ints usable as a Hadoop key: it serializes both values and orders
 * pairs by the first value, then by the second.
 */
public class IntPair implements WritableComparable<IntPair> {

    private int first;
    private int second;

    /** Creates a pair whose fields keep their default values. */
    public IntPair() {
    }

    /**
     * Creates a pair holding the given values.
     *
     * @param first  first value
     * @param second second value
     */
    public IntPair(int first, int second) {
        set(first, second);
    }

    /**
     * Replaces both values of this pair.
     *
     * @param first  new first value
     * @param second new second value
     */
    public void set(int first, int second) {
        this.first = first;
        this.second = second;
    }

    /** @return the first value */
    public int getFirst() {
        return this.first;
    }

    /** @return the second value */
    public int getSecond() {
        return this.second;
    }

    /** Writes the first value, then the second, each as an int. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(this.first);
        out.writeInt(this.second);
    }

    /** Reads the first value, then the second, each as an int. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.first = in.readInt();
        this.second = in.readInt();
    }

    @Override
    public int hashCode() {
        return this.first * 163 + this.second;
    }

    /** Two pairs are equal when both of their values are equal. */
    @Override
    public boolean equals(Object o) {
        if (!(o instanceof IntPair)) {
            return false;
        }
        IntPair other = (IntPair) o;
        return this.first == other.first && this.second == other.second;
    }

    /** Renders the pair as the two values separated by a tab. */
    @Override
    public String toString() {
        return this.first + "\t" + this.second;
    }

    /** Orders by the first value; ties are broken by the second value. */
    @Override
    public int compareTo(IntPair ip) {
        int byFirst = compare(this.first, ip.first);
        return byFirst != 0 ? byFirst : compare(this.second, ip.second);
    }

    /**
     * Convenience method for comparing two ints.
     *
     * @return -1 if {@code a < b}, 0 if they are equal, 1 otherwise
     */
    public static int compare(int a, int b) {
        if (a < b) {
            return -1;
        }
        if (a == b) {
            return 0;
        }
        return 1;
    }
}