hadoop 自学指南六之IO /HDFS 操作API

最新推荐文章于 2024-04-20 11:04:53 发布

原创最新推荐文章于 2024-04-20 11:04:53 发布 · 616 阅读

1 ·

CC 4.0 BY-SA版权

hadoop 自学指南专栏收录该内容

17 篇文章

订阅专栏

本文介绍了Hadoop自学过程中的HDFS API操作，包括常用API的使用，并探讨了Hadoop数据压缩的益处及支持的压缩codec，如bzip2等。此外，还提及了MapReduce中数据格式和类型的处理。

一、前言

I/O相关的包如下：org.apache.hadoop.io.*，以下介绍一些常用的HDFS的API操作

二、HDFS API

package hadoop.utils;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

/**
 * Static helpers demonstrating common HDFS operations through the Hadoop
 * {@link FileSystem} API: writing, reading, deleting, uploading and listing.
 *
 * @author : chenhaipeng
 * @date : 2015-08-21 01:02:26 AM
 */
public class HDFSUtils {
	/**
	 * Creates (or overwrites) {@code file} and writes {@code words} to it as UTF-8.
	 *
	 * @param file  full URI of the target file, e.g. {@code hdfs://host:port/path}
	 * @param words text to write
	 * @throws IOException        if the file cannot be created, written or closed
	 * @throws URISyntaxException kept in the signature for caller compatibility
	 */
	public static void WriteToHDFS(String file, String words) throws IOException, URISyntaxException {
		Configuration conf = new Configuration();
		FileSystem fs = FileSystem.get(URI.create(file), conf);
		Path path = new Path(file);
		FSDataOutputStream out = null;
		try {
			out = fs.create(path); // create the file
			// writeBytes(words) would also work, but write(byte[]) with an explicit
			// charset is the usual choice.
			out.write(words.getBytes("UTF-8"));
			// Close on the success path so that a failing close() is reported.
			out.close();
			out = null;
		} finally {
			// Only does real work when an exception left the stream open.
			IOUtils.closeStream(out);
		}

		// To write from an input stream, or to copy one file into another (open the
		// existing file as an input stream), IOUtils.copyBytes can be used instead:
		// FSDataInputStream in = fs.open(new Path(args[0]));
		// IOUtils.copyBytes(in, out, 4096, true) // 4096 = buffer size, true = close streams when done
	}

	/**
	 * Copies the content of {@code file} to standard output.
	 *
	 * @param file full URI of the file to read
	 * @throws IOException if the file cannot be opened or read
	 */
	public static void ReadFromHDFS(String file) throws IOException {
		Configuration conf = new Configuration();
		FileSystem fs = FileSystem.get(URI.create(file), conf);
		Path path = new Path(file);
		FSDataInputStream in = null;
		try {
			in = fs.open(path);
			// close=false: passing true would also close System.out and silence
			// every later print in this JVM. The input is closed in finally.
			IOUtils.copyBytes(in, System.out, 4096, false);
		} finally {
			IOUtils.closeStream(in);
		}

		// Alternative: read the whole file into a byte array with readFully:
		// FileStatus stat = fs.getFileStatus(path);
		// byte[] buffer = new byte[Integer.parseInt(String.valueOf(stat.getLen()))];
		// in.readFully(0, buffer); in.close(); fs.close(); return buffer;
	}

	/**
	 * Deletes {@code file}; a directory is deleted recursively.
	 *
	 * @param file full URI of the file or directory to delete
	 * @throws IOException if the delete call or closing the file system fails
	 */
	public static void DeleteHDFSFile(String file) throws IOException {
		Configuration conf = new Configuration();
		FileSystem fs = FileSystem.get(URI.create(file), conf);
		Path path = new Path(file);
		try {
			// FileSystem offers several delete variants (deleteOnExit removes the path
			// when the JVM exits); this one recurses when the path is a directory.
			fs.delete(path, true);
		} finally {
			// NOTE(review): FileSystem.get may hand out a shared cached instance;
			// confirm no other code still uses it when it is closed here.
			fs.close();
		}
	}

	/**
	 * Uploads the local file {@code src} to {@code dst}.
	 *
	 * @param src path of the local source file
	 * @param dst full URI of the destination
	 * @throws IOException if the copy or closing the file system fails
	 */
	public static void UploadLocalFileHDFS(String src, String dst) throws IOException {
		Configuration conf = new Configuration();
		FileSystem fs = FileSystem.get(URI.create(dst), conf);
		Path pathDst = new Path(dst);
		Path pathSrc = new Path(src);

		try {
			fs.copyFromLocalFile(pathSrc, pathDst);
		} finally {
			// NOTE(review): same shared-instance caveat as in DeleteHDFSFile.
			fs.close();
		}
	}

	/**
	 * Prints every entry directly under {@code DirFile}, twice: once via
	 * {@link FileStatus#getPath()} and once via {@link FileUtil#stat2Paths}.
	 *
	 * @param DirFile full URI of the directory to list
	 * @throws IOException if the listing fails
	 */
	public static void ListDirAll(String DirFile) throws IOException {
		Configuration conf = new Configuration();
		FileSystem fs = FileSystem.get(URI.create(DirFile), conf);
		Path path = new Path(DirFile);

		FileStatus[] status = fs.listStatus(path);
		// Approach 1: take the path from each FileStatus
		for (FileStatus f : status) {
			System.out.println(f.getPath().toString());
		}
		// Approach 2: convert the whole array to paths first
		Path[] listedPaths = FileUtil.stat2Paths(status);
		for (Path p : listedPaths) {
			System.out.println(p.toString());
		}
	}

	/**
	 * Demo driver: lists a directory, writes a file, prints it, deletes it and
	 * finally uploads a local file to the same location.
	 */
	public static void main(String[] args) throws IOException, URISyntaxException {
		// List everything under the directory
		ListDirAll("hdfs://192.168.100.150:9000/user");

		String fileWrite = "hdfs://192.168.100.150:9000/user/readme2.txt";
		String words = "This words is to write into file!\n";
		WriteToHDFS(fileWrite, words);
		// Read fileWrite back and show it on the terminal
		ReadFromHDFS(fileWrite);
		// Delete the file written above
		DeleteHDFSFile(fileWrite);
		// Assuming a local file exists, upload it to HDFS
		String LocalFile = "c:/2015-04-10.txt";
		UploadLocalFileHDFS(LocalFile, fileWrite);
	}

}

三、Hadoop 压缩

数据压缩能带来相当大的好处，Hadoop 支持的压缩 codec 如下：

其中：bzip2 支持切分（split），其他不支持

/**
 * Decompresses a file using the codec inferred from its file extension. The
 * output is written next to the input, with the codec's extension removed.
 *
 * @author : chenhaipeng
 * @date : 2015-09-20 07:48:15 PM
 */
public class FileDecompressor {
	
	/**
	 * @param args {@code args[0]} is the URI of the compressed input file
	 * @throws Exception if opening, decompressing or writing fails
	 */
	public static void main(String[] args) throws Exception {
		// Fail with a usage hint instead of an ArrayIndexOutOfBoundsException.
		if (args == null || args.length < 1) {
			System.err.println("Usage: FileDecompressor <uri>");
			System.exit(1);
		}
		String uri = args[0];
		Configuration conf = new Configuration();
		FileSystem fs = FileSystem.get(URI.create(uri),conf);
		Path inputPath = new Path(uri);
		CompressionCodecFactory factory = new CompressionCodecFactory(conf);
		// The codec is chosen from the file name suffix.
		CompressionCodec codec = factory.getCodec(inputPath);
		if(codec == null){
			System.err.println("No codec found for "+ uri);
			System.exit(1);
		}
		// Output name = input name without the codec's extension.
		String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
		InputStream raw = null;
		InputStream in = null;
		OutputStream out = null;
		try {
			raw = fs.open(inputPath);
			in = codec.createInputStream(raw);
			out = fs.create(new Path(outputUri));
			IOUtils.copyBytes(in, out, conf);
		} finally{
			// If wrapping failed, "in" is still null and the raw stream must be
			// closed directly; otherwise closing "in" is enough.
			IOUtils.closeStream(in != null ? in : raw);
			IOUtils.closeStream(out);
		}
	}

}

tip：例如

WordCount使用压缩的时候

FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

四、MapReduce 的格式与类型

自定义数据类型

import java.io.*;

import org.apache.hadoop.io.*;

/**
 * A pair of ints that can be serialized with {@link #write} /
 * {@link #readFields} and is ordered by the first component, then the second.
 */
public class IntPair implements WritableComparable<IntPair> {

  private int first;
  private int second;

  /** Creates a pair whose components both hold the default value 0. */
  public IntPair() {
  }

  /** Creates a pair holding the given components. */
  public IntPair(int first, int second) {
    set(first, second);
  }

  /** Replaces both components at once. */
  public void set(int first, int second) {
    this.first = first;
    this.second = second;
  }

  /** @return the first component */
  public int getFirst() {
    return this.first;
  }

  /** @return the second component */
  public int getSecond() {
    return this.second;
  }

  /** Writes the first component, then the second, each as an int. */
  @Override
  public void write(DataOutput out) throws IOException {
    out.writeInt(this.first);
    out.writeInt(this.second);
  }

  /** Reads the components back in the order {@link #write} emitted them. */
  @Override
  public void readFields(DataInput in) throws IOException {
    this.first = in.readInt();
    this.second = in.readInt();
  }

  @Override
  public int hashCode() {
    return 163 * first + second;
  }

  /** Two pairs are equal when both components match. */
  @Override
  public boolean equals(Object o) {
    if (!(o instanceof IntPair)) {
      return false;
    }
    IntPair other = (IntPair) o;
    return first == other.first && second == other.second;
  }

  /** @return the two components separated by a tab */
  @Override
  public String toString() {
    return new StringBuilder().append(first).append("\t").append(second).toString();
  }

  /** Orders by the first component and breaks ties with the second. */
  @Override
  public int compareTo(IntPair ip) {
    int byFirst = compare(first, ip.first);
    return byFirst != 0 ? byFirst : compare(second, ip.second);
  }

  /**
   * Compares two ints.
   *
   * @return -1 if {@code a < b}, 0 if they are equal, 1 otherwise
   */
  public static int compare(int a, int b) {
    if (a < b) {
      return -1;
    }
    if (a > b) {
      return 1;
    }
    return 0;
  }

}