package com.cn.demo_xwjhb;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import java.io.IOException;
public class MyInputFormat extends FileInputFormat<NullWritable, BytesWritable> {
    /**
     * Purpose: merge small files into a single binary file.
     * The return value indicates whether a file may be split; since we are
     * merging small files, splitting is disabled.
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }

    /**
     * Main entry point used when reading input; it must return our custom RecordReader.
     * @param inputSplit the input split
     * @param taskAttemptContext the task context
     * @return an initialized MyRecordReader
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        MyRecordReader reader = new MyRecordReader();
        reader.initialize(inputSplit, taskAttemptContext);
        return reader;
    }
}
package com.cn.demo_xwjhb;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
public class MyRecordReader extends RecordReader<NullWritable, BytesWritable> {
    private FileSplit fileSplit;
    private Configuration configuration;
    private BytesWritable value = new BytesWritable();
    private boolean processed = false;

    /**
     * Core workflow of a RecordReader:
     * nextKeyValue() reads the data and builds the key/value pair to be returned;
     * this is the main method to override.
     * getCurrentKey() and getCurrentValue() then hand back the key and value built there.
     */
    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        this.fileSplit = (FileSplit) inputSplit;
        this.configuration = taskAttemptContext.getConfiguration();
    }

    /**
     * Builds K1/V1:
     * allocate a byte array large enough to hold the entire split,
     * read the data from an FSDataInputStream with IOUtils.readFully(),
     * and close the stream when done.
     * @return true on the first call, false afterwards (one record per file)
     * @throws IOException
     */
    @Override
    public boolean nextKeyValue() throws IOException {
        if (!processed) {
            byte[] contents = new byte[(int) fileSplit.getLength()];
            Path file = fileSplit.getPath();
            FileSystem fs = file.getFileSystem(configuration);
            FSDataInputStream in = null;
            try {
                in = fs.open(file);
                // Several IOUtils classes exist (Hadoop, commons-io); pick one and use it consistently.
                IOUtils.readFully(in, contents, 0, contents.length);
                value.set(contents, 0, contents.length);
            } finally {
                IOUtils.closeStream(in);
            }
            processed = true;
            return true;
        }
        return false;
    }

    /**
     * Returns K1.
     */
    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        return NullWritable.get();
    }

    /**
     * Returns V1.
     */
    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    /**
     * Reports read progress; of little practical use here.
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return processed ? 1.0f : 0.0f;
    }

    @Override
    public void close() throws IOException {
    }
}
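Since the driver below sets Text as the output key class while MyRecordReader emits NullWritable keys, a small mapper is needed to pair each file's bytes with its file name. The class and field names here (SequenceFileMapper, filenameKey) are assumptions for illustration; a minimal sketch could look like this:

package com.cn.demo_xwjhb;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
// Hypothetical mapper: pairs each file's bytes with its file name so the
// SequenceFile output carries a meaningful Text key.
public class SequenceFileMapper extends Mapper<NullWritable, BytesWritable, Text, BytesWritable> {
    private Text filenameKey = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Each split is a whole file, so the split's path gives the file name.
        FileSplit split = (FileSplit) context.getInputSplit();
        filenameKey.set(split.getPath().getName());
    }

    @Override
    protected void map(NullWritable key, BytesWritable value, Context context) throws IOException, InterruptedException {
        context.write(filenameKey, value);
    }
}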
--------------------------------- Main program: binary (SequenceFile) output -------------------------
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(BytesWritable.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
SequenceFileOutputFormat.setOutputPath(job,new Path("file:///D:\\dsj\\baishi课件\\hadoop\\5、大数据离线第五天\\5、大数据离线第五天\\自定义inputformat_小文件合并\\output"));
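For context, a complete driver wiring the custom InputFormat, the mapper above, and SequenceFile output together could look roughly as follows; the driver class name and the input/output paths are placeholders, not from the original post:

package com.cn.demo_xwjhb;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
// Hypothetical driver sketch; paths and class name are placeholders.
public class SmallFileMergeDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "small_file_merge");
        job.setJarByClass(SmallFileMergeDriver.class);

        // Read whole files as single records via the custom InputFormat.
        job.setInputFormatClass(MyInputFormat.class);
        FileInputFormat.addInputPath(job, new Path("file:///D:/input_small_files"));

        // Map-only job: one (file name, file bytes) record per input file.
        job.setMapperClass(SequenceFileMapper.class);
        job.setNumReduceTasks(0);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path("file:///D:/output_merged"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}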
This post shows how to merge small files in Hadoop with a custom InputFormat class, `MyInputFormat`, and RecordReader class, `MyRecordReader`. `MyInputFormat` overrides `isSplitable` to disable file splitting and creates and initializes `MyRecordReader` in `createRecordReader`. The core logic of `MyRecordReader` lives in `nextKeyValue`, which reads an entire file in one pass and returns its contents as a BytesWritable value. Finally, the post shows how to configure the Job to use this custom InputFormat and write binary SequenceFile output.