自定义InputFormat
需求:将多个小文件合并为一个SequenceFile(一个SequenceFile中存储多个小文件)
存储的格式:文件路径(key)+文件的内容(value)
public class FuncFileInputFormat extends FileInputFormat<NullWritable,BytesWritable>{
@Override
protected boolean isSplitable(JobContext context, Paht filename){
//不切原来的文件
return false;
}
@Override
public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit split,)
throws IOException,InterruptException{
FuncRecordReader RecordReader = new FuncRecordReader();
return RecordReader;
}
}
public class FuncRecordReader extends RecordReader<NullWritable, ByteWritable>{
boolean isProcess = false;
FileSplit split;
Configuration conf;
BytesWritable value = new BytesWritable();
@Override
pubic void initialize(InputSplit split, TaskAttempContext context) throws Exception{
//初始化文件切片
this.split = (FileSplit)split;
//初始化配置信息
conf = context.getConfiguration();
}
@Override
pubic boolean nextKeyValue() throws Exception{
if(!isProcess){
//1、根据切片的长度来创建缓冲区
byte[] buf = new byte[(int)split.getLength()];
FSDataInputStream fis = null;
FileSystem fs = null;
//2、获取路径
Path path = split.getPath();
//3、根据路径获取文件系统
fs = path.getFileSystem(conf);
//4、拿到输入流
fis = fs.open(path);
//5、数据拷贝
IOUtils.readFully(fis,buf,0,buf.length);
//6、拷贝缓存到最终的输出
value.set(buf,0,buf.length);
//7、关流
IOUtils.closeStream(fis);
IOUtils.closeStream(fs);
isProcess = true;
return true;
}
return false;
}
@Override
pubic NullWritable getCurrentKey() throws Exception{
return NullWritable.get();
}
@Override
pubic BytesWritable getCurrentValue() throws Exception{
return value;
}
@Override
pubic float getProgress() throws Exception{
return 0;
}
@Override
pubic void close() throws Exception{
}
}
public class SequenceFileMapper extends Mapper<LongWritable, BytesWritable, Text, BytesWritable>{
Text k = new Text();
@Override
public void setup(Context context)
throws IOException, InterruptException{
//1.获取文件的路径和信息
FileSplit Split = (FileSplit)context.getInputSplit();
//2、路径
Path path = split.getPath();
//3、即带路径又带名称
k.set(path.toString);
}
@Override
public void map(NullWritable key, ByteWritable value, Context context)
throw IOException, InterruptException{
//输出
context.write(k,value);
}
}
public class SequenceFileReducer extends Reducer<Text, BytesWritable, Text, BytesWritable>{
public void reduce(Text key, Interable<BytesWritable> value, Context context)
throws IOException, InterruptException{
for(BytesWritable v :value){
context.write(key,value);
}
}
}
public class SequenceFileDriver{
public static void main(String[] args) throws IOException, ClassNotFoundException,InterruptException{
Configuration conf = new Configuration();
Job job = Job.getInstance();
job.setJarByClass(SequenceFileDriver.class);
job.setMapperClass(SequenceFileMapper.class);
job.setReducerClass(SequenceFileReducer.class);
//设置自定义的读取方式
job.setInputFormatClass(FuncFileInputFormat.class);
//设置默认的输出方式--合并
job.setOutputFormatClass(SequenceFileOutputFormat.class)
job.setMapOutputValueClass(BytesWritable.class);
job.setMapOutputKeyClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(BytesWritable.class);
FileInputFormat.setInputPaths(job, new Path("C:/in"));
FileOutputFormat.setOutputPath(job, new Path("c:/out"));
job.waitForCompletion(true);
}
}