Hadoop's output formats mirror its input formats. The implementation classes of OutputFormat include TextOutputFormat (the default), SequenceFileOutputFormat, and MapFileOutputFormat, all subclasses of FileOutputFormat, plus NullOutputFormat and DBOutputFormat. (The original class-hierarchy figure is not reproduced here.)
Multiple outputs
FileOutputFormat and its subclasses write their output as a set of files in the job's output directory, one file per reducer, named part-r-nnnnn. When you need to control the names or the directory layout of the files each reducer produces, use the MultipleOutputs class instead.
(1) Define the Mapper
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MOMapper extends Mapper<LongWritable, Text, Text, Text> {

    private NcdcRecordParser parser = new NcdcRecordParser();
    private Text k = new Text();
    private Text v = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Parse the raw NCDC record and emit it keyed by station id,
        // so all records for one station reach the same reducer.
        parser.parse(value);
        k.set(parser.getStationId());
        v.set(value);
        context.write(k, v);
    }
}
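Both the mapper and the reducer below depend on an NcdcRecordParser helper whose source is not shown. A minimal sketch is given here for completeness; the field offsets and method names are assumptions based on the fixed-width NCDC record format and the station ids (e.g. 029070-99999) and years visible in the output listing, not the original class.

import org.apache.hadoop.io.Text;

// Hypothetical sketch of the NcdcRecordParser helper used above.
public class NcdcRecordParser {

    private String stationId;
    private String year;

    public void parse(String record) {
        // Assumed offsets: USAF id at chars 4-10, WBAN id at chars 10-15,
        // year at chars 15-19; e.g. stationId = "029070-99999", year = "1901".
        stationId = record.substring(4, 10) + "-" + record.substring(10, 15);
        year = record.substring(15, 19);
    }

    public void parse(Text record) {
        parse(record.toString());
    }

    public String getStationId() {
        return stationId;
    }

    public String getYear() {
        return year;
    }
}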
(2) Define the Reducer
import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class MOReducer extends Reducer<Text, Text, NullWritable, Text> {

    private MultipleOutputs<NullWritable, Text> multipleOutputs;
    private NcdcRecordParser parser = new NcdcRecordParser();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Create the MultipleOutputs instance once per task.
        multipleOutputs = new MultipleOutputs<NullWritable, Text>(context);
    }

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        for (Text value : values) {
            parser.parse(value);
            /*
             * The base path is interpreted relative to the job output directory,
             * and the framework appends the task suffix automatically.
             * Default name: 029070-99999-r-00000, where 029070-99999 is the key.
             * Changed to:   029070-99999/1901/part-r-00000, where
             * 029070-99999/1901/part is the base name and -r-00000 is generated.
             */
            String basePath = String.format("%s/%s/part", parser.getStationId(), parser.getYear());
            multipleOutputs.write(NullWritable.get(), value, basePath);
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Close MultipleOutputs, or the output files may lose records.
        multipleOutputs.close();
    }
}
(3) Define the Job
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Uses MultipleOutputs to write each weather station's data
 * into its own output directory.
 *
 * Usage: input/ncdc/all output
 *
 * @author shenfl
 */
public class UsingMultipleOutputs extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        Job job = JobBuilder.parseInputAndOutput(this, getConf(), args);
        if (job == null) {
            return -1;
        }
        job.setMapperClass(MOMapper.class);
        job.setReducerClass(MOReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // The reducer writes through MultipleOutputs, so the job's
        // output key is NullWritable and the value is the raw record.
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new UsingMultipleOutputs(), args));
    }
}
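JobBuilder.parseInputAndOutput is another helper class, not part of the Hadoop API. A minimal sketch is shown below, assuming it merely validates the two path arguments and wires up the input and output paths; the exact behavior of the original helper is an assumption.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;

public class JobBuilder {

    // Hypothetical helper: builds a Job from "<input> <output>" arguments,
    // returning null (after printing usage) when the argument count is wrong.
    public static Job parseInputAndOutput(Tool tool, Configuration conf, String[] args)
            throws IOException {
        if (args.length != 2) {
            System.err.printf("Usage: %s [generic options] <input> <output>%n",
                    tool.getClass().getSimpleName());
            return null;
        }
        Job job = Job.getInstance(conf);
        job.setJarByClass(tool.getClass());
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        return job;
    }
}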
(4) Output listing
[hadoop@mycluster ~]$ hdfs dfs -ls -R output
2016-01-26 02:32 output/029070-99999
2016-01-26 02:32 output/029070-99999/1901
2016-01-26 02:32 output/029070-99999/1901/part-r-00000
2016-01-26 02:32 output/029070-99999/1902
2016-01-26 02:32 output/029070-99999/1902/part-r-00000
2016-01-26 02:32 output/029500-99999
2016-01-26 02:32 output/029500-99999/1901
2016-01-26 02:32 output/029500-99999/1901/part-r-00000
2016-01-26 02:32 output/029500-99999/1902
2016-01-26 02:32 output/029500-99999/1902/part-r-00000
2016-01-26 02:32 output/029600-99999
2016-01-26 02:32 output/029600-99999/1901
2016-01-26 02:32 output/029600-99999/1901/part-r-00000
2016-01-26 02:32 output/029600-99999/1902
2016-01-26 02:32 output/029600-99999/1902/part-r-00000
2016-01-26 02:32 output/029720-99999
2016-01-26 02:32 output/029720-99999/1901
2016-01-26 02:32 output/029720-99999/1901/part-r-00000
2016-01-26 02:32 output/029720-99999/1902
2016-01-26 02:32 output/029720-99999/1902/part-r-00000
2016-01-26 02:32 output/029810-99999
2016-01-26 02:32 output/029810-99999/1901
2016-01-26 02:32 output/029810-99999/1901/part-r-00000
2016-01-26 02:32 output/029810-99999/1902
2016-01-26 02:32 output/029810-99999/1902/part-r-00000
2016-01-26 02:32 output/227070-99999
2016-01-26 02:32 output/227070-99999/1901
2016-01-26 02:32 output/227070-99999/1901/part-r-00000
2016-01-26 02:32 output/227070-99999/1902
2016-01-26 02:32 output/227070-99999/1902/part-r-00000
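A side note on the listing: when every record goes through MultipleOutputs and the reducer never calls context.write() directly, the job's default output format normally still creates one empty part-r-nnnnn file per reducer in the output root (none appear above, so they were either suppressed or omitted from the listing). The usual way to suppress them is LazyOutputFormat, which creates the default output file only on first write; a minimal, assumed addition to the driver's run() method, not part of the original code:

import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

// In run(), before submitting the job: wrap the real output format so the
// default part-r-nnnnn file is created only if something is written to it.
LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);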
In summary, multiple outputs let us partition the output by whichever fields we choose (here, station id and year), so the results arrive on disk already organized by category.