在写MR时,如果需要根据每条记录的内容动态决定该记录的输出目录,可以使用MultipleOutputs类来实现。
/**
 * Map-only mapper that routes each input record into a per-date/hour output
 * directory using {@code MultipleOutputs}.
 *
 * <p>The first space-delimited token of every line is expected to start with a
 * 10-character date/hour prefix (e.g. "2016010201"); that prefix is used as a
 * relative directory under the job's root output path, producing files such as
 * {root}/2016010201/part-m-*.
 */
public static class hdfs2HiveMap extends Mapper<LongWritable,Text,NullWritable,Text>
{
    // Writer that supports per-record dynamic output paths; created in setup(),
    // closed in cleanup().
    private MultipleOutputs<NullWritable,Text> multipleOutputs;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Initialize once per map task; MultipleOutputs wraps the task context.
        multipleOutputs = new MultipleOutputs<NullWritable, Text>(context);
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String firstField = value.toString().split(" ")[0];
        // Guard against short/malformed records: without this check,
        // substring(0, 10) throws StringIndexOutOfBoundsException and fails
        // the entire task because of a single bad line. Count and skip instead.
        if (firstField.length() < 10) {
            context.getCounter("hdfs2Hive", "MALFORMED_RECORDS").increment(1);
            return;
        }
        // The date/hour field of this record decides which directory it is
        // written to.
        String dateHour = firstField.substring(0, 10);
        // basePath is relative to the root output dir configured in main(),
        // so the final location is e.g. /cctest4/2016010201/part-m-00000.
        String basePath = String.format("%s/part", dateHour);
        multipleOutputs.write(NullWritable.get(), value, basePath);
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Must be closed to flush every per-directory writer; otherwise
        // output files may be truncated or missing.
        multipleOutputs.close();
    }
}
/**
 * Configures and submits the map-only job.
 *
 * <p>Optional command-line arguments override the hard-coded defaults
 * (kept for backward compatibility): {@code args[0]} = input path,
 * {@code args[1]} = root output directory.
 *
 * @param args optional input path and output root directory
 * @throws Exception if job setup or submission fails
 */
public static void main(String[] args) throws Exception {
    // Fall back to the original hard-coded paths when no args are given.
    String inputPath = args.length > 0 ? args[0] : "/test.txt";
    String outputRoot = args.length > 1 ? args[1] : "/cctest4/";

    Job job = Job.getInstance();
    job.setJarByClass(hdfs2Hive.class);
    job.setMapperClass(hdfs2HiveMap.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);
    // Map-only job: records go straight from the mapper to HDFS.
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    // Root directory under which the mapper's per-date relative paths
    // (e.g. 2016010201/part) are created.
    FileOutputFormat.setOutputPath(job, new Path(outputRoot));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}