假设要用MR来求一堆数据的平均数,我们一方面在MR的过程中要记住和,一方面还需要记住数字的总个数。
总个数在reducer端是无法直接获得的(每个reducer只能看到一部分数据),因此只能在mapper过程中记录下来,使用Hadoop中的计数器可以完成这个任务。
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobID;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class WordCount extends Configured implements Tool{
public static final String GROUP = "group";
public static final String KEY = "key";
public static class MapClass extends MapReduceBase
implements Mapper<LongWritable, Text, Text,IntWritable>{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value,
OutputCollector<Text,IntWritable> output, Reporter reporter) throws IOException{
String line = value.toString();
StringTokenizer itr = new StringTokenizer(line);
while(itr.hasMoreTokens()) {
word.set(itr.nextToken());
output.collect(word, one);
reporter.getCounter(GROUP, KEY).increment(1);
}
}
}
public static class Reduce extends MapReduceBase implements
Reducer<Text, IntWritable, Text, DoubleWritable> {
private long SUMM = 0;
@Override
public void configure(JobConf conf) {
try {
JobClient client = new JobClient(conf);
RunningJob parentJob = client.getJob(JobID.forName( conf.get("mapred.job.id") ));
SUMM = parentJob.getCounters().getGroup(GROUP).getCounter(KEY);
} catch (IOException e) {
e.printStackTrace();
}
super.configure(conf);
}
public void reduce(Text key, Iterator<IntWritable> values,
OutputCollector<Text, DoubleWritable> output, Reporter report)
throws IOException {
double sum = 0;
while(values.hasNext()) {
sum += values.next().get();
}
output.collect(key, new DoubleWritable(sum/SUMM));
}
}
static int printUsage() {
System.out.println("wordcount [-m <maps>] [-r <reduces>] <input> <output>");
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}
public int run(String[] args) throws Exception {
JobConf conf = new JobConf(getConf(),WordCount.class);
conf.setJobName("wordcount.test");
conf.setMapOutputKeyClass(Text.class);
conf.setMapOutputValueClass(IntWritable.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(DoubleWritable.class);
conf.setMapperClass(MapClass.class);
//conf.setCombinerClass(Reduce.class);
conf.setReducerClass(Reduce.class);
List<String> other_args = new ArrayList<String>();
for(int i=0;i<args.length;i++) {
try{
if("-m".equals(args[i])) {
conf.setNumMapTasks(Integer.parseInt(args[++i]));
} else if("-r".equals(args[i])) {
conf.setNumReduceTasks(Integer.parseInt(args[++i]));
}else{
other_args.add(args[i]);
}
}catch(NumberFormatException except) {
System.out.println("ERROR: Integer expected instead of "
+ args[i]);
return printUsage();
}catch (ArrayIndexOutOfBoundsException except) {
System.out.println("ERROR: Required parameter missing from "
+ args[i - 1]);
return printUsage();
}
}
if (other_args.size() != 2) {
System.out.println("ERROR: Wrong number of parameters: "
+ other_args.size() + " instead of 2.");
return printUsage();
}
FileInputFormat.setInputPaths(conf, other_args.get(0));
FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));
JobClient.runJob(conf);
return 0;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new WordCount(), args);
System.exit(res);
}
}