In Hadoop 2.0, a MapReduce program implements its own logic by extending the two base classes org.apache.hadoop.mapreduce.Mapper and org.apache.hadoop.mapreduce.Reducer. The key methods in their source code are shown below.
Mapper.java
public void run(Context context) throws IOException, InterruptedException {
  setup(context);   // Called once at the beginning of the task.
  while (context.nextKeyValue()) {
    map(context.getCurrentKey(), context.getCurrentValue(), context);
  }
  cleanup(context); // Called once at the end of the task.
}
/**
 * Called once for each key/value pair in the input split. Most applications
 * should override this, but the default is the identity function.
 */
protected void map(KEYIN key, VALUEIN value,
                   Context context) throws IOException, InterruptedException {
  context.write((KEYOUT) key, (VALUEOUT) value);
}
Reducer.java
public void run(Context context) throws IOException, InterruptedException {
  setup(context);   // Called once at the beginning of the task.
  while (context.nextKey()) {
    reduce(context.getCurrentKey(), context.getValues(), context);
  }
  cleanup(context); // Called once at the end of the task.
}
/**
 * This method is called once for each key. Most applications will define
 * their reduce class by overriding this method. The default implementation
 * is an identity function.
 */
protected void reduce(KEYIN key, Iterable<VALUEIN> values,
                      Context context) throws IOException, InterruptedException {
  for (VALUEIN value : values) {
    context.write((KEYOUT) key, (VALUEOUT) value);
  }
}
In both Mapper and Reducer there is a run() method that drives the task: it keeps pulling (key, value) pairs from the input and passes them to map() or reduce(), so normally we only need to override the map and reduce methods. In MapReduce, only types that support Hadoop's serialization can be used as keys or values, and keys must additionally be comparable so they can be sorted: a key type therefore has to implement the WritableComparable interface, while a value type only needs to implement Writable.
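As a minimal sketch of what this requires (IntPair is a made-up class used here for illustration, not part of Hadoop), a custom value type only has to serialize and deserialize its fields; to be usable as a key it would implement WritableComparable<IntPair> instead and add a compareTo() method:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

public class IntPair implements Writable {
  private int first;
  private int second;

  public IntPair() {}   // no-arg constructor required so the framework can deserialize instances

  public void set(int first, int second) {
    this.first = first;
    this.second = second;
  }

  @Override
  public void write(DataOutput out) throws IOException {    // serialize fields in a fixed order
    out.writeInt(first);
    out.writeInt(second);
  }

  @Override
  public void readFields(DataInput in) throws IOException { // read them back in the same order
    first = in.readInt();
    second = in.readInt();
  }
}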
Below is MyWordCount.java, which I wrote by following the source code above.
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class MyWordCount {
  public static class WordCountMapper extends Mapper<Object, Text, Text, IntWritable> {
    private static final IntWritable one = new IntWritable(1);
    private Text word = new Text();

    @Override
    protected void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      String line = value.toString();
      StringTokenizer words = new StringTokenizer(line);
      while (words.hasMoreTokens()) {
        word.set(words.nextToken());
        context.write(word, one);
      }
    }
  }
  public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable totalNum = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      Iterator<IntWritable> it = values.iterator();
      while (it.hasNext()) {
        sum += it.next().get();
      }
      totalNum.set(sum);
      context.write(key, totalNum);
    }
  }
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "MyWordCount");
    job.setJarByClass(MyWordCount.class);        // class used to locate the job jar
    job.setMapperClass(WordCountMapper.class);   // mapper, reducer and combiner classes for the job
    job.setReducerClass(WordCountReducer.class);
    job.setCombinerClass(WordCountReducer.class);
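    // Note: the Reducer doubles as the Combiner above; this is valid because summing counts
    // is associative and commutative, and the Reducer's input and output types match.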
    job.setOutputKeyClass(Text.class);           // output key/value types of the job
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));    // input and output paths
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
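Once the class is compiled and packaged into a jar, the job can be submitted with the hadoop jar command. A typical invocation looks like the following (the jar name and HDFS paths are placeholders, and the output directory must not already exist):

hadoop jar mywordcount.jar MyWordCount /user/hadoop/input /user/hadoop/output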