一、Mapper类源码
Mapper类的主要成员:
- class Context -- 保存了作业运行的上下文信息,例如作业配置信息、InputSplit信息、任务ID
- setup() -- map前的准备工作(可重写)
- map() -- 承担主要的对键值对的处理工作(可重写)
- cleanup() -- 收尾工作,例如关闭文件、执行map后的键值对分发(可重写)
- run() -- 提供setup-->map-->cleanup的执行模板(可重写)
package org.apache.hadoop.mapreduce;
import java.io.IOException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.task.MapContextImpl;
/**
 * Base class for map tasks: consumes input key/value pairs and writes output
 * key/value pairs through a {@code Context}.
 *
 * <p>{@code run} drives the lifecycle: {@code setup} once, {@code map} once per
 * input pair, then {@code cleanup} once. Each of the four methods may be
 * overridden by subclasses.
 *
 * @param <KEYIN> type of the input keys
 * @param <VALUEIN> type of the input values
 * @param <KEYOUT> type of the output keys
 * @param <VALUEOUT> type of the output values
 */
public class Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {

  /**
   * The context handed to every lifecycle method; it supplies the input pairs
   * and accepts the output pairs.
   */
  public abstract class Context
      implements MapContext<KEYIN,VALUEIN,KEYOUT,VALUEOUT> {
  }

  /**
   * Called once at the beginning of the task. The default does nothing.
   *
   * @param context the context of the running task
   * @throws IOException declared so that overriding implementations may throw it
   * @throws InterruptedException declared so that overriding implementations may throw it
   */
  protected void setup(Context context
                       ) throws IOException, InterruptedException {
    // NOTHING
  }

  /**
   * Called once for each key/value pair in the input split. Most applications
   * should override this, but the default is the identity function.
   *
   * @param key the current input key
   * @param value the current input value
   * @param context the context used to emit the output pair
   * @throws IOException if writing the pair fails
   * @throws InterruptedException if writing the pair is interrupted
   */
  // The casts below cannot be checked by the compiler because the type
  // parameters are erased; the identity default is only meaningful when the
  // input types are usable as the output types. Suppressing the warning here
  // keeps it scoped to this single method.
  @SuppressWarnings("unchecked")
  protected void map(KEYIN key, VALUEIN value,
                     Context context) throws IOException, InterruptedException {
    context.write((KEYOUT) key, (VALUEOUT) value);
  }

  /**
   * Called once at the end of the task. The default does nothing.
   *
   * @param context the context of the running task
   * @throws IOException declared so that overriding implementations may throw it
   * @throws InterruptedException declared so that overriding implementations may throw it
   */
  protected void cleanup(Context context
                         ) throws IOException, InterruptedException {
    // NOTHING
  }

  /**
   * Expert users can override this method for more complete control over the
   * execution of the Mapper.
   *
   * <p>The default calls {@code setup}, then {@code map} for every pair the
   * context yields, and finally {@code cleanup}.
   *
   * @param context the context that supplies the input pairs and receives the output
   * @throws IOException if setup, map, cleanup or reading the next pair fails
   * @throws InterruptedException if any of those steps is interrupted
   */
  public void run(Context context) throws IOException, InterruptedException {
    // setup() sits outside the try block: if it throws, cleanup() is not invoked.
    setup(context);
    try {
      while (context.nextKeyValue()) {
        map(context.getCurrentKey(), context.getCurrentValue(), context);
      }
    } finally {
      // Runs even when map() or nextKeyValue() throws.
      cleanup(context);
    }
  }
}
二、Hadoop自带的Mapper实现类
1.InverseMapper类:实现了键值对调换后输出的功能,源码如下:
package org.apache.hadoop.mapreduce.lib.map;
import java.io.IOException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * A {@link Mapper} that emits every input pair with its key and value swapped.
 *
 * @param <K> input key type, which becomes the output value type
 * @param <V> input value type, which becomes the output key type
 */
public class InverseMapper<K, V> extends Mapper<K, V, V, K> {

  /**
   * Writes the incoming pair back out in reverse order: the input value is
   * emitted as the key and the input key as the value.
   *
   * @param key the input key, emitted as the output value
   * @param value the input value, emitted as the output key
   * @param context the context the swapped pair is written to
   * @throws IOException if the write fails
   * @throws InterruptedException if the write is interrupted
   */
  @Override
  public void map(K key, V value, Context context)
      throws IOException, InterruptedException {
    final V outputKey = value;
    final K outputValue = key;
    context.write(outputKey, outputValue);
  }
}
2.TokenCounterMapper类:实现了单词计数的功能,源码如下:
package org.apache.hadoop.mapreduce.lib.map;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * A {@link Mapper} that splits each input value into tokens and emits a
 * {@code (token, 1)} pair for every token it finds.
 */
public class TokenCounterMapper extends Mapper<Object, Text, Text, IntWritable> {

  /** The count written alongside every token: always 1. */
  private static final IntWritable one = new IntWritable(1);

  /** Holder for the current token; it is refilled before each write. */
  private Text word = new Text();

  /**
   * Tokenizes the input value with {@link StringTokenizer}'s default
   * delimiters and writes one {@code (token, 1)} pair per token.
   *
   * @param key the input key; not used by this mapper
   * @param value the text to split into tokens
   * @param context the context each pair is written to
   * @throws IOException if a write fails
   * @throws InterruptedException if a write is interrupted
   */
  @Override
  public void map(Object key, Text value, Context context)
      throws IOException, InterruptedException {
    final String line = value.toString();
    for (StringTokenizer tokens = new StringTokenizer(line); tokens.hasMoreTokens(); ) {
      word.set(tokens.nextToken());
      context.write(word, one);
    }
  }
}
三、自定义Mapper类
自定义MyMapper类需要继承Mapper(org.apache.hadoop.mapreduce.Mapper),重写map函数。
package com.hadoop.hello;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/*
* MyMapper: a skeleton for a user-defined Mapper. Its type arguments are:
* LongWritable: key type of the map input pairs
* Text: value type of the map input pairs
* Text: key type of the map output pairs
* IntWritable: value type of the map output pairs
*/
public class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
/*
* Overrides the per-record hook; as written it is a placeholder that emits nothing.
* @param key: described by the original note as the line number within the input split
*             (NOTE(review): with a text input format this is presumably the byte
*             offset of the line rather than a line number -- confirm against the job setup)
* @param value: the content of that line of the input split
* @param context: the context through which output pairs are written
*/
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// Custom map logic goes here.
// Emit results with context.write(outKey, outValue), where outKey is a Text and
// outValue is an IntWritable (the declared output types). Writing (key, value)
// directly would not match them, because key is a LongWritable and value is a Text.
}
}