A map-side join is more complex to implement than a reduce-side join, and it comes with more restrictions. The usual approach is to hold the small table in memory and, for each record of the large table, look up the matching entry in that in-memory table.
This example is adapted from Hadoop Beginner's Guide. We join sales with accounts: sales records each customer's purchase transactions, and accounts records customer account information. The goal is to compute, for each customer, the number of purchases and the total amount spent.
The data is as follows (fields are tab-separated):
sales.txt
002 12.29 2004-07-02
004 13.42 2005-12-20
003 499.99 2010-12-20
001 78.95 2012-04-02
002 21.99 2006-11-30
002 93.45 2008-09-10
001 9.99 2012-05-17
accounts.txt
002 Abigail Smith Premium 2004-07-13
003 April Stevens Standard 2010-12-20
004 Nasser Hafez Premium 2001-04-23
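Working the join out by hand: account 002 (Abigail Smith) matches three sales totaling 12.29 + 21.99 + 93.45 = 127.73, while accounts 003 and 004 each match a single sale (499.99 and 13.42 respectively). Note that customer 001 appears in sales.txt but has no row in accounts.txt, so the mapper must handle a failed lookup rather than assume every sale joins to an account.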
The code is as follows:
import java.io.*;
import java.util.*;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MapJoin {
    public static class MapJoinMapper extends Mapper<Object, Text, Text, Text> {
        private Map<String, String> joinData = new HashMap<String, String>();

        // Perform the join: look up the account matching this sale's customer ID
        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String[] values = value.toString().split("\t");
            String account = joinData.get(values[0]);
            // Skip sales whose customer ID has no matching account (e.g. customer 001)
            if (account != null) {
                context.write(new Text(account), value);
            }
        }

        // Load the small table (accounts) from the distributed cache into memory
        @Override
        public void setup(Context context) throws IOException, InterruptedException {
            Path[] path = DistributedCache.getLocalCacheFiles(context.getConfiguration());
            BufferedReader reader = new BufferedReader(new FileReader(path[0].toString()));
            String str = null;
            while ((str = reader.readLine()) != null) {
                String[] s = str.split("\t");
                joinData.put(s[0], s[1]); // key: customer ID, value: customer name
            }
            reader.close();
        }
    }
    public static class MapJoinReducer extends Reducer<Text, Text, Text, Text> {
        // For each customer, count the sales records and sum the amounts
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            int count = 0;
            double total = 0.0;
            for (Text val : values) {
                count++;
                String[] v = val.toString().split("\t");
                total += Double.parseDouble(v[1]); // v[1] is the sale amount
            }
            String str = String.format("%d\t%f", count, total);
            context.write(key, new Text(str));
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Add the accounts file (the small table) to the distributed cache
        DistributedCache.addCacheFile(new Path(args[1]).toUri(), conf);
        Job job = new Job(conf, "MapJoin");
        // Set the driver, mapper, and reducer classes
        job.setJarByClass(MapJoin.class);
        job.setMapperClass(MapJoinMapper.class);
        job.setReducerClass(MapJoinReducer.class);
        // Set the output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Set the input (sales) and output paths
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[2]));
        // Wait for the job to finish
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
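Assuming the code is packaged as MapJoin.jar (the jar name and paths here are illustrative), the job takes the sales file as input, the accounts file as the cache file, and an output directory:

hadoop jar MapJoin.jar MapJoin sales.txt accounts.txt output

With the data above, the output file should then contain one line per account, matching the hand-computed totals (customer 001 is dropped for lack of an account record):

Abigail Smith	3	127.730000
April Stevens	1	499.990000
Nasser Hafez	1	13.420000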
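A side note: DistributedCache and the Job constructor used above are deprecated in Hadoop 2.x. A minimal sketch of the equivalent calls in the newer API (same logic; treat it as an assumption to verify against your Hadoop version):

// In the driver, instead of DistributedCache.addCacheFile(...):
Job job = Job.getInstance(conf, "MapJoin");
job.addCacheFile(new Path(args[1]).toUri());

// In Mapper.setup(): cached files are symlinked into the task's working
// directory, so they can be read back by their base file name.
URI[] cacheFiles = context.getCacheFiles();
BufferedReader reader = new BufferedReader(
        new FileReader(new Path(cacheFiles[0].getPath()).getName()));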