一、Overview
In (4) we quickly put together a working join, but in real production there is a serious problem: the data volume is large, and because the final partitioning is done with something like a HashPartitioner, the data gets skewed, so some reduce tasks end up with far too much work and others with far too little. The work handed to the map tasks, on the other hand, is distributed fairly evenly, so it would be ideal if the whole join could be finished in the map phase. That naturally leads to caching: the smaller of the two tables could be cached, for example in Redis, but Redis still means a network round trip, and a local cache is better. Hadoop provides exactly this with DistributedCache, which solves our caching problem.
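For reference, the two inputs assumed in this post are not shown explicitly; reconstructed from the code and from the output in section 三, they would look roughly like the following (the field meanings of order.txt are a guess, only the third column being the product id is certain from the code):

pd.txt (product id, product name):
P0001,xiaomi5
P0002,chuiziT1
P0003,meizu

order.txt (order id, date, product id, quantity):
1001,20150710,P0001,2
1002,20150710,P0001,3
1002,20150710,P0002,3
1002,20150710,P0003,3
1003,20150710,P0002,3

pd.txt is the small table that gets cached on every map task node; order.txt is the large table that is read as the normal job input.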
二、Implementation
MapSideJoin.java
package mapjoin;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
/**
*
* Created by tianjun on 2017/3/19.
*/
public class MapSideJoin {
    static class MapSideJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

        // product table (id -> name), loaded once per map task from the cached pd.txt
        Map<String, String> pdInfoMap = new HashMap<>();
        Text k = new Text();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // pd.txt was shipped to the task's working directory via job.addCacheFile(),
            // so it can be opened here as a plain local file
            BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream("pd.txt")));
            String line = null;
            while (StringUtils.isNotEmpty(line = br.readLine())) {
                String[] fields = line.split(",");
                pdInfoMap.put(fields[0], fields[1]);
            }
            br.close();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String orderline = value.toString();
            String[] fields = orderline.split(",");
            // join on the product id (third column of the order line)
            String pdName = pdInfoMap.get(fields[2]);
            k.set(orderline + ',' + pdName);
            context.write(k, NullWritable.get());
        }
    }
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        String os = System.getProperty("os.name").toLowerCase();
        if (os.contains("windows")) {
            System.setProperty("HADOOP_USER_NAME", "root");
        }

        Configuration conf = new Configuration();
        conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.resourcemanager.hostname", "mini01");
        conf.set("fs.defaultFS", "hdfs://mini01:9000/");
        // local mode is the default
        // conf.set("mapreduce.framework.name","local");
        // conf.set("mapreduce.jobtracker.address","local");
        // conf.set("fs.defaultFS","file:///");

        Job wcjob = Job.getInstance(conf);
        // when submitting from the local IDE, setJarByClass() alone is not enough;
        // the jar has to be pointed at explicitly with setJar()
        wcjob.setJar("F:/myWorkPlace/java/dubbo/demo/dubbo-demo/mr-demo1/target/mr.demo-1.0-SNAPSHOT.jar");
        // wcjob.setJarByClass(Rjoin.class);

        wcjob.setMapperClass(MapSideJoinMapper.class);
        // wcjob.setReducerClass(RjoinReducer.class);

        // output key/value types of the Mapper
        wcjob.setMapOutputKeyClass(Text.class);
        wcjob.setMapOutputValueClass(NullWritable.class);

        // map-side join: no reduce phase needed
        wcjob.setNumReduceTasks(0);

        // output key/value types of the Reducer (not used here)
        // wcjob.setOutputKeyClass(InfoBean1.class);
        // wcjob.setOutputValueClass(NullWritable.class);

        // if no InputFormat is set, TextInputFormat is used by default
        // wcjob.setInputFormatClass(CombineFileInputFormat.class);
        // CombineFileInputFormat.setMaxInputSplitSize(wcjob,4194304);
        // CombineFileInputFormat.setMinInputSplitSize(wcjob,2097152);

        // delete the output directory if it already exists
        FileSystem fs = FileSystem.get(new URI("hdfs://mini01:9000"), new Configuration(), "root");
        Path path = new Path("hdfs://mini01:9000/wc/mapjoin");
        if (fs.exists(path)) {
            fs.delete(path, true);
        }

        // where the input data lives
        FileInputFormat.setInputPaths(wcjob, new Path("hdfs://mini01:9000/input/rjoin/order.txt"));
        // where the result is written
        FileOutputFormat.setOutputPath(wcjob, new Path("hdfs://mini01:9000/wc/mapjoin"));

        // ship a file to the working directory of every map task node
        /*wcjob.addArchiveToClassPath(archive);*/ // add a jar to the classpath of the task nodes
        /*wcjob.addCacheArchive(uri);*/           // ship an archive to the task working directory
        /*wcjob.addCacheFile(uri);*/              // ship a regular file to the task working directory
        wcjob.addCacheFile(new URI("hdfs://mini01:9000/input/rjoin/pd.txt"));

        boolean res = wcjob.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
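The mapper above opens "pd.txt" by name, which works because addCacheFile() creates a symlink with that name in the task's working directory. If you would rather not hard-code the file name, the cached URI can also be looked up from the context. The following is only a minimal sketch of that variant, not part of the original code; it would replace setup() in MapSideJoinMapper and uses only classes already imported above:

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // URIs registered with job.addCacheFile(); the local symlink is named
            // after the last path segment of the cached file
            URI[] cacheFiles = context.getCacheFiles();
            String localName = new Path(cacheFiles[0].getPath()).getName(); // "pd.txt"
            BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(localName)));
            String line = null;
            while (StringUtils.isNotEmpty(line = br.readLine())) {
                String[] fields = line.split(",");
                pdInfoMap.put(fields[0], fields[1]);
            }
            br.close();
        }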
三、Results
[root@mini03 ~]# hdfs dfs -cat /wc/mapjoin/*
1001,20150710,P0001,2,xiaomi5
1002,20150710,P0001,3,xiaomi5
1002,20150710,P0002,3,chuiziT1
1001,20150710,P0001,2,xiaomi5
1002,20150710,P0003,3,meizu
1003,20150710,P0002,3,chuiziT1