It is common for a MapReduce program to process two or more data sources, in which case the sources have to be joined on some key. The ground rules for joins are:
1) If the two data sources are roughly the same size, perform the join directly with the DataJoin classes (a reduce-side join).
2) If the two sources differ greatly in size, use the Distributed Cache to replicate the smaller source to every node and cache it there, then filter the larger source against it. If the smaller source does not fit in the cache, distribute only its keys through the Distributed Cache instead, filter the larger source with that key set, and then join the filtered result with the smaller source. A minimal sketch of the basic cached-join form follows this list.
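To make case 2 concrete, here is a minimal sketch of the cached (map-side) join, independent of the job shown later in this post. The class name, the tab-separated key/value layout of the small file, and the inner-join semantics are all illustrative assumptions, not taken from the original job:

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Cached (map-side) join: the small source is replicated to every node via the
// Distributed Cache and loaded into memory in setup(); the large source then
// streams through map() and is joined without a reduce phase.
public class ReplicatedJoinMapper extends Mapper<LongWritable, Text, Text, Text> {

    private final Map<String, String> smallTable = new HashMap<String, String>();
    private final Text outputKey = new Text();
    private final Text outputValue = new Text();

    @Override
    protected void setup(Context context) throws IOException {
        // Files registered in the driver with job.setCacheFiles(...) show up
        // here as local paths on each node.
        Path[] cacheFiles = context.getLocalCacheFiles();
        if (cacheFiles == null) {
            return;
        }
        for (Path cacheFile : cacheFiles) {
            BufferedReader reader = new BufferedReader(new FileReader(cacheFile.toString()));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    String[] terms = line.split("\t", 2); // assumed layout: key TAB value
                    if (terms.length == 2) {
                        smallTable.put(terms[0], terms[1]);
                    }
                }
            } finally {
                reader.close();
            }
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] terms = value.toString().split("\t", 2);
        if (terms.length != 2) {
            return;
        }
        String matched = smallTable.get(terms[0]);
        if (matched != null) { // inner join: records without a match are dropped
            outputKey.set(terms[0]);
            outputValue.set(terms[1] + "\t" + matched);
            context.write(outputKey, outputValue);
        }
    }
}

Because the join happens entirely in map(), no reducer is needed; the catch is that the whole small table must fit in each task's heap, which is exactly the limit the rest of this post works around.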
The problem I ran into this week was exactly the second situation: the smaller data source could not be held in the cache (an OOM error was thrown).
The solution is to use two jobs:
The first job strips the smaller data source down to a keys-only dataset.
In the second job, one mapper distributes that keys-only file to every node through the Distributed Cache and uses it to filter the larger source, emitting each record's unique ID as the output key; the other mapper loads the smaller source and emits the same unique ID as its key. The reducer then receives the records from both sources that share an ID and performs its own processing.
Enough talk; here is the code, with comments:
// Imports for the standard Hadoop and JDK classes used below. The Micloud* classes,
// PathManager, OutputConfig, DataField, StructuredData and VerifyUtil are internal
// library classes; their imports are omitted here.
import org.apache.commons.lang.StringUtils; // org.apache.commons.lang3 also works
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.net.URI;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
 * Reverse-verification job for basic user-attribute data.
 * Created by luweijie@xiaomi.com on 15-7-20.
 */
public class BasicDataVerifyJob {

    private static Logger logger = LoggerFactory.getLogger(BasicDataVerifyJob.class);

    private static final String SEPARATOR = "\t";
    /**
     * Job entry point.
     * @param args command-line arguments; -Dpath, -Ddate and -Doutput are required
     */
    public static void main (String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        // Parses the generic -D options into conf; the remaining args are not used.
        new GenericOptionsParser(conf, args).getRemainingArgs();
        if (StringUtils.isBlank(conf.get("path")) || StringUtils.isBlank(conf.get("date")) ||
                StringUtils.isBlank(conf.get("output"))) {
            System.err.println("jvm args: -Dpath -Ddate -Doutput must be specified!");
            System.exit(2);
        }
        FileSystem fs = FileSystem.get(conf);
        String path = conf.get("path");
        if (!fs.exists(new Path(path))) {
            logger.error("input path not exists : " + path);
            System.exit(-1);
        }
        // Output directory: ensure a trailing slash and remove any previous run's output.
        String output = conf.get("output");
        Path outputPath = new Path(output);
        if (!output.endsWith("/")) {
            output += "/";
        }
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        // Job 1: generate the mappingID (keys-only) file from the smaller source.
        String mappingIDFileOutput = output + "mappingIDFile";
        MicloudMRJob job1 = MicloudMRJob.getInstance(conf, "Generate mappingID file: " + path + " --> " + mappingIDFileOutput);
        job1.setJarByClass(BasicDataVerifyJob.class);
        job1.setMapperClass(MappingIDGeneratorMapper.class);
        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(NullWritable.class);
        MultipleInputs.addInputPath(job1, new Path(path), TextInputFormat.class, MappingIDGeneratorMapper.class);
        job1.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job1, new Path(mappingIDFileOutput));
        // Once the mappingIDFile has been generated successfully, filter the original
        // data against it and run the comparison/verification.
        if (job1.waitForCompletion(true)) {
            PathManager pathManager = PathManager.create(conf.get("date"));
            MicloudHDFSInputConfig inputConfig = new MicloudHDFSInputConfig("UserPublicInformation",
                    FilterMapper.class, BasicDataVerifyJob.class,
                    pathManager, Text.class, DataField.class);
            String verifyFile = output + "verify";
            OutputConfig outputConfig = new OutputConfig();
            outputConfig.setOutputPath(verifyFile);
            outputConfig.setOutputFormatClass(TextOutputFormat.class);
            outputConfig.setOutKeyClass(Text.class);
            outputConfig.setOutValueClass(IntWritable.class);
            outputConfig.setReducerClass(VerifyReducer.class);
            MicloudMRJob job2 = MicloudDataContext.getInstance().createJob(conf, inputConfig, outputConfig, "Verify Basic Data Job: " + pathManager.getOutputPath() + " --> " + verifyFile);
            MultipleInputs.addInputPath(job2, new Path(path), TextInputFormat.class, VerifyHDFSMapper.class);
            // Register the keys-only output of job 1 as Distributed Cache files for job 2.
            FileStatus[] fileStatuses = fs.listStatus(new Path(mappingIDFileOutput));
            URI[] uris = new URI[fileStatuses.length];
            for (int i = 0; i < fileStatuses.length; i++) {
                uris[i] = fileStatuses[i].getPath().toUri();
            }
            job2.setCacheFiles(uris);
            // Number of reducers.
            job2.setNumReduceTasks(50);
            // Job 3: aggregate the verification results produced by job 2.
            if (job2.waitForCompletion(true)) {
                String resultFile = output + "result";
                MicloudMRJob job3 = MicloudMRJob.getInstance(conf, "Statistics Verify Data Job: " + verifyFile + " --> " + resultFile);
                job3.setJarByClass(BasicDataVerifyJob.class);
                job3.setMapperClass(StatisticsMapper.class);
                job3.setMapOutputKeyClass(Text.class);
                job3.setMapOutputValueClass(IntWritable.class);
                MultipleInputs.addInputPath(job3, new Path(verifyFile), TextInputFormat.class, StatisticsMapper.class);
                job3.setOutputFormatClass(TextOutputFormat.class);
                FileOutputFormat.setOutputPath(job3, new Path(resultFile));
                job3.setCombinerClass(StatisticsCombiner.class);
                job3.setReducerClass(StatisticsReducer.class);
                job3.setNumReduceTasks(3);
                job3.waitForCompletion(true);
            }
        }
    }
    /**
     * Mapper that produces the mappingID (keys-only) file.
     */
    public static class MappingIDGeneratorMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

        private Text outputKey = new Text();

        @Override
        protected void map (LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] terms = line.split(SEPARATOR);
            if (terms.length < 2) {
                return;
            }
            // The first field is the record's unique mappingID; emit it with no value.
            outputKey.set(terms[0]);
            context.write(outputKey, NullWritable.get());
        }
    }
    /**
     * Charset used to round-trip serialized object bytes through a String.
     * ISO-8859-1 maps every byte to exactly one char and back, so no bytes are lost.
     */
    private static final String CHARSET = "ISO-8859-1";
    /**
     * Filter mapper: keeps only records whose mappingID appears in the cached key set.
     */
    public static class FilterMapper extends MicloudHDFSMapper<Text, DataField> {

        private static Set<String> joinData = new HashSet<String>();
        private Text outputKey = new Text();

        @Override
        protected void setup (Context context) {
            // Load the Distributed Cache files into the in-memory key set on each map task.
            try {
                String line = null;
                Path[] cacheFiles = context.getLocalCacheFiles();
                if (cacheFiles != null && cacheFiles.length > 0) {
                    for (int i = 0; i < cacheFiles.length; i++) {
                        BufferedReader joinReader = new BufferedReader(new FileReader(cacheFiles[i].toString()));
                        try {
                            while ((line = joinReader.readLine()) != null) {
                                joinData.add(line);
                            }
                        } finally {
                            joinReader.close();
                        }
                    }
                }
            } catch (IOException e) {
                // Without the key set the join would silently produce nothing, so fail the task.
                throw new RuntimeException("Failed to load distributed cache files", e);
            }
        }
        @Override
        public void persistMapOutput(Mapper.Context context, StructuredData data) throws IOException, InterruptedException {
            String mappingID = String.valueOf(data.getData("MappingID"));
            if (joinData.contains(mappingID)) {
                outputKey.set(mappingID);
                // Serialize the StructuredData object and carry its bytes inside a String
                // (ISO-8859-1 keeps them intact); the "A" tag marks this source.
                ByteArrayOutputStream bos = new ByteArrayOutputStream();
                ObjectOutputStream out = new ObjectOutputStream(bos);
                out.writeObject(data);
                out.close(); // close (flush) before reading the buffer
                DataField outputValue = new DataField("A", bos.toString(CHARSET));
                context.write(outputKey, outputValue);
            }
        }
    }
    /**
     * Mapper for the records coming from the HDFS input file.
     */
    public static class VerifyHDFSMapper extends Mapper<LongWritable, Text, Text, DataField> {

        private Text outputKey = new Text();

        @Override
        protected void map (LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] terms = line.split(SEPARATOR);
            if (terms.length < 2) {
                return;
            }
            // The "B" tag marks records from this source; the key is the unique mappingID.
            outputKey.set(terms[0]);
            DataField outputValue = new DataField("B", terms[1]);
            context.write(outputKey, outputValue);
        }
    }
    /**
     * Verify reducer: receives the "A" and "B" records that share a mappingID
     * and compares them.
     */
    public static class VerifyReducer extends Reducer<Text, DataField, Text, IntWritable> {

        private Text outputKey = new Text();
        private IntWritable outputValue = new IntWritable();

        @Override
        protected void reduce (Text key, Iterable<DataField> values, Context context) throws IOException, InterruptedException {
            StructuredData structuredData = null;
            String stringData = null;
            for (DataField value : values) {
                if ("A".equalsIgnoreCase(value.getKey())) {
                    // Deserialize the StructuredData object from the ISO-8859-1 byte string.
                    ByteArrayInputStream bis = new ByteArrayInputStream(value.getValue().getBytes(CHARSET));
                    ObjectInputStream in = new ObjectInputStream(bis);
                    try {
                        structuredData = (StructuredData) in.readObject();
                    } catch (ClassNotFoundException e) {
                        e.printStackTrace();
                    }
                    in.close();
                } else if ("B".equalsIgnoreCase(value.getKey())) {
                    stringData = value.getValue();
                }
            }
            // Compare the two representations; the map holds one counter per check.
            Map<String, Integer> resultMap = VerifyUtil.verifyBasicData(stringData, structuredData);
            if (resultMap != null) {
                for (Map.Entry<String, Integer> entry : resultMap.entrySet()) {
                    outputKey.set(entry.getKey());
                    outputValue.set(entry.getValue());
                    context.write(outputKey, outputValue);
                }
            }
        }
    }
    /**
     * Statistics mapper: re-emits each (check, count) pair from the verify output.
     */
    public static class StatisticsMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private Text outputKey = new Text();
        private IntWritable outputValue = new IntWritable();

        @Override
        protected void map (LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // value.toString() rather than new String(value.getBytes()): getBytes()
            // returns the backing array, which may hold stale bytes past getLength().
            String line = value.toString();
            String[] terms = line.split(SEPARATOR);
            if (terms.length != 2) {
                return;
            }
            outputKey.set(terms[0]);
            outputValue.set(Integer.valueOf(terms[1]));
            context.write(outputKey, outputValue);
        }
    }
    /**
     * Statistics combiner: pre-sums values for the same key on the map side
     * to cut down network traffic.
     */
    public static class StatisticsCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable outputValue = new IntWritable();

        @Override
        protected void reduce (Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            outputValue.set(sum);
            context.write(key, outputValue);
        }
    }
    /**
     * Statistics reducer: final per-key sum of the verification counters.
     */
    public static class StatisticsReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable outputValue = new IntWritable();

        @Override
        protected void reduce (Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            outputValue.set(sum);
            context.write(key, outputValue);
        }
    }
}
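For reference, a run of this driver would look something like the following (the jar name and paths are illustrative, not from the original setup): hadoop jar basic-data-verify.jar BasicDataVerifyJob -Dpath=/data/small_source -Ddate=20150720 -Doutput=/data/verify_output. GenericOptionsParser copies the -D definitions into the Configuration, which is why the driver can read them back with conf.get("path") and friends; and since the driver deletes a pre-existing output directory itself, re-runs need no manual cleanup.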