It is common for a MapReduce program to process two or more data sources, in which case the sources have to be joined on some key. The ground rules for joins are:
1) If the two data sources are roughly the same size, perform the join directly with the DataJoin classes (a reduce-side join).
2) If the two sources differ greatly in size, use the Distributed Cache to replicate the smaller source to every node and cache it there, then filter the larger source against it. If the smaller source does not fit in the cache, distribute only its keys through the Distributed Cache instead, filter the larger source with that key set, and then join the filtered result with the smaller source. A minimal sketch of the basic cached-join form follows this list.
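To make case 2 concrete, here is a minimal sketch of the cached (map-side) join, independent of the job shown later in this post. The class name, the tab-separated key/value layout of the small file, and the inner-join semantics are all illustrative assumptions, not taken from the original job:

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Cached (map-side) join: the small source is replicated to every node via the
// Distributed Cache and loaded into memory in setup(); the large source then
// streams through map() and is joined without a reduce phase.
public class ReplicatedJoinMapper extends Mapper<LongWritable, Text, Text, Text> {

    private final Map<String, String> smallTable = new HashMap<String, String>();
    private final Text outputKey = new Text();
    private final Text outputValue = new Text();

    @Override
    protected void setup(Context context) throws IOException {
        // Files registered in the driver with job.setCacheFiles(...) show up
        // here as local paths on each node.
        Path[] cacheFiles = context.getLocalCacheFiles();
        if (cacheFiles == null) {
            return;
        }
        for (Path cacheFile : cacheFiles) {
            BufferedReader reader = new BufferedReader(new FileReader(cacheFile.toString()));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    String[] terms = line.split("\t", 2); // assumed layout: key TAB value
                    if (terms.length == 2) {
                        smallTable.put(terms[0], terms[1]);
                    }
                }
            } finally {
                reader.close();
            }
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] terms = value.toString().split("\t", 2);
        if (terms.length != 2) {
            return;
        }
        String matched = smallTable.get(terms[0]);
        if (matched != null) { // inner join: records without a match are dropped
            outputKey.set(terms[0]);
            outputValue.set(terms[1] + "\t" + matched);
            context.write(outputKey, outputValue);
        }
    }
}

Because the join happens entirely in map(), no reducer is needed; the catch is that the whole small table must fit in each task's heap, which is exactly the limit the rest of this post works around.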
The problem I ran into this week was exactly the second situation: the smaller data source could not be held in the cache (an OOM error was thrown).
The solution is to use two jobs:
The first job strips the smaller data source down to a keys-only dataset.
In the second job, one mapper distributes that keys-only file to every node through the Distributed Cache and uses it to filter the larger source, emitting each record's unique ID as the output key; the other mapper loads the smaller source and emits the same unique ID as its key. The reducer then receives the records from both sources that share an ID and performs its own processing.
Enough talk; here is the code, with comments:
// Imports for the standard Hadoop and JDK classes used below. The Micloud* classes,
// PathManager, OutputConfig, DataField, StructuredData and VerifyUtil are internal
// library classes; their imports are omitted here.
import org.apache.commons.lang.StringUtils; // org.apache.commons.lang3 also works
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.net.URI;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
 * Reverse-verification job for basic user-attribute data.
 * Created by luweijie@xiaomi.com on 15-7-20.
 */
public class BasicDataVerifyJob {

    private static Logger logger = LoggerFactory.getLogger(BasicDataVerifyJob.class);

    private static final String SEPARATOR = "\t";
    /**
     * Job entry point.
     * @param args command-line arguments; -Dpath, -Ddate and -Doutput are required
     */
    public static void main (String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        // Parses the generic -D options into conf; the remaining args are not used.
        new GenericOptionsParser(conf, args).getRemainingArgs();
        if (StringUtils.isBlank(conf.get("path")) || StringUtils.isBlank(conf.get("date")) ||
                StringUtils.isBlank(conf.get("output"))) {
            System.err.println("jvm args: -Dpath -Ddate -Doutput must be specified!");
            System.exit(2);
        }
        FileSystem fs = FileSystem.get(conf);
        String path = conf.get("path");
        if (!fs.exists(new Path(path))) {
            logger.error("input path not exists : " + path);
            System.exit(-1);
        }
        // Output directory: ensure a trailing slash and remove any previous run's output.
        String output = conf.get("output");
        Path outputPath = new Path(output);
        if (!output.endsWith("/")) {
            output += "/";
        }
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        // Job 1: generate the mappingID (keys-only) file from the smaller source.
        String mappingIDFileOutput = output + "mappingIDFile";
        MicloudMRJob job1 = MicloudMRJob.getInstance(conf, "Generate mappingID file: " + path + " --> " + mappingIDFileOutput);
        job1.setJarByClass(BasicDataVerifyJob.class);
        job1.setMapperClass(MappingIDGeneratorMapper.class);
        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(NullWritable.class);
        MultipleInputs.addInputPath(job1, new Path(path), TextInputFormat.class, MappingIDGeneratorMapper.class);
        job1.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job1, new Path(mappingIDFileOutput));
        // Once the mappingIDFile has been generated successfully, filter the original
        // data against it and run the comparison/verification.
        if (job1.waitForCompletion(true)) {
            PathManager pathManager = PathManager.create(conf.get("date"));
            MicloudHDFSInputConfig inputConfig = new MicloudHDFSInputConfig("UserPublicInformation",
                    FilterMapper.class, BasicDataVerifyJob.class,
                    pathManager, Text.class, DataField.class);
            String verifyFile = output + "verify";
            OutputConfig outputConfig = new OutputConfig();
            outputConfig.setOutputPath(verifyFile);
            outputConfig.setOutputFormatClass(TextOutputFormat.class);
            outputConfig.setOutKeyClass(Text.class);
            outputConfig.setOutValueClass(IntWritable.class);
            outputConfig.setReducerClass(VerifyReducer.class);
            MicloudMRJob job2 = MicloudDataContext.getInstance().createJob(conf, inputConfig, outputConfig, "Verify Basic Data Job: " + pathManager.getOutputPath() + " --> " + verifyFile);
            MultipleInputs.addInputPath(job2, new Path(path), TextInputFormat.class, VerifyHDFSMapper.class);
            // Register the keys-only output of job 1 as Distributed Cache files for job 2.
            FileStatus[] fileStatuses = fs.listStatus(new Path(mappingIDFileOutput));
            URI[] uris = new URI[fileStatuses.length];
            for (int i = 0; i < fileStatuses.length; i++) {
                uris[i] = fileStatuses[i].getPath().toUri();
            }
            job2.setCacheFiles(uris);
            // Number of reducers.
            job2.setNumReduceTasks(50);
            // Job 3: aggregate the verification results produced by job 2.
            if (job2.waitForCompletion(true)) {
                String resultFile = output + "result";
                MicloudMRJob job3 = MicloudMRJob.getInstance(conf, "Statistics Verify Data Job: " + verifyFile + " --> " + resultFile);
                job3.setJarByClass(BasicDataVerifyJob.class);
                job3.setMapperClass(StatisticsMapper.class);
                job3.setMapOutputKeyClass(Text.class);
                job3.setMapOutputValueClass(IntWritable.class);
                MultipleInputs.addInputPath(job3, new Path(verifyFile), TextInputFormat.class, StatisticsMapper.class);
                job3.setOutputFormatClass(TextOutputFormat.class);
                FileOutputFormat.setOutputPath(job3, new Path(resultFile));
                job3.setCombinerClass(StatisticsCombiner.class);
                job3.setReducerClass(StatisticsReducer.class);
                job3.setNumReduceTasks(3);
                job3.waitForCompletion(true);
            }
        }
    }
    /**
     * Mapper that produces the mappingID (keys-only) file.
     */
    public static class MappingIDGeneratorMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

        private Text outputKey = new Text();

        @Override
        protected void map (LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] terms = line.split(SEPARATOR);
            if (terms.length < 2) {
                return;
            }
            // The first field is the record's unique mappingID; emit it with no value.
            outputKey.set(terms[0]);
            context.write(outputKey, NullWritable.get());
        }
    }
    /**
     * Charset used to round-trip serialized object bytes through a String.
     * ISO-8859-1 maps every byte to exactly one char and back, so no bytes are lost.
     */
    private static final String CHARSET = "ISO-8859-1";
    /**
     * Filter mapper: keeps only records whose mappingID appears in the cached key set.
     */
    public static class FilterMapper extends MicloudHDFSMapper<Text, DataField> {

        private static Set<String> joinData = new HashSet<String>();
        private Text outputKey = new Text();

        @Override
        protected void setup (Context context) {
            // Load the Distributed Cache files into the in-memory key set on each map task.
            try {
                String line = null;
                Path[] cacheFiles = context.getLocalCacheFiles();
                if (cacheFiles != null && cacheFiles.length > 0) {
                    for (int i = 0; i < cacheFiles.length; i++) {
                        BufferedReader joinReader = new BufferedReader(new FileReader(cacheFiles[i].toString()));
                        try {
                            while ((line = joinReader.readLine()) != null) {
                                joinData.add(line);
                            }
                        } finally {
                            joinReader.close();
                        }
                    }
                }
            } catch (IOException e) {
                // Without the key set the join would silently produce nothing, so fail the task.
                throw new RuntimeException("Failed to load distributed cache files", e);
            }
        }
        @Override
        public void persistMapOutput(Mapper.Context context, StructuredData data) throws IOException, InterruptedException {
            String mappingID = String.valueOf(data.getData("MappingID"));
            if (joinData.contains(mappingID)) {
                outputKey.set(mappingID);
                // Serialize the StructuredData object and carry its bytes inside a String
                // (ISO-8859-1 keeps them intact); the "A" tag marks this source.
                ByteArrayOutputStream bos = new ByteArrayOutputStream();
                ObjectOutputStream out = new ObjectOutputStream(bos);
                out.writeObject(data);
                out.close(); // close (flush) before reading the buffer
                DataField outputValue = new DataField("A", bos.toString(CHARSET));
                context.write(outputKey, outputValue);
            }
        }
    }
    /**
     * Mapper for the records coming from the HDFS input file.
     */
    public static class VerifyHDFSMapper extends Mapper<LongWritable, Text, Text, DataField> {

        private Text outputKey = new Text();

        @Override
        protected void map (LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] terms = line.split(SEPARATOR);
            if (terms.length < 2) {
                return;
            }
            // The "B" tag marks records from this source; the key is the unique mappingID.
            outputKey.set(terms[0]);
            DataField outputValue = new DataField("B", terms[1]);
            context.write(outputKey, outputValue);
        }
    }
    /**
     * Verify reducer: receives the "A" and "B" records that share a mappingID
     * and compares them.
     */
    public static class VerifyReducer extends Reducer<Text, DataField, Text, IntWritable> {

        private Text outputKey = new Text();
        private IntWritable outputValue = new IntWritable();

        @Override
        protected void reduce (Text key, Iterable<DataField> values, Context context) throws IOException, InterruptedException {
            StructuredData structuredData = null;
            String stringData = null;
            for (DataField value : values) {
                if ("A".equalsIgnoreCase(value.getKey())) {
                    // Deserialize the StructuredData object from the ISO-8859-1 byte string.
                    ByteArrayInputStream bis = new ByteArrayInputStream(value.getValue().getBytes(CHARSET));
                    ObjectInputStream in = new ObjectInputStream(bis);
                    try {
                        structuredData = (StructuredData) in.readObject();
                    } catch (ClassNotFoundException e) {
                        e.printStackTrace();
                    }
                    in.close();
                } else if ("B".equalsIgnoreCase(value.getKey())) {
                    stringData = value.getValue();
                }
            }
            // Compare the two representations; the map holds one counter per check.
            Map<String, Integer> resultMap = VerifyUtil.verifyBasicData(stringData, structuredData);
            if (resultMap != null) {
                for (Map.Entry<String, Integer> entry : resultMap.entrySet()) {
                    outputKey.set(entry.getKey());
                    outputValue.set(entry.getValue());
                    context.write(outputKey, outputValue);
                }
            }
        }
    }
    /**
     * Statistics mapper: re-emits each (check, count) pair from the verify output.
     */
    public static class StatisticsMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private Text outputKey = new Text();
        private IntWritable outputValue = new IntWritable();

        @Override
        protected void map (LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // value.toString() rather than new String(value.getBytes()): getBytes()
            // returns the backing array, which may hold stale bytes past getLength().
            String line = value.toString();
            String[] terms = line.split(SEPARATOR);
            if (terms.length != 2) {
                return;
            }
            outputKey.set(terms[0]);
            outputValue.set(Integer.valueOf(terms[1]));
            context.write(outputKey, outputValue);
        }
    }
    /**
     * Statistics combiner: pre-sums values for the same key on the map side
     * to cut down network traffic.
     */
    public static class StatisticsCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable outputValue = new IntWritable();

        @Override
        protected void reduce (Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            outputValue.set(sum);
            context.write(key, outputValue);
        }
    }
    /**
     * Statistics reducer: final per-key sum of the verification counters.
     */
    public static class StatisticsReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable outputValue = new IntWritable();

        @Override
        protected void reduce (Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            outputValue.set(sum);
            context.write(key, outputValue);
        }
    }
}
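For reference, a run of this driver would look something like the following (the jar name and paths are illustrative, not from the original setup): hadoop jar basic-data-verify.jar BasicDataVerifyJob -Dpath=/data/small_source -Ddate=20150720 -Doutput=/data/verify_output. GenericOptionsParser copies the -D definitions into the Configuration, which is why the driver can read them back with conf.get("path") and friends; and since the driver deletes a pre-existing output directory itself, re-runs need no manual cleanup.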