MapReduce Join Operations

It is common for MapReduce to process two or more data sources, in which case the sources must be joined on some key. The general rules for a join are:
1) If the two data sources are of roughly the same size, perform the join directly with the DataJoin classes (a reduce-side join).
2) If the two data sources differ greatly in size, use the DistributedCache mechanism to replicate the smaller source to every node, cache it, and filter the larger source against it (a map-side join; a minimal sketch of this pattern follows). If the smaller source does not fit in the cache, first use DistributedCache to replicate only the keys of the smaller source to every node and cache them, filter the larger source against those keys, and then join the filtered result with the smaller source.
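
For the simple case of rule 2, where the smaller source does fit in memory, the map-side join pattern looks roughly like the sketch below. This is a minimal illustration rather than code from this post: the class name MapSideJoinMapper and the tab-separated field layout are assumptions, and the cache files are expected to have been registered on the Job beforehand (for example with job.setCacheFiles(...), as the real code later in this post does).

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Minimal map-side join sketch: the smaller source is loaded from the
 * distributed cache into memory and the larger source is streamed through map().
 */
public class MapSideJoinMapper extends Mapper<LongWritable, Text, Text, Text> {
    private static final String SEPARATOR = "\t";
    // join key -> value from the smaller data source, loaded once per map task
    private Map<String, String> smallSource = new HashMap<String, String>();
    private Text outputKey = new Text();
    private Text outputValue = new Text();

    @Override
    protected void setup (Context context) throws IOException {
        Path[] cacheFiles = context.getLocalCacheFiles();
        if (cacheFiles == null) {
            return;
        }
        for (Path cacheFile : cacheFiles) {
            BufferedReader reader = new BufferedReader(new FileReader(cacheFile.toString()));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    String[] terms = line.split(SEPARATOR);
                    if (terms.length >= 2) {
                        smallSource.put(terms[0], terms[1]);
                    }
                }
            } finally {
                reader.close();
            }
        }
    }

    @Override
    protected void map (LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] terms = value.toString().split(SEPARATOR);
        if (terms.length < 2) {
            return;
        }
        String joined = smallSource.get(terms[0]);
        if (joined != null) {
            // The key exists in the smaller source: emit the joined record
            outputKey.set(terms[0]);
            outputValue.set(terms[1] + SEPARATOR + joined);
            context.write(outputKey, outputValue);
        }
    }
}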

The problem I ran into this week was exactly the second case: the smaller data source could not fit in the cache (an OOM error was thrown).
The solution is to use two Jobs:
1) The first Job cuts the smaller data source down to key-only records.
2) The second Job uses two Mappers feeding one Reducer. The first Mapper uses the DistributedCache mechanism to replicate the key-only data to every node, caches it, and filters the large data source, emitting each record's unique ID as the output key. The second Mapper loads the smaller data source and also emits the unique ID as the output key. The Reducer then receives the records from both sources that share the same ID and performs its own processing on them.

Enough talk; here is the annotated code:

/**
 * Reverse-verification Job for basic user attribute data.
 * Created by luweijie@xiaomi.com on 15-7-20.
 */
public class BasicDataVerifyJob {
    private static Logger logger = LoggerFactory.getLogger(BasicDataVerifyJob.class);
    private static final String SEPARATOR = "\t";

    /**
     * Job entry point.
     * @param args command-line arguments parsed by GenericOptionsParser
     */
    public static void main (String args[]) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        new GenericOptionsParser(conf, args).getRemainingArgs();
        if (StringUtils.isBlank(conf.get("path")) || StringUtils.isBlank(conf.get("date")) ||
                StringUtils.isBlank(conf.get("output"))) {
            System.err.println("jvm args: -Dpath -Ddate -Doutput must be specified!");
            System.exit(2);
        }

        FileSystem fs = FileSystem.get(conf);
        String path = conf.get("path");
        if (!fs.exists(new Path(path))) {
            logger.error("input path not exists : " + path);
            System.exit(-1);
        }

        // output
        String output = conf.get("output");
        Path outputPath = new Path(output);
        if (!output.endsWith("/")) {
            output += "/";
        }
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        // Job 1: generate the mappingID (key-only) file from the smaller data source
        String mappingIDFileOutput = output + "mappingIDFile";
        MicloudMRJob job1 = MicloudMRJob.getInstance(conf, "Generate mappingID file: " + path + " --> " + mappingIDFileOutput);
        job1.setJarByClass(BasicDataVerifyJob.class);
        job1.setMapperClass(MappingIDGeneratorMapper.class);
        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(NullWritable.class);

        MultipleInputs.addInputPath(job1, new Path(path), TextInputFormat.class, MappingIDGeneratorMapper.class);
        job1.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job1, new Path(mappingIDFileOutput));

        // Once the mappingID file has been generated, filter the original data against it and verify
        if (job1.waitForCompletion(true)) {
            PathManager pathManager = PathManager.create(conf.get("date"));
            MicloudHDFSInputConfig inputConfig = new MicloudHDFSInputConfig("UserPublicInformation",
                    FilterMapper.class, BasicDataVerifyJob.class,
                    pathManager, Text.class, DataField.class);

            String verifyFile = output + "verify";
            OutputConfig outputConfig = new OutputConfig();
            outputConfig.setOutputPath(verifyFile);
            outputConfig.setOutputFormatClass(TextOutputFormat.class);
            outputConfig.setOutKeyClass(Text.class);
            outputConfig.setOutValueClass(IntWritable.class);
            outputConfig.setReducerClass(VerifyReducer.class);

            MicloudMRJob job2 = MicloudDataContext.getInstance().createJob(conf, inputConfig, outputConfig, "Verify Basic Data Job: " + pathManager.getOutputPath() + " --> " + verifyFile);
            MultipleInputs.addInputPath(job2, new Path(path), TextInputFormat.class, VerifyHDFSMapper.class);

            // Register the key-only files of the smaller data source as distributed cache files
            FileStatus[] fileStatuses = fs.listStatus(new Path(mappingIDFileOutput));
            URI[] uris = new URI[fileStatuses.length];
            for (int i = 0; i < fileStatuses.length; i++) {
                uris[i] = fileStatuses[i].getPath().toUri();
            }
            job2.setCacheFiles(uris);

            // Reducer Num
            job2.setNumReduceTasks(50);

            // Job 3: aggregate the verification results; the code below handles that
            if (job2.waitForCompletion(true)) {
                String resultFile = output + "result";

                MicloudMRJob job3 = MicloudMRJob.getInstance(conf, "Statistics Verify Data Job: " + verifyFile + " --> " + resultFile);
                job3.setJarByClass(BasicDataVerifyJob.class);
                job3.setMapperClass(StatisticsMapper.class);
                job3.setMapOutputKeyClass(Text.class);
                job3.setMapOutputValueClass(IntWritable.class);

                MultipleInputs.addInputPath(job3, new Path(verifyFile), TextInputFormat.class, StatisticsMapper.class);
                job3.setOutputFormatClass(TextOutputFormat.class);
                FileOutputFormat.setOutputPath(job3, new Path(resultFile));

                job3.setCombinerClass(StatisticsCombiner.class);

                job3.setReducerClass(StatisticsReducer.class);
                job3.setNumReduceTasks(3);

                job3.waitForCompletion(true);
            }
        }

    }

    /**
     * Mapper that generates the mappingID (key-only) file
     */
    public static class MappingIDGeneratorMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        private Text outputKey = new Text();

        @Override
        protected void map (LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = new String(value.copyBytes());
            String terms[] = line.split(SEPARATOR);
            if (terms == null || terms.length < 2) {
                return;
            }
            outputKey.set(terms[0]);
            context.write(outputKey, NullWritable.get());
        }
    }

    /**
     * Character encoding used when serializing StructuredData into a DataField value
     */
    private static final String CHARSET = "ISO-8859-1";

    /**
     * Filter Mapper: joins the large data source against the cached key set
     */
    public static class FilterMapper extends MicloudHDFSMapper<Text, DataField> {
        private static Set<String> joinData = new HashSet<String>();
        private Text outputKey = new Text();

        @Override
        protected void setup (Context context) {
            // Load the distributed cache files into joinData, an in-memory set local to each map task
            try {
                String line;
                Path[] cacheFiles = context.getLocalCacheFiles();
                if (cacheFiles != null && cacheFiles.length > 0) {
                    for (Path cacheFile : cacheFiles) {
                        BufferedReader joinReader = new BufferedReader(new FileReader(cacheFile.toString()));
                        try {
                            while ((line = joinReader.readLine()) != null) {
                                joinData.add(line);
                            }
                        } finally {
                            joinReader.close();
                        }
                    }
                }
            } catch (IOException e) {
                // Fail the task instead of silently exiting with status 0
                throw new RuntimeException("Failed to load distributed cache files", e);
            }
        }

        @Override
        public void persistMapOutput(Mapper.Context context, StructuredData data) throws IOException, InterruptedException {
            String mappingID = String.valueOf(data.getData("MappingID"));
            if (joinData.contains(mappingID)) {
                outputKey.set(mappingID);

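                // Java-serialize the StructuredData and carry it as an ISO-8859-1 string in the DataField value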
                ByteArrayOutputStream bos = new ByteArrayOutputStream();
                ObjectOutputStream out = new ObjectOutputStream(bos);
                out.writeObject(data);
                DataField outputValue = new DataField("A", bos.toString(CHARSET));
                context.write(outputKey, outputValue);

                out.close();
            }
        }
    }

    /**
     * Mapper for the smaller data source read from the HDFS input path
     */
    public static class VerifyHDFSMapper extends Mapper<LongWritable, Text, Text, DataField> {
        private Text outputKey = new Text();

        @Override
        protected void map (LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = new String(value.copyBytes());
            String terms[] = line.split(SEPARATOR);
            if (terms == null || terms.length < 2) {
                return;
            }

            outputKey.set(terms[0]);

            DataField outputValue = new DataField("B", terms[1]);

            context.write(outputKey, outputValue);
        }
    }

    /**
     * Verify Reducer
     */
    public static class VerifyReducer extends Reducer<Text, DataField, Text, IntWritable> {
        private Text outputKey = new Text();
        private IntWritable outputValue = new IntWritable();

        @Override
        protected void reduce (Text key, Iterable<DataField> values, Context context) throws IOException, InterruptedException {
            if (values != null) {
                StructuredData structuredData = null;
                String stringData = null;
                for (DataField value : values) {
                    if ("A".equalsIgnoreCase(value.getKey())) {
                        ByteArrayInputStream bis = new ByteArrayInputStream(value.getValue().getBytes(CHARSET));
                        ObjectInputStream in = new ObjectInputStream(bis);
                        try {
                            structuredData = (StructuredData) in.readObject();
                        } catch (ClassNotFoundException e) {
                            e.printStackTrace();
                        }

                        in.close();
                    } else if ("B".equalsIgnoreCase(value.getKey())) {
                        stringData = value.getValue();
                    }
                }
                Map<String, Integer> resultMap = VerifyUtil.verifyBasicData(stringData, structuredData);
                if (resultMap != null) {
                    Iterator<Map.Entry<String, Integer>> it = resultMap.entrySet().iterator();
                    while (it.hasNext()) {
                        Map.Entry<String, Integer> entry = it.next();
                        outputKey.set(entry.getKey());
                        outputValue.set(entry.getValue());

                        context.write(outputKey, outputValue);
                    }
                }
            }
        }
    }

    /**
     * Statistics Mapper: parses the verification output into <key, count> pairs
     */
    public static class StatisticsMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private Text outputKey = new Text();
        private IntWritable outputValue = new IntWritable();

        @Override
        protected void map (LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] terms = line.split(SEPARATOR);
            if (terms.length != 2) {
                return;
            }

            outputKey.set(terms[0]);
            outputValue.set(Integer.valueOf(terms[1]));
            context.write(outputKey, outputValue);
        }
    }

    /**
     * Statistics Combiner: merges values for the same key locally to reduce the data shuffled over the network
     */
    public static class StatisticsCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable outputValue = new IntWritable();

        @Override
        protected void reduce (Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }

            outputValue.set(sum);
            context.write(key, outputValue);
        }
    }

    /**
     * Statistics Reducer: sums the counts for each key
     */
    public static class StatisticsReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable outputValue = new IntWritable();

        @Override
        protected void reduce (Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }

            outputValue.set(sum);
            context.write(key, outputValue);
        }
    }
}
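
For reference, a job built from this class would be launched roughly along these lines; the jar name, the package prefix, the date format, and the HDFS paths below are placeholders, not values from this post:

hadoop jar basic-data-verify.jar com.example.BasicDataVerifyJob \
    -Dpath=/user/example/small_source \
    -Ddate=2015-07-20 \
    -Doutput=/user/example/basic_data_verify_output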