Partitioner Programming
Goal: place the results produced in the previous post into different partitions.
Why use partitioning?
Records can be stored separately by attribute, which makes group-by statistics much more convenient to compute.
For example, suppose we want to count SMS and call usage for every city in the country. People travel constantly for business and tourism; when, say, a Beijing resident visits Shanghai, their calls and messages are recorded at the base stations near them in Shanghai. Each city's records therefore end up scattered across the country, and queries become inefficient. With partitioning, all the SMS and traffic records belonging to one city can be gathered from across the country into a single partition, which is far more convenient. Below is a simple simulation of how partitioning is used.
To achieve this, all we need to do is add a Partitioner class to the code from the previous post.
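For reference, the mapper below expects tab-separated input lines in which field 1 is the phone number and fields 8 and 9 are the upload and download byte counts (this layout comes from the previous post's data file). A hypothetical input line might look like:

1363157985066	13726230503	00-FD-07-A4-72-B8:CMCC	120.196.100.82	i02.c.aliimg.com	24	27	2481	24681	200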
package cn.master1.hadoop.mr.dc;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class DataCount {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(DataCount.class);
job.setMapperClass(DCMapper.class);
/* When the map output types (k2, v2) match the final output types (k3, v3), this line and the next one can be omitted. */
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(DataBean.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
job.setReducerClass(DCReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DataBean.class);
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// Added: plug in the custom partitioner
job.setPartitionerClass(ProviderPartitioner.class);
// Added: the number of reduce tasks, taken from the command line, determines the number of output partitions
job.setNumReduceTasks(Integer.parseInt(args[2]));
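// Note: ProviderPartitioner returns partition numbers 0-3, so this job needs at
// least 4 reduce tasks. With exactly 1 reduce task, Hadoop bypasses the custom
// partitioner entirely; with 2 or 3, tasks fail with an "Illegal partition" error.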
job.waitForCompletion(true);
}
public static class DCMapper extends Mapper<LongWritable, Text, Text, DataBean>{
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, DataBean>.Context context)
throws IOException, InterruptedException {
String line = value.toString();
String[] fields = line.split("\t");
String telNo = fields[1]; // field 1: phone number
long up = Long.parseLong(fields[8]); // field 8: upload bytes
long down = Long.parseLong(fields[9]); // field 9: download bytes
DataBean bean = new DataBean(telNo, up, down);
context.write(new Text(telNo), bean);
}
}
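// For the hypothetical input line shown above, the mapper would emit the key
// "13726230503" with a DataBean carrying up=2481 and down=24681. The partitioner
// below decides which reducer receives each key; that reducer then sums the traffic.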
public static class DCReducer extends Reducer<Text, DataBean, Text, DataBean>{
@Override
protected void reduce(Text key, Iterable<DataBean> v2s, Context context)
throws IOException, InterruptedException {
long up_sum = 0;
long down_sum = 0;
for(DataBean bean : v2s){
up_sum += bean.getUpPayLoad();
down_sum += bean.getDownPayLoad();
}
DataBean bean = new DataBean("", up_sum, down_sum);
context.write(key, bean);
}
}
// Added: the custom partitioner class
public static class ProviderPartitioner extends Partitioner<Text, DataBean>{
// providerMap is initialized in a static block; static members execute top to bottom,
// so the map is fully populated before getPartition() is ever called.
private static Map<String, Integer> providerMap = new HashMap<String, Integer>();
// Map each phone-number prefix to a partition number (the grouping is illustrative)
static {
providerMap.put("135", 1);
providerMap.put("136", 1);
providerMap.put("137", 1);
providerMap.put("138", 1);
providerMap.put("139", 1);
providerMap.put("150", 2);
providerMap.put("159", 2);
providerMap.put("182", 3);
providerMap.put("183", 3);
}
@Override
public int getPartition(Text key, DataBean value, int numPartitions) {
String account = key.toString();
String sub_acc = account.substring(0, 3); // the first three digits of the number
Integer code = providerMap.get(sub_acc);
if (code == null) {
code = 0; // numbers with unrecognized prefixes fall back to partition 0
}
return code;
}
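// A quick trace of getPartition with numPartitions = 4:
//   "13726230503" -> prefix "137" -> partition 1
//   "18211575961" -> prefix "182" -> partition 3
//   "13480253104" -> prefix "134" -> not in the map -> partition 0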
}
// End of added content
}
The rest of the workflow is the same as in the previous post.
Note: the command used to launch the job is slightly different: hadoop jar /root/examples.jar cn.master1.hadoop.mr.dc.DataCount /data.txt /data-p 4. The trailing 4 is the value read as args[2], i.e. the number of reduce tasks.
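If the job completes successfully, the output directory should contain one part file per reduce task, typically along with a _SUCCESS marker file. A sketch of what to expect (exact file contents depend on your data):

/data-p/part-r-00000   (numbers with unrecognized prefixes)
/data-p/part-r-00001   (prefixes 135, 136, 137, 138, 139)
/data-p/part-r-00002   (prefixes 150, 159)
/data-p/part-r-00003   (prefixes 182, 183)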