Partitioner Programming
Goal: place the results produced in the previous post into different partitions.
Why use partitioning?
Records can be stored separately by attribute, which makes group-by statistics much more convenient to compute.
For example, suppose we want to count SMS and call usage for every city in the country. People travel constantly for business and tourism; when, say, a Beijing resident visits Shanghai, their calls and messages are recorded at the base stations near them in Shanghai. Each city's records therefore end up scattered across the country, and queries become inefficient. With partitioning, all the SMS and traffic records belonging to one city can be gathered from across the country into a single partition, which is far more convenient. Below is a simple simulation of how partitioning is used.
To achieve this, all we need to do is add a Partitioner class to the code from the previous post.
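For reference, the mapper below expects tab-separated input lines in which field 1 is the phone number and fields 8 and 9 are the upload and download byte counts (this layout comes from the previous post's data file). A hypothetical input line might look like:

1363157985066	13726230503	00-FD-07-A4-72-B8:CMCC	120.196.100.82	i02.c.aliimg.com	24	27	2481	24681	200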
package cn.master1.hadoop.mr.dc;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class DataCount {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(DataCount.class);
job.setMapperClass(DCMapper.class);
/* When the map output types (k2, v2) match the final output types (k3, v3), this line and the next one can be omitted. */
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(DataBean.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
job.setReducerClass(DCReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DataBean.class);
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// Added: plug in the custom partitioner
job.setPartitionerClass(ProviderPartitioner.class);
// Added: the number of reduce tasks, taken from the command line, determines the number of output partitions
job.setNumReduceTasks(Integer.parseInt(args[2]));
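// Note: ProviderPartitioner returns partition numbers 0-3, so this job needs at
// least 4 reduce tasks. With exactly 1 reduce task, Hadoop bypasses the custom
// partitioner entirely; with 2 or 3, tasks fail with an "Illegal partition" error.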
job.waitForCompletion(true);
}
public static class DCMapper extends Mapper<LongWritable, Text, Text, DataBean>{
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, DataBean>.Context context)
throws IOException, InterruptedException {
String line = value.toString();
String[] fields = line.split("\t");
String telNo = fields[1]; // field 1: phone number
long up = Long.parseLong(fields[8]); // field 8: upload bytes
long down = Long.parseLong(fields[9]); // field 9: download bytes
DataBean bean = new DataBean(telNo, up, down);
context.write(new Text(telNo), bean);
}
}
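// For the hypothetical input line shown above, the mapper would emit the key
// "13726230503" with a DataBean carrying up=2481 and down=24681. The partitioner
// below decides which reducer receives each key; that reducer then sums the traffic.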
public static class DCReducer extends Reducer<Text, DataBean, Text, DataBean>{
@Override
protected void reduce(Text key, Iterable<DataBean> v2s, Context context)
throws IOException, InterruptedException {
long up_sum = 0;
long down_sum = 0;
for(DataBean bean : v2s){
up_sum += bean.getUpPayLoad();
down_sum += bean.getDownPayLoad();
}
DataBean bean = new DataBean("", up_sum, down_sum);
context.write(key, bean);
}
}
// Added: the custom partitioner class
public static class ProviderPartitioner extends Partitioner<Text, DataBean>{
// providerMap is initialized in a static block; static members execute top to bottom,
// so the map is fully populated before getPartition() is ever called.
private static Map<String, Integer> providerMap = new HashMap<String, Integer>();
// Map each phone-number prefix to a partition number (the grouping is illustrative)
static {
providerMap.put("135", 1);
providerMap.put("136", 1);
providerMap.put("137", 1);
providerMap.put("138", 1);
providerMap.put("139", 1);
providerMap.put("150", 2);
providerMap.put("159", 2);
providerMap.put("182", 3);
providerMap.put("183", 3);
}
@Override
public int getPartition(Text key, DataBean value, int numPartitions) {
String account = key.toString();
String sub_acc = account.substring(0, 3); // the first three digits of the number
Integer code = providerMap.get(sub_acc);
if (code == null) {
code = 0; // numbers with unrecognized prefixes fall back to partition 0
}
return code;
}
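// A quick trace of getPartition with numPartitions = 4:
//   "13726230503" -> prefix "137" -> partition 1
//   "18211575961" -> prefix "182" -> partition 3
//   "13480253104" -> prefix "134" -> not in the map -> partition 0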
}
// End of added content
}
The rest of the workflow is the same as in the previous post.
Note: the command used to launch the job is slightly different: hadoop jar /root/examples.jar cn.master1.hadoop.mr.dc.DataCount /data.txt /data-p 4. The trailing 4 is the value read as args[2], i.e. the number of reduce tasks.
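If the job completes successfully, the output directory should contain one part file per reduce task, typically along with a _SUCCESS marker file. A sketch of what to expect (exact file contents depend on your data):

/data-p/part-r-00000   (numbers with unrecognized prefixes)
/data-p/part-r-00001   (prefixes 135, 136, 137, 138, 139)
/data-p/part-r-00002   (prefixes 150, 159)
/data-p/part-r-00003   (prefixes 182, 183)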