The idea: each map task keeps only its local top-N records and sends them to the reducer, and only one reducer is configured. So with 10 map tasks and N = 10, each map trims its input down to a 10-record top-N list, the 10 maps together send at most 100 records, and the reducer filters those down to the final 10.
Every Mapper has a setup method that runs before processing starts and a cleanup method that runs after it finishes. They are usually left unused; each is invoked exactly once per mapper, at the very beginning or the very end of the task. In the algorithm above, map() does the per-record filtering, and cleanup() hands the surviving records over to the reducer in one batch.
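As a minimal sketch of that lifecycle (this class is illustrative only, not part of the program below):

public static class LifecycleMapper extends Mapper<Object, Text, Text, Text> {

	@Override
	protected void setup(Context context) {
		// Called exactly once, before the first map() call of this task.
	}

	@Override
	public void map(Object key, Text value, Context context) {
		// Called once for every input record.
	}

	@Override
	protected void cleanup(Context context) {
		// Called exactly once, after the last map() call of this task.
	}
}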
The reducer works the same way: if 10 maps each send their top 10, the single reducer receives up to 100 records, filters them all at once in its cleanup(), and writes the output. This is also why exactly one reducer is set: the global top N can only be computed in one place.
The raw data looks like this:
30;"unemployed";"married";"primary";"no";1787;"no";"no";"cellular";19;"oct";79;1;-1;0;"unknown";"no" 33;"services";"married";"secondary";"no";4789;"yes";"yes";"cellular";11;"may";220;1;339;4;"failure";"no" 35;"management";"single";"tertiary";"no";1350;"yes";"no";"cellular";16;"apr";185;1;330;1;"failure";"no" 30;"management";"married";"tertiary";"no";1476;"yes";"yes";"unknown";3;"jun";199;4;-1;0;"unknown";"no" 59;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";226;1;-1;0;"unknown";"no" 35;"management";"single";"tertiary";"no";747;"no";"no";"cellular";23;"feb";141;2;176;3;"failure";"no" 36;"self-employed";"married";"tertiary";"no";307;"yes";"no";"cellular";14;"may";341;1;330;2;"other";"no" 39;"technician";"married";"secondary";"no";147;"yes";"no";"cellular";6;"may";151;2;-1;0;"unknown";"no" 41;"entrepreneur";"married";"tertiary";"no";221;"yes";"no";"unknown";14;"may";57;2;-1;0;"unknown";"no" 43;"services";"married";"primary";"no";-88;"yes";"yes";"cellular";17;"apr";313;1;147;2;"failure";"no" 39;"services";"married";"secondary";"no";9374;"yes";"no";"unknown";20;"may";273;1;-1;0;"unknown";"no" 43;"admin.";"married";"secondary";"no";264;"yes";"no";"cellular";17;"apr";113;2;-1;0;"unknown";"no" 36;"technician";"married";"tertiary";"no";1109;"no";"no";"cellular";13;"aug";328;2;-1;0;"unknown";"no" 20;"student";"single";"secondary";"no";502;"no";"no";"cellular";30;"apr";261;1;-1;0;"unknown";"yes" 31;"blue-collar";"married";"secondary";"no";360;"yes";"yes";"cellular";29;"jan";89;1;241;1;"failure";"no" 40;"management";"married";"tertiary";"no";194;"no";"yes";"cellular";29;"aug";189;2;-1;0;"unknown";"no" 56;"technician";"married";"secondary";"no";4073;"no";"no";"cellular";27;"aug";239;5;-1;0;"unknown";"no" 37;"admin.";"single";"tertiary";"no";2317;"yes";"no";"cellular";20;"apr";114;1;152;2;"failure";"no" 25;"blue-collar";"single";"primary";"no";-221;"yes";"no";"unknown";23;"may";250;1;-1;0;"unknown";"no" 31;"services";"married";"secondary";"no";132;"no";"no";"cellular";7;"jul";148;1;152;1;"other";"no" 38;"management";"divorced";"unknown";"no";0;"yes";"no";"cellular";18;"nov";96;2;-1;0;"unknown";"no" 42;"management";"divorced";"tertiary";"no";16;"no";"no";"cellular";19;"nov";140;3;-1;0;"unknown";"no" 44;"services";"single";"secondary";"no";106;"no";"no";"unknown";12;"jun";109;2;-1;0;"unknown";"no".............................. 上面只是数据样例,实际还有很多行.
The implementation code is as follows:
package com.isesol.mapreduce;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class topN {

	public static class TokenizerMapper extends Mapper<Object, Text, Text, Text> {

		// Per-mapper top-N buffer. A TreeMap keeps its entries sorted by key,
		// so firstKey() is always the smallest. The key is the "age" field
		// compared as a string, which matches numeric order here because all
		// ages in this data set have two digits.
		private SortedMap<String, String> top10 = new TreeMap<String, String>();

		@Override
		public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
			String str = "";
			String[] val = value.toString().split(";");
			// Concatenate every field except the first (age) and the last.
			for (int i = 1; i < val.length - 1; i++) {
				str += val[i];
			}
			top10.put(val[0], str);
			// Keep only the 10 largest keys; a record with a duplicate age
			// overwrites the previous one (duplicates are discussed below).
			if (top10.size() > 10) {
				top10.remove(top10.firstKey());
			}
		}

		@Override
		public void cleanup(Context context) throws IOException, InterruptedException {
			// Runs once at the end of the map task: emit the surviving top 10.
			Iterator<Map.Entry<String, String>> iterator = top10.entrySet().iterator();
			while (iterator.hasNext()) {
				Map.Entry<String, String> ent = iterator.next();
				context.write(new Text(ent.getKey()), new Text(ent.getValue()));
			}
		}
	}
	public static class IntSumReducer extends Reducer<Text, Text, Text, Text> {

		// Global top-N buffer, fed with the candidates from every mapper.
		private SortedMap<String, String> top10 = new TreeMap<String, String>();

		@Override
		public void reduce(Text key, Iterable<Text> value, Context context) throws IOException, InterruptedException {
			// Only collect here; the actual filtering happens once, in cleanup().
			for (Text val : value) {
				top10.put(key.toString(), val.toString());
			}
		}

		@Override
		public void cleanup(Context context) throws IOException, InterruptedException {
			System.out.println("top10 size before trimming: " + top10.size());
			// Drop the smallest keys until only the 10 largest remain.
			while (top10.size() > 10) {
				top10.remove(top10.firstKey());
			}
			System.out.println("top10 size after trimming: " + top10.size());
			Iterator<Map.Entry<String, String>> iterator = top10.entrySet().iterator();
			while (iterator.hasNext()) {
				Map.Entry<String, String> ent = iterator.next();
				context.write(new Text(ent.getKey()), new Text(ent.getValue()));
			}
		}
	}
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, "topN");
		job.setJarByClass(topN.class);
		job.setMapperClass(TokenizerMapper.class);
		job.setReducerClass(IntSumReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		// Exactly one reducer, so the global top N is computed in one place.
		job.setNumReduceTasks(1);
		// job.setPartitionerClass(twopartitions.class);
		// job.setOutputFormatClass(fakeOutPutFormat.class);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
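Assuming the class is packaged into a jar (topn.jar is a placeholder name), the job takes the input and output paths as its two arguments:

hadoop jar topn.jar com.isesol.mapreduce.topN <input path> <output path>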
The output is as follows:
76 "retired""married""primary""no"2590"no""no""telephone"9"feb"6812-10"unknown" 77 "retired""married""primary""no"680"no""no""telephone"27"nov"3414943"failure" 78 "retired""married""tertiary""no"226"no""no""telephone"6"nov"1361-10"unknown" 79 "retired""divorced""unknown""no"2628"no""no""telephone"8"jul"22074502"failure" 80 "housemaid""married""primary""no"0"no""no""cellular"23"feb"63911891"failure" 81 "retired""married""secondary""no"1"no""no""cellular"19"aug"655-10"unknown" 83 "retired""divorced""primary""no"1097"no""no""telephone"5"mar"1811-10"unknown" 84 "retired""divorced""primary""no"639"no""no""telephone"18"may"3533-10"unknown" 86 "retired""married""secondary""no"1503"no""no""telephone"18"mar"16531011"other" 87 "retired""married""primary""no"230"no""no""cellular"30"oct"1441-10"unknown"上面没有考虑有重复数据,如果需要考虑重复数据也很好处理,稍微加工一下即可。
The top N above is hard-coded. In real programs N is usually not fixed: if you want the top 100 you should not have to edit and recompile the code, so passing N in as a variable is the best approach. MapReduce sets such variables on the job configuration with conf.set(name, value) and reads them back with get.
conf.set("topn", args[2]);
topn = context.getConfiguration().get("topn");
if(topn == null) {
topn = "10";
}
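Put together, a minimal sketch of the parameterized mapper (assumptions: the reducer is changed the same way, and Configuration.get(name, defaultValue) is used so the null check above becomes unnecessary):

public static class TokenizerMapper extends Mapper<Object, Text, Text, Text> {

	private SortedMap<String, String> topN = new TreeMap<String, String>();
	private int n;

	@Override
	protected void setup(Context context) {
		// Runs once, before the first map() call: read N from the job
		// configuration, falling back to 10 when it is not set.
		n = Integer.parseInt(context.getConfiguration().get("topn", "10"));
	}

	@Override
	public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
		String str = "";
		String[] val = value.toString().split(";");
		for (int i = 1; i < val.length - 1; i++) {
			str += val[i];
		}
		topN.put(val[0], str);
		if (topN.size() > n) { // the only change in map(): n instead of 10
			topN.remove(topN.firstKey());
		}
	}

	// cleanup() stays exactly as in the version above.
}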