The idea: each map task keeps only its local top-N records and sends them to the reducer, and only one reducer is configured. So with 10 map tasks and N = 10, each map trims its input down to a 10-record top-N list, the 10 maps together send at most 100 records, and the reducer filters those down to the final 10.
Every Mapper has a setup method that runs before processing starts and a cleanup method that runs after it finishes. They are usually left unused; each is invoked exactly once per mapper, at the very beginning or the very end of the task. In the algorithm above, map() does the per-record filtering, and cleanup() hands the surviving records over to the reducer in one batch.
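As a minimal sketch of that lifecycle (this class is illustrative only, not part of the program below):

public static class LifecycleMapper extends Mapper<Object, Text, Text, Text> {

	@Override
	protected void setup(Context context) {
		// Called exactly once, before the first map() call of this task.
	}

	@Override
	public void map(Object key, Text value, Context context) {
		// Called once for every input record.
	}

	@Override
	protected void cleanup(Context context) {
		// Called exactly once, after the last map() call of this task.
	}
}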
The reducer works the same way: if 10 maps each send their top 10, the single reducer receives up to 100 records, filters them all at once in its cleanup(), and writes the output. This is also why exactly one reducer is set: the global top N can only be computed in one place.
The raw data looks like this:
30;"unemployed";"married";"primary";"no";1787;"no";"no";"cellular";19;"oct";79;1;-1;0;"unknown";"no" 33;"services";"married";"secondary";"no";4789;"yes";"yes";"cellular";11;"may";220;1;339;4;"failure";"no" 35;"management";"single";"tertiary";"no";1350;"yes";"no";"cellular";16;"apr";185;1;330;1;"failure";"no" 30;"management";"married";"tertiary";"no";1476;"yes";"yes";"unknown";3;"jun";199;4;-1;0;"unknown";"no" 59;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";226;1;-1;0;"unknown";"no" 35;"management";"single";"tertiary";"no";747;"no";"no";"cellular";23;"feb";141;2;176;3;"failure";"no" 36;"self-employed";"married";"tertiary";"no";307;"yes";"no";"cellular";14;"may";341;1;330;2;"other";"no" 39;"technician";"married";"secondary";"no";147;"yes";"no";"cellular";6;"may";151;2;-1;0;"unknown";"no" 41;"entrepreneur";"married";"tertiary";"no";221;"yes";"no";"unknown";14;"may";57;2;-1;0;"unknown";"no" 43;"services";"married";"primary";"no";-88;"yes";"yes";"cellular";17;"apr";313;1;147;2;"failure";"no" 39;"services";"married";"secondary";"no";9374;"yes";"no";"unknown";20;"may";273;1;-1;0;"unknown";"no" 43;"admin.";"married";"secondary";"no";264;"yes";"no";"cellular";17;"apr";113;2;-1;0;"unknown";"no" 36;"technician";"married";"tertiary";"no";1109;"no";"no";"cellular";13;"aug";328;2;-1;0;"unknown";"no" 20;"student";"single";"secondary";"no";502;"no";"no";"cellular";30;"apr";261;1;-1;0;"unknown";"yes" 31;"blue-collar";"married";"secondary";"no";360;"yes";"yes";"cellular";29;"jan";89;1;241;1;"failure";"no" 40;"management";"married";"tertiary";"no";194;"no";"yes";"cellular";29;"aug";189;2;-1;0;"unknown";"no" 56;"technician";"married";"secondary";"no";4073;"no";"no";"cellular";27;"aug";239;5;-1;0;"unknown";"no" 37;"admin.";"single";"tertiary";"no";2317;"yes";"no";"cellular";20;"apr";114;1;152;2;"failure";"no" 25;"blue-collar";"single";"primary";"no";-221;"yes";"no";"unknown";23;"may";250;1;-1;0;"unknown";"no" 31;"services";"married";"secondary";"no";132;"no";"no";"cellular";7;"jul";148;1;152;1;"other";"no" 38;"management";"divorced";"unknown";"no";0;"yes";"no";"cellular";18;"nov";96;2;-1;0;"unknown";"no" 42;"management";"divorced";"tertiary";"no";16;"no";"no";"cellular";19;"nov";140;3;-1;0;"unknown";"no" 44;"services";"single";"secondary";"no";106;"no";"no";"unknown";12;"jun";109;2;-1;0;"unknown";"no".............................. 上面只是数据样例,实际还有很多行.
The implementation code is as follows:
package com.isesol.mapreduce;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class topN {

	public static class TokenizerMapper extends Mapper<Object, Text, Text, Text> {

		// Per-mapper top-N buffer. A TreeMap keeps its entries sorted by key,
		// so firstKey() is always the smallest. The key is the "age" field
		// compared as a string, which matches numeric order here because all
		// ages in this data set have two digits.
		private SortedMap<String, String> top10 = new TreeMap<String, String>();

		@Override
		public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
			String str = "";
			String[] val = value.toString().split(";");
			// Concatenate every field except the first (age) and the last.
			for (int i = 1; i < val.length - 1; i++) {
				str += val[i];
			}
			top10.put(val[0], str);
			// Keep only the 10 largest keys; a record with a duplicate age
			// overwrites the previous one (duplicates are discussed below).
			if (top10.size() > 10) {
				top10.remove(top10.firstKey());
			}
		}

		@Override
		public void cleanup(Context context) throws IOException, InterruptedException {
			// Runs once at the end of the map task: emit the surviving top 10.
			Iterator<Map.Entry<String, String>> iterator = top10.entrySet().iterator();
			while (iterator.hasNext()) {
				Map.Entry<String, String> ent = iterator.next();
				context.write(new Text(ent.getKey()), new Text(ent.getValue()));
			}
		}
	}
	public static class IntSumReducer extends Reducer<Text, Text, Text, Text> {

		// Global top-N buffer, fed with the candidates from every mapper.
		private SortedMap<String, String> top10 = new TreeMap<String, String>();

		@Override
		public void reduce(Text key, Iterable<Text> value, Context context) throws IOException, InterruptedException {
			// Only collect here; the actual filtering happens once, in cleanup().
			for (Text val : value) {
				top10.put(key.toString(), val.toString());
			}
		}

		@Override
		public void cleanup(Context context) throws IOException, InterruptedException {
			System.out.println("top10 size before trimming: " + top10.size());
			// Drop the smallest keys until only the 10 largest remain.
			while (top10.size() > 10) {
				top10.remove(top10.firstKey());
			}
			System.out.println("top10 size after trimming: " + top10.size());
			Iterator<Map.Entry<String, String>> iterator = top10.entrySet().iterator();
			while (iterator.hasNext()) {
				Map.Entry<String, String> ent = iterator.next();
				context.write(new Text(ent.getKey()), new Text(ent.getValue()));
			}
		}
	}
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, "topN");
		job.setJarByClass(topN.class);
		job.setMapperClass(TokenizerMapper.class);
		job.setReducerClass(IntSumReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		// Exactly one reducer, so the global top N is computed in one place.
		job.setNumReduceTasks(1);
		// job.setPartitionerClass(twopartitions.class);
		// job.setOutputFormatClass(fakeOutPutFormat.class);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
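Assuming the class is packaged into a jar (topn.jar is a placeholder name), the job takes the input and output paths as its two arguments:

hadoop jar topn.jar com.isesol.mapreduce.topN <input path> <output path>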
The output is as follows:
76 "retired""married""primary""no"2590"no""no""telephone"9"feb"6812-10"unknown" 77 "retired""married""primary""no"680"no""no""telephone"27"nov"3414943"failure" 78 "retired""married""tertiary""no"226"no""no""telephone"6"nov"1361-10"unknown" 79 "retired""divorced""unknown""no"2628"no""no""telephone"8"jul"22074502"failure" 80 "housemaid""married""primary""no"0"no""no""cellular"23"feb"63911891"failure" 81 "retired""married""secondary""no"1"no""no""cellular"19"aug"655-10"unknown" 83 "retired""divorced""primary""no"1097"no""no""telephone"5"mar"1811-10"unknown" 84 "retired""divorced""primary""no"639"no""no""telephone"18"may"3533-10"unknown" 86 "retired""married""secondary""no"1503"no""no""telephone"18"mar"16531011"other" 87 "retired""married""primary""no"230"no""no""cellular"30"oct"1441-10"unknown"上面没有考虑有重复数据,如果需要考虑重复数据也很好处理,稍微加工一下即可。
The top N above is hard-coded. In real programs N is usually not fixed: if you want the top 100 you should not have to edit and recompile the code, so passing N in as a variable is the best approach. MapReduce sets such variables on the job configuration with conf.set(name, value) and reads them back with get.
conf.set("topn", args[2]);
topn = context.getConfiguration().get("topn");
if(topn == null) {
topn = "10";
}
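Put together, a minimal sketch of the parameterized mapper (assumptions: the reducer is changed the same way, and Configuration.get(name, defaultValue) is used so the null check above becomes unnecessary):

public static class TokenizerMapper extends Mapper<Object, Text, Text, Text> {

	private SortedMap<String, String> topN = new TreeMap<String, String>();
	private int n;

	@Override
	protected void setup(Context context) {
		// Runs once, before the first map() call: read N from the job
		// configuration, falling back to 10 when it is not set.
		n = Integer.parseInt(context.getConfiguration().get("topn", "10"));
	}

	@Override
	public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
		String str = "";
		String[] val = value.toString().split(";");
		for (int i = 1; i < val.length - 1; i++) {
			str += val[i];
		}
		topN.put(val[0], str);
		if (topN.size() > n) { // the only change in map(): n instead of 10
			topN.remove(topN.firstKey());
		}
	}

	// cleanup() stays exactly as in the version above.
}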