package com.mapreduce.topn;

/**
 * A (count, content) pair ordered by count ascending, so that a
 * PriorityQueue&lt;Item&gt; acts as a min-heap over counts.
 */
public class Item implements Comparable<Item> {

    private Long count;
    private String content;

    public Item() {
    }

    public Item(Long count, String content) {
        this.count = count;
        this.content = content;
    }

    public Long getCount() {
        return count;
    }

    public void setCount(Long count) {
        this.count = count;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    @Override
    public int compareTo(Item o) {
        return Long.compare(count, o.getCount());
    }
}
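Because compareTo() orders items by count ascending, a PriorityQueue<Item> behaves as a min-heap: peek() always returns the smallest count currently retained, which is what the trimming logic in TopNMapper and TopNReducer below relies on. Here is a minimal standalone sketch of that idea; the class name and the count values are illustrative only:

package com.mapreduce.topn;

import java.util.PriorityQueue;

public class TopNSketch {
    public static void main(String[] args) {
        int n = 3; // keep the three largest counts
        PriorityQueue<Item> heap = new PriorityQueue<>();
        long[] counts = {7, 42, 3, 19, 25, 1}; // illustrative values
        for (long c : counts) {
            // Offer only if the heap is not full yet, or the new count beats the current minimum.
            if (heap.size() < n || c > heap.peek().getCount()) {
                heap.offer(new Item(c, "count=" + c));
            }
            if (heap.size() > n) {
                heap.poll(); // evict the current minimum
            }
        }
        // Survivors are 19, 25 and 42, printed in heap order (not sorted order).
        for (Item item : heap) {
            System.out.println(item.getContent());
        }
    }
}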
TopNMapper.java
package com.mapreduce.topn;

import java.io.IOException;
import java.util.PriorityQueue;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;

public class TopNMapper extends Mapper<Object, Text, NullWritable, Text> {

    private static final Logger LOGGER = Logger.getLogger(TopNMapper.class);

    private int N;
    // Min-heap over counts: peek() is the smallest of the items kept so far.
    private PriorityQueue<Item> priorityQueue = new PriorityQueue<>();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        this.N = context.getConfiguration().getInt("N", 5);
    }

    @Override
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        // Expect tab-separated lines with a numeric count in the second field.
        String[] words = value.toString().split("\t");
        if (words.length < 2) {
            return;
        }
        LOGGER.info("Debug: Map get text = [" + value.toString() + "]");
        Long count = Long.parseLong(words[1]);
        // Keep only the N largest counts seen by this map task.
        if (priorityQueue.size() < N || count > priorityQueue.peek().getCount()) {
            priorityQueue.offer(new Item(count, value.toString()));
        }
        if (priorityQueue.size() > N) {
            priorityQueue.poll(); // evict the current minimum
        }
    }

    @Override
    public void cleanup(Context context) throws IOException, InterruptedException {
        // Emit this task's local top N once the whole split has been processed.
        for (Item item : priorityQueue) {
            LOGGER.info("Debug: Map write to context = [" + item.getContent() + "]");
            context.write(NullWritable.get(), new Text(item.getContent()));
        }
    }
}
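Note that the mapper writes nothing from map(); it only accumulates candidates and emits its local top N from cleanup(), after the entire input split has been read. Each map task therefore sends at most N records to the shuffle, all under the same NullWritable key.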
TopNReducer.java
package com.mapreduce.topn;

import java.io.IOException;
import java.util.PriorityQueue;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.log4j.Logger;

public class TopNReducer extends Reducer<NullWritable, Text, NullWritable, Text> {

    private static final Logger LOGGER = Logger.getLogger(TopNReducer.class);

    private int N;
    // Min-heap over counts: peek() is the smallest of the items kept so far.
    private PriorityQueue<Item> priorityQueue = new PriorityQueue<>();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        this.N = context.getConfiguration().getInt("N", 5);
    }

    @Override
    protected void reduce(NullWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // All mapper candidates arrive under the single NullWritable key;
        // apply the same min-heap trimming to obtain the global top N.
        for (Text value : values) {
            LOGGER.info("Debug: text = [" + value.toString() + "]");
            String[] words = value.toString().split("\t");
            Long count = Long.parseLong(words[1]);
            if (priorityQueue.size() < N || count > priorityQueue.peek().getCount()) {
                priorityQueue.offer(new Item(count, value.toString()));
            }
            if (priorityQueue.size() > N) {
                priorityQueue.poll(); // evict the current minimum
            }
        }
        // PriorityQueue iteration is heap order, not sorted order.
        for (Item topN : priorityQueue) {
            context.write(NullWritable.get(), new Text(topN.getContent()));
        }
    }
}
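Because the driver below forces a single reduce task, one reduce() call receives every candidate emitted by the mappers, and the same trimming yields the global top N. The final loop iterates the PriorityQueue in heap order, so the output contains the correct N records but not necessarily sorted by count.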
TopNDriver.java
package com.mapreduce.topn;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

public class TopNDriver extends Configured implements Tool {

    private static final Logger LOGGER = Logger.getLogger(TopNDriver.class);

    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf());
        job.setJarByClass(TopNDriver.class);
        job.setJobName("TopNDriver");

        int N = Integer.parseInt(args[0]); // top N
        job.getConfiguration().setInt("N", N);

        job.setMapperClass(TopNMapper.class);
        job.setReducerClass(TopNReducer.class);
        // A single reducer is required so that one reduce() call sees all candidates.
        job.setNumReduceTasks(1);

        // map() output (K,V)
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        // reduce() output (K,V)
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        // args[1] = input directory, args[2] = output directory
        FileInputFormat.setInputPaths(job, new Path(args[1]));
        FileOutputFormat.setOutputPath(job, new Path(args[2]));

        boolean status = job.waitForCompletion(true);
        LOGGER.info("run(): status=" + status);
        return status ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        if (args.length != 3) {
            LOGGER.warn("usage: TopNDriver <N> <input> <output>");
            System.exit(1);
        }
        LOGGER.info("N=" + args[0]);
        LOGGER.info("inputDir=" + args[1]);
        LOGGER.info("outputDir=" + args[2]);
        int returnStatus = ToolRunner.run(new TopNDriver(), args);
        System.exit(returnStatus);
    }
}
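Assuming the classes are packaged into a jar (the jar name and directories below are placeholders), the job could be launched along these lines, passing N followed by the input and output directories:

hadoop jar topn.jar com.mapreduce.topn.TopNDriver 5 /data/wordcount/output /data/topn/output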