1. Solving the Top-N problem with Hadoop MapReduce (unique keys)
First, use the SequenceFileWriterForTopN class to generate a SequenceFile as the job input.
The code for SequenceFileWriterForTopN is as follows:
package chap03.mapreduce;

import java.io.IOException;
import java.net.URI;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SequenceFileWriterForTopN {
    public static void main(String[] args) throws IOException {
        if (args.length != 2)
            throw new IOException("usage: java chap03.mapreduce.SequenceFileWriterForTopN <hdfs-path> <number-of-entries>");
        Random rand = new Random();
        final String uri = args[0];              // HDFS path of the SequenceFile to create
        final int N = Integer.parseInt(args[1]); // number of entries to write to the SequenceFile
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        Path path = new Path(uri);
        Text key = new Text();
        IntWritable value = new IntWritable();
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, path, key.getClass(), value.getClass());
        try {
            for (int i = 1; i <= N; i++) {
                int randomInt = rand.nextInt(1000); // random value in [0, 1000)
                key.set("cat" + i);                 // unique key per entry
                value.set(randomInt);
                System.out.println(key + "\t" + value);
                writer.append(key, value);
            }
        } finally {
            IOUtils.closeStream(writer);
        }
    }
}
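The writer can be run against HDFS, for example with hadoop jar <your-jar> chap03.mapreduce.SequenceFileWriterForTopN /topn/input 100 (the jar name and path are placeholders). To confirm what was written, a small reader such as the sketch below can dump the records back out; it assumes the same Text/IntWritable key-value types used by the writer, and the class name SequenceFileReaderForTopN is illustrative rather than part of the original example.

package chap03.mapreduce;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

// Illustrative sanity check: prints every (Text, IntWritable) record in the SequenceFile.
public class SequenceFileReaderForTopN {
    public static void main(String[] args) throws Exception {
        String uri = args[0];                    // same HDFS path the writer used
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(uri), conf);
        try {
            Text key = new Text();
            IntWritable value = new IntWritable();
            while (reader.next(key, value)) {    // returns false at end of file
                System.out.println(key + "\t" + value);
            }
        } finally {
            IOUtils.closeStream(reader);
        }
    }
}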
The code for TopNMapper is as follows:
package chap03.mapreduce;
import java.io.IOException;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
// NullWritable is a special type of Writable: it has a zero-length serialization and is used as a placeholder when a key or value is not needed.