MapReduce: Simple Random Sampling
This post is quite similar to the previous one; the pattern description and use cases resemble those of MapReduce之过滤(一).
Simple Random Sampling
Draw a smaller dataset from a larger one with a fixed probability, such that every record has the same chance of being selected.
Problem Description
Given a large dataset, extract a sample in which each record is kept with a specified probability.
Sample Input
The data is similar to that used in MapReduce之过滤(一).
Sample Output
Since the sample is drawn at random, the output differs from run to run.
Mapper stage
In the map function, generate a random number in (0, 1) and compare it with the chosen threshold to decide whether to keep the record. With a threshold of 0.3, for example, roughly 30% of the input records are expected to be kept.
The mapper code is as follows:
public static class SRSMapper extends Mapper<Object, Text, NullWritable, Text> {
    private Random rands = new Random();
    // Assume a sampling threshold of 0.3 here
    private double percentage = 0.3;

    @Override
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        // Keep the record only if the random draw falls below the threshold
        if (rands.nextDouble() < percentage) {
            context.write(NullWritable.get(), value);
        }
    }
}
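The threshold above is hard-coded. A common refinement is to read it from the job Configuration in setup(), so the sampling rate can be changed per run without recompiling. The sketch below is one way to do this; the property name filter_percentage is only an illustrative choice and would need to be set in the driver (for example with configuration.setDouble("filter_percentage", 0.3)) before submitting the job.
public static class SRSMapper extends Mapper<Object, Text, NullWritable, Text> {
    private Random rands = new Random();
    private double percentage;

    @Override
    protected void setup(Context context) {
        // "filter_percentage" is a hypothetical property name, not one defined by Hadoop
        percentage = context.getConfiguration().getDouble("filter_percentage", 0.3);
    }

    @Override
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        if (rands.nextDouble() < percentage) {
            context.write(NullWritable.get(), value);
        }
    }
}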
Reducer stage
There is no real work to do here; the reducer simply passes the sampled records back to the file system. You can collect them with a reducer, view them with hadoop fs -cat, or print them to the console (of little practical use).
public static class SRSReduce extends Reducer<NullWritable, Text, Text, NullWritable> {
    @Override
    public void reduce(NullWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // Write out every sampled record, one per output line
        for (Text value : values) {
            context.write(new Text(value.toString()), NullWritable.get());
        }
    }
}
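Note that because the mapper emits a single NullWritable key, every sampled record is funneled through one reducer, which can become a bottleneck on large inputs. Since the sample needs no aggregation, one option is to skip the reduce phase entirely and let the mapper output be written straight to the output directory. A minimal sketch of the driver change, assuming the rest of the job setup stays as in the complete code below:
// Map-only variant: no reducer, mapper output goes directly to the output files
job.setNumReduceTasks(0);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);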
The complete code is as follows:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.Random;
public class SRS {
    public static class SRSMapper extends Mapper<Object, Text, NullWritable, Text> {
        private Random rands = new Random();
        // Sampling threshold: keep roughly 30% of the records
        private double percentage = 0.3;

        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            if (rands.nextDouble() < percentage) {
                context.write(NullWritable.get(), value);
            }
        }
    }
    public static class SRSReduce extends Reducer<NullWritable, Text, Text, NullWritable> {
        @Override
        public void reduce(NullWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Pass every sampled record through, one per output line
            for (Text value : values) {
                context.write(new Text(value.toString()), NullWritable.get());
            }
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        String[] otherArgs = new String[]{"input/file.txt", "output"};
        if (otherArgs.length != 2) {
            System.err.println("Usage: SRS <input> <output>");
            System.exit(2);
        }
        // Remove the output directory if it already exists, otherwise the job will fail
        FileSystem fs = FileSystem.get(configuration);
        fs.delete(new Path(otherArgs[1]), true);

        Job job = Job.getInstance(configuration, "SimpleRandomSampling");
        job.setJarByClass(SRS.class);
        job.setMapperClass(SRSMapper.class);
        job.setReducerClass(SRSReduce.class);
        // The mapper emits <NullWritable, Text>; the reducer emits <Text, NullWritable>
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
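Once the job finishes, the sampled records land in the output directory and can be inspected with hadoop fs -cat output/part-r-00000 (or simply opened from the local file system when running locally), as mentioned in the reducer section above.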