An inverted index (also called an inverted file or postings file) is an indexing method that stores, for each word, a mapping to the places where that word occurs in a document or a collection of documents, so that full-text search can look words up directly. It is the most widely used data structure in document retrieval systems.
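To make the structure concrete before we turn to MapReduce, here is a tiny in-memory sketch of an inverted index in Java. The class TinyInvertedIndex and its method names are made up purely for illustration; they are not part of the solution below:

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Minimal illustrative inverted index: word -> (document -> positions of the word).
public class TinyInvertedIndex {
    private final Map<String, Map<String, List<Integer>>> index = new HashMap<>();

    public void add(String doc, int position, String word) {
        index.computeIfAbsent(word, w -> new HashMap<>())
             .computeIfAbsent(doc, d -> new ArrayList<>())
             .add(position);
    }

    public Map<String, List<Integer>> lookup(String word) {
        return index.getOrDefault(word, Collections.emptyMap());
    }

    public static void main(String[] args) {
        TinyInvertedIndex idx = new TinyInvertedIndex();
        idx.add("mapreduce-4-1.txt", 2, "huangxiaoming");
        idx.add("mapreduce-4-2.txt", 3, "huangxiaoming");
        System.out.println(idx.lookup("huangxiaoming"));
        // prints {mapreduce-4-1.txt=[2], mapreduce-4-2.txt=[3]}
    }
}

The MapReduce exercise below builds exactly this kind of word-to-locations mapping, except that each posting also records the line number and the per-line count.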
We have two input files:
mapreduce-4-1.txt
huangbo love xuzheng
huangxiaoming love baby huangxiaoming love yangmi
liangchaowei love liujialing
huangxiaoming xuzheng huangbo wangbaoqiang
mapreduce-4-2.txt
hello huangbo
hello xuzheng
hello huangxiaoming
Write a MapReduce program that produces results in the following format: for each keyword, report on which line of each document it appears and how many times it appears on that line.
For example, for the keyword huangxiaoming the expected result line is:
huangxiaoming mapreduce-4-1.txt:2,2;mapreduce-4-1.txt:4,1;mapreduce-4-2.txt:3,1
which means:
the keyword huangxiaoming appears 2 times on line 2 of the first document, mapreduce-4-1.txt;
the keyword huangxiaoming appears 1 time on line 4 of the first document, mapreduce-4-1.txt;
the keyword huangxiaoming appears 1 time on line 3 of the second document, mapreduce-4-2.txt.
Approach: in the mapper, context.getInputSplit() tells us which file the current map task is reading (a short sketch of that lookup follows). Counting keyword occurrences is just the classic WordCount pattern. The tricky part is the line number: a counter field inside the Mapper class works as long as a single split covers the whole file, but once a large file is split and processed in parallel, such a "global" variable no longer gives correct line numbers. Instead, we can modify the framework source so that the map input key carries the line number.
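The filename lookup mentioned above is just the following standard pattern inside map(); the same three lines appear in the complete mapper further down:

// Inside Mapper.map(): determine which input file the current split comes from
InputSplit inputSplit = context.getInputSplit();
FileSplit fileSplit = (FileSplit) inputSplit;     // valid for file-based input formats
String filename = fileSplit.getPath().getName();  // e.g. "mapreduce-4-1.txt"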
In the map phase the key normally goes unused; it holds the byte offset at which each line starts. All we need to change is how that key is assigned in the LineRecordReader source.
Concretely: create a package named org.apache.hadoop.mapreduce.lib.input under the src directory and copy the entire LineRecordReader class into it, so that this local copy is picked up instead of the one shipped in the Hadoop jar.
Code changes:
1. Add a member field: private int num = 1; // line counter
2. In nextKeyValue(), change how the key is set; only the commented lines below differ from the original, everything else stays unchanged:
public boolean nextKeyValue() throws IOException {
    if (key == null) {
        key = new LongWritable();
    }
    if (value == null) {
        value = new Text();
    }
    int newSize = 0;
    while (getFilePosition() <= end || in.needAdditionalRecordAfterSplit()) {
        if (pos == 0) {
            newSize = skipUtfByteOrderMark();
            key.set(num); // first line of the split: key = 1
        } else {
            num++;        // advance the line counter for every subsequent line
            key.set(num); // the key now carries the line number instead of the byte offset
            newSize = in.readLine(value, maxLineLength, maxBytesToConsume(pos));
            pos += newSize;
        }
        if ((newSize == 0) || (newSize < maxLineLength)) {
            break;
        }
        // line too long. try again
        LOG.info("Skipped line of size " + newSize + " at pos " + (pos - newSize));
    }
    if (newSize == 0) {
        key = null;
        value = null;
        return false;
    } else {
        return true;
    }
}
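As an aside, if you would rather not shadow a class inside Hadoop's own package, an alternative (not used in the code below) is to rename the modified reader, keep it in your own package, and wire it in through a small custom input format. The names LineNumberRecordReader and LineNumberInputFormat here are hypothetical; this is only a sketch of the idea:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class LineNumberInputFormat extends FileInputFormat<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
        // LineNumberRecordReader would be your renamed copy of the modified LineRecordReader.
        return new LineNumberRecordReader();
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // One split per file, so the line counter covers the whole file.
        return false;
    }
}

// In the driver: job.setInputFormatClass(LineNumberInputFormat.class);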
Main driver and job code:
/**
 * @author: lpj
 * @date: 2018-03-16 19:16:47
 * @Description: inverted index with line numbers, built as two chained MapReduce jobs
 */
package exam3;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount2MR {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf); // defaults to the local file system here

        // Job 1: count occurrences per (word, file, line number)
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCount2MR.class);
        job.setMapperClass(WordCount2MR_Mapper.class);
        job.setReducerClass(WordCount2MR_Reducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // String inputpath = args[0];
        // String outpath = args[1];
        Path inputPath = new Path("d:/bb/exam3");
        Path outputPath = new Path("d:/bb/exam3_1_n");
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        // Job 2: fold the per-line counts into one inverted-index line per word
        Job job2 = Job.getInstance(conf);
        job2.setJarByClass(WordCount2MR.class);
        job2.setMapperClass(WordCount2MR2_Mapper.class);
        job2.setReducerClass(WordCount2MR2_Reducer.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);

        Path inputPath2 = new Path("d:/bb/exam3_1_n");
        Path outputPath2 = new Path("d:/bb/exam3_2_n");
        if (fs.exists(outputPath2)) {
            fs.delete(outputPath2, true);
        }
        FileInputFormat.setInputPaths(job2, inputPath2);
        FileOutputFormat.setOutputPath(job2, outputPath2);

        // Chain the two jobs: job2 starts only after job1 has finished
        ControlledJob aJob = new ControlledJob(job.getConfiguration());
        ControlledJob bJob = new ControlledJob(job2.getConfiguration());
        aJob.setJob(job);
        bJob.setJob(job2);
        JobControl jc = new JobControl("jc");
        jc.addJob(aJob);
        jc.addJob(bJob);
        bJob.addDependingJob(aJob);
        Thread thread = new Thread(jc);
        thread.start();
        while (!jc.allFinished()) {
            Thread.sleep(1000);
        }
        jc.stop();
    }
    public static class WordCount2MR_Mapper extends Mapper<LongWritable, Text, Text, Text> {
        Text kout = new Text();
        Text valueout = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // With the modified LineRecordReader, key holds the line number, not the byte offset
            String[] reads = value.toString().trim().split(" ");
            InputSplit inputSplit = context.getInputSplit();
            FileSplit fileSplit = (FileSplit) inputSplit;
            String filename = fileSplit.getPath().getName();
            for (int i = 0; i < reads.length; i++) {
                if (reads[i].length() > 0) {
                    String kk = reads[i];
                    kout.set(kk + "\t" + filename + "\t" + key.toString());
                    valueout.set("1"); // one occurrence of this word on this line
                    context.write(kout, valueout);
                }
            }
        }
    }
    public static class WordCount2MR_Reducer extends Reducer<Text, Text, Text, Text> {
        // key: word \t filename \t line number; output value: number of occurrences on that line
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            int count = 0;
            for (Text text : values) {
                count++;
            }
            context.write(key, new Text(count + ""));
        }
    }
//---------------------------
    public static class WordCount2MR2_Mapper extends Mapper<LongWritable, Text, Text, Text> {
        Text kout = new Text();
        Text valueout = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // input line from job 1: word \t filename \t line number \t count
            String[] reads = value.toString().trim().split("\t");
            String kk = reads[0];
            String vv = reads[1] + ":" + reads[2] + "," + reads[3];
            kout.set(kk);
            valueout.set(vv);
            context.write(kout, valueout);
        }
    }
    public static class WordCount2MR2_Reducer extends Reducer<Text, Text, Text, Text> {
        // key: word; values: "filename:line,count" fragments to be joined with ";"
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            StringBuilder bf = new StringBuilder();
            for (Text text : values) {
                bf.append(text.toString()).append(";");
            }
            String kk = bf.substring(0, bf.length() - 1); // drop the trailing ";"
            context.write(key, new Text(kk));
        }
    }
}
Results of step 1 (word, filename, line number, occurrences on that line):
baby mapreduce-4-1.txt 2 1
hello mapreduce-4-2.txt 1 1
hello mapreduce-4-2.txt 2 1
hello mapreduce-4-2.txt 3 1
huangbo mapreduce-4-1.txt 1 1
huangbo mapreduce-4-2.txt 1 1
huangxiaoming mapreduce-4-1.txt 2 2
huangxiaoming mapreduce-4-2.txt 3 1
liangchaowei mapreduce-4-1.txt 3 1
liujialing mapreduce-4-1.txt 3 1
love mapreduce-4-1.txt 1 1
love mapreduce-4-1.txt 2 2
love mapreduce-4-1.txt 3 1
xuzheng mapreduce-4-1.txt 1 1
xuzheng mapreduce-4-2.txt 2 1
yangmi mapreduce-4-1.txt 2 1
Results of step 2 (the final inverted index):
baby mapreduce-4-1.txt:2,1
hello mapreduce-4-2.txt:1,1;mapreduce-4-2.txt:2,1;mapreduce-4-2.txt:3,1
huangbo mapreduce-4-1.txt:1,1;mapreduce-4-2.txt:1,1
huangxiaoming mapreduce-4-2.txt:3,1;mapreduce-4-1.txt:2,2
liangchaowei mapreduce-4-1.txt:3,1
liujialing mapreduce-4-1.txt:3,1
love mapreduce-4-1.txt:1,1;mapreduce-4-1.txt:2,2;mapreduce-4-1.txt:3,1
xuzheng mapreduce-4-1.txt:1,1;mapreduce-4-2.txt:2,1
yangmi mapreduce-4-1.txt:2,1
Summary: the keys to this problem are understanding the requirement, modifying the framework source where necessary, and making sensible use of the parameters the framework already hands you. If you later need a per-line counter again, consider carrying it in the key in the same way.