Inverted Index: an inverted index is a concrete storage form of the word-document matrix. Given a word, it lets you quickly retrieve the list of documents that contain it. An inverted index has two main parts: the word dictionary and the postings file (inverted file).
For example, given these documents:
a.txt
hello tom
hello jerry
hello tom
b.txt
hello jerry
hello jerry
tom jerry
c.txt
hello jerry
hello tom
the required output is:
hello "a.txt->3 b.txt->2 c.txt->2"
jerry "a.txt->1 b.txt->3 c.txt->1"
tom "a.txt->2 b.txt->1 c.txt->1"
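Before turning to MapReduce, the target structure is easy to see in plain Java: a word dictionary mapping each term to its postings of (document, count) pairs. The sketch below is a standalone illustration only; the class name and the hard-coded documents are made up from the example above.

import java.util.LinkedHashMap;
import java.util.Map;

// Standalone illustration; not part of the MapReduce job below.
public class InMemoryIndexSketch {
    public static void main(String[] args) {
        // Word dictionary: term -> postings (document -> occurrence count)
        Map<String, Map<String, Integer>> index = new LinkedHashMap<>();
        String[][] docs = {
                {"a.txt", "hello tom hello jerry hello tom"},
                {"b.txt", "hello jerry hello jerry tom jerry"},
                {"c.txt", "hello jerry hello tom"}};
        for (String[] doc : docs) {
            for (String word : doc[1].split(" ")) {
                index.computeIfAbsent(word, w -> new LinkedHashMap<>())
                     .merge(doc[0], 1, Integer::sum);
            }
        }
        // Prints: hello {a.txt=3, b.txt=2, c.txt=2} ... matching the table above
        index.forEach((word, postings) -> System.out.println(word + " " + postings));
    }
}

MapReduce is needed only because a real corpus does not fit in one machine's memory; the job below distributes exactly this counting.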
Analysis: logically this is a two-pass MapReduce flow; the implementation below collapses it into a single job by letting a Combiner do the work of the first-pass reducer and the second-pass mapper.
----------------------------- Pass 1, Mapper -----------------------------
Use "word->filename" as the key and call context.write(key, 1) once per occurrence:
context.write("hello->a.txt", 1);
context.write("hello->a.txt", 1);
context.write("hello->a.txt", 1);
The reducer will then receive: <"hello->a.txt", {1, 1, 1}>
----------------------------- Pass 1, Reducer -----------------------------
Sum the counts for each (word, file) pair:
context.write("hello->a.txt", 3);
context.write("hello->b.txt", 2);
context.write("hello->c.txt", 2);
----------------------------- Pass 2, Mapper -----------------------------
Split the key and move the filename into the value:
context.write("hello", "a.txt->3");
context.write("hello", "b.txt->2");
context.write("hello", "c.txt->2");
----------------------------- Pass 2, Reducer -----------------------------
Concatenate all "file->count" values for each word:
context.write("hello", "a.txt->3 b.txt->2 c.txt->2");
----------------------------- Final result -----------------------------
hello "a.txt->3 b.txt->2 c.txt->2"
jerry "a.txt->1 b.txt->3 c.txt->1"
tom "a.txt->2 b.txt->1 c.txt->1"
A simple implementation:
package com.zz.hadoop.dc.inverse;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Inverted index: a single MapReduce job whose Combiner does the
 * per-file word-count subtotals.
 */
public class InverseIndex {

    public static void main(String[] args) throws Exception {
        new InverseIndex().init(args);
    }

    public void init(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Set the jar containing the job classes
        job.setJarByClass(InverseIndex.class);
        // Mapper settings
        job.setMapperClass(IndexMapper.class);
        job.setMapOutputKeyClass(Text.class);
        // The combiner below emits Text values, so the map output value class
        // must be Text as well; declaring LongWritable here would break the
        // spill-time combine, since the intermediate serializer checks the
        // declared classes.
        job.setMapOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Reducer settings
        job.setReducerClass(IndexReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Combiner acting as the first-pass reducer
        job.setCombinerClass(IndexCombiner.class);
        // Submit the job and exit with its status
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
    // Must be static: Hadoop instantiates these classes by reflection and
    // cannot construct a non-static inner class.
    public static class IndexMapper
            extends Mapper<LongWritable, Text, Text, Text> {
        private Text k2 = new Text();
        private Text v2 = new Text("1");

        /**
         * Use "word->filename" as the key and emit (key, "1") once per
         * occurrence, e.g. context.write("hello->a.txt", "1");
         */
        @Override
        protected void map(LongWritable k1, Text v1, Context context)
                throws IOException, InterruptedException {
            String[] fields = v1.toString().split(" ");
            // Get the file this split came from (one mapper per split)
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            String fileName = inputSplit.getPath().getName();
            for (String field : fields) {
                k2.set(field + "->" + fileName);
                context.write(k2, v2);
            }
        }
    }
    /**
     * Combiner: subtotal each word's occurrences within each file, turning
     * <"hello->a.txt", {"1", "1", "1"}> into <"hello", "a.txt->3">.
     * Caveat: Hadoop may run a combiner zero, one, or many times, so using it
     * to restructure keys keeps this example to a single job but is fragile
     * in general; the robust variant chains two jobs.
     */
    public static class IndexCombiner extends Reducer<Text, Text, Text, Text> {
        private Text k = new Text();
        private Text v = new Text();

        @Override
        protected void reduce(Text k2, Iterable<Text> v2s, Context context)
                throws IOException, InterruptedException {
            // The key arrives as "word->filename"; split it apart
            String[] wordAndFileName = k2.toString().split("->");
            long counter = 0;
            for (Text v2 : v2s) {
                counter += Long.parseLong(v2.toString());
            }
            // Emit <word, "filename->count">
            k.set(wordAndFileName[0]);
            v.set(wordAndFileName[1] + "->" + counter);
            context.write(k, v);
        }
    }
    /**
     * Concatenate the per-file counts for each word, turning
     * <"hello", {"a.txt->3", "b.txt->2", "c.txt->2"}> into
     * <"hello", "a.txt->3 b.txt->2 c.txt->2">.
     */
    public static class IndexReducer extends Reducer<Text, Text, Text, Text> {
        private Text val = new Text();

        @Override
        protected void reduce(Text k3, Iterable<Text> v3s, Context context)
                throws IOException, InterruptedException {
            StringBuilder sb = new StringBuilder();
            for (Text v3 : v3s) {
                sb.append(v3.toString()).append(' ');
            }
            val.set(sb.toString().trim());
            context.write(k3, val);
        }
    }
}
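To try it out, package the class into a jar and submit it with an input directory containing a.txt, b.txt, and c.txt; the jar name and the paths here are placeholders:

hadoop jar inverse-index.jar com.zz.hadoop.dc.inverse.InverseIndex /index/input /index/output

With the default single reducer, the part-r-00000 file in the output directory should contain the three lines shown under "Final result" above.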