MapReduce 实现倒序索引_mapreduce wordcount倒排索引java-CSDN博客

本文链接：https://blog.csdn.net/zhengshidao/article/details/76586141

倒排索引是文档检索系统的关键数据结构，常用于全文搜索引擎。通过它，可以根据内容快速查找文档。本文将介绍如何使用MapReduce在Hadoop上实现倒排索引。在MapReduce中，数据首先由inputformat切分，然后RecordReader读取并交给mapper处理。在确保Hadoop集群运行正常并与Eclipse连接后，即可进行操作。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

倒序索引简介

“倒排索引”是文档检索系统中最常用的数据结构，被广泛地应用于全文搜索引擎。它主要是用来存储某个单词（或词组）在一个文档或一组文档中的存储位置的映射，即提供了一种根据内容来查找文档的方式。由于不是根据文档来确定文档所包含的内容，而是进行相反的操作，因而称为倒排索引（Inverted Index）。

关于倒序索引更加详细的介绍，请参见：MapReduce实现倒序索引

Hadoop 将数据传给 map 进行处理前会使用inputformat对数据进行处理：
1. 对数据进行切分，生成一组split分片，一个split分片会被分给一个mapper处理
2. 针对每个split，再创建一个RecordReader读取split中的数据，并按照<key, value>的格式将记录逐条传给map函数处理

package hadoop.invertedindex;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class InvertedIndex {
    public static class Map extends Mapper<LongWritable, Text, Text, Text>{
//      word 用来储存单词和URI one 用来储存词频  
        private static Text word = new Text();  
        private static Text one = new Text();

        protected void map(LongWritable key, Text value, Context context)
                throws java.io.IOException ,InterruptedException {

//          (FileSplit)context.getInputSplit() 获取<key, value> 对所属的FileSplit对象
//          在这里由于文件不大，每一个Split分片即是一个对应的文件

//          获取当前Split下的文件名称     
            String fileName = ((FileSplit)context.getInputSplit()).getPath().getName();
//            StringTokenizer 是用来把字符串截取成一个个标记或单词的
            StringTokenizer st = new StringTokenizer(value.toString());
            while(st.hasMoreTokens()){

                word.set(st.nextToken()+"\t"+fileName);
                context.write(word, one);   
            }
        };
    }
    /**
     * Combine 的作用是完成词频统计
     * @author Administrator
     *
     */
    public static class Combine extends Reducer<Text, Text, Text, Text>{
        private static Text word = new Text();
        private static Text index = new Text();

        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws java.io.IOException ,InterruptedException {
//          对key进行操作， 截取分开 单词 和 URI
            String[] splits = key.toString().split("\t");
            if(splits.length != 2){
                return ;
            }
//            统计词频
            long count = 0;
            for (Text v : values) {
                count++;
            }

//            设置key 为 splits[0] 单词  value 为 splits[1] 文件名 + 次数
            word.set(splits[0]);
            index.set(splits[1]+":"+count);
            context.write(word, index);
        };
    }
    /**
     * Reduce 的作用是生成文档列表
     * @author Administrator
     *
     */
    public static class Reduce extends Reducer<Text, Text, Text, Text>{
        private static StringBuilder sub = new StringBuilder(256);
        private static Text index = new Text();

        protected void reduce(Text word, Iterable<Text> values, Context context)
                throws java.io.IOException ,InterruptedException {
            for (Text v : values) {
                sub.append(v.toString()).append(";");
            }
            index.set(sub.toString());
            context.write(word, index);
            sub.delete(0,sub.length());
        };
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
        if(otherArgs.length != 2){
            System.err.println("Usage:InvertedIndex");
            System.exit(2);
        }

        Job job = new Job(conf, "InvertedIndex");
        job.setJarByClass(InvertedIndex.class);

        //设置Map Combine Reduce 处理类
        job.setMapperClass(Map.class);
        job.setCombinerClass(Combine.class);
        job.setReducerClass(Reduce.class);


        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        //设置输出类型
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        //设置输入和输出目录
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}