hadoop中倒排索引的实践

最新推荐文章于 2023-04-10 20:43:19 发布

原创最新推荐文章于 2023-04-10 20:43:19 发布 · 1k 阅读

1 ·

CC 4.0 BY-SA版权

文章标签：

#Hadoop #倒排索引

Hadoop 专栏收录该内容

7 篇文章

订阅专栏

本文详细介绍了如何使用Hadoop构建倒排索引，以实现对文档的高效检索。通过创建文件并上传至DFS，定义了InvertedIndex类，实现了倒排索引的构建过程，并最终在inverted-index目录下生成了倒排索引文件。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

个人原创，转载请注明。

倒排索引是文档检索系统中最常用的数据结构，被广泛地应用于全文搜索引擎。它主要用来存储某个单词（或词组）在一个文档或一组文档中的存储位置的映射，即提供了一种根据内容来查找文档的方式。由于不是根据文档来确定文档所包含的内容，而是进行了相反的操作，因而被称为倒排索引。

在home中新建三个文件file1，file2，file3，其内容分别如下：

file1:MapReduce is simple

file2:MapReduce is powerful is simple

file3:Hello MapReduce bye MapReduce

在DFS Locations中新建一个目录，叫inverted-index。

在新建的 inverted-index目录下选择上传文件到DFS Locations上。

按照这样，把三个文件file1，file2，file3都上传。

接下来新建项目Proj_invertedIndex，新建InvertedIndex类，代码如下：

import java.io.IOException;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;


public class InvertedIndex {

    public static class InvertedIndexMapper extends Mapper<Object, Text, Text, Text>{
        private Text keyInfo=new Text();
        private Text valueInfo=new Text();
        private FileSplit split;
        
        public void map(Object key,Text value,Context context)throws IOException,InterruptedException {
            //获得<key,value>对所属的对象
            split=(FileSplit)context.getInputSplit();
            StringTokenizer itr=new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                //key值有单词和url组成，如"mapreduce:1.txt"
                keyInfo.set(itr.nextToken()+":"+split.getPath().toString());
                valueInfo.set("1");
                context.write(keyInfo, valueInfo);
            }
            
        }
    }
    public static class InvertedIndexCombiner extends Reducer<Text, Text, Text, Text>{
            private Text info=new Text();
            public void reduce(Text key,Iterable<Text> values,Context context)throws IOException,InterruptedException {
                //统计词频
                int sum=0;
                for (Text value:values) {
                    sum+=Integer.parseInt(value.toString());
                }
                
                int splitIndex=key.toString().indexOf(":");
                //重新设置value值由url和词频组成
                info.set(key.toString().substring(splitIndex+1)+":"+sum);
                //重新设置key值为单词
                key.set(key.toString().substring(0,splitIndex));
                context.write(key, info);
            }
        }
    public static class InvertedIndexReduce extends Reducer<Text, Text, Text, Text> {
            private Text result=new Text();
            public void reduce(Text key,Iterable<Text>values,Context context) throws IOException,InterruptedException{
                //生成文档列表
                String fileList=new String();
                for (Text value:values) {
                    fileList+=value.toString()+";";
                }
                result.set(fileList);
                context.write(key, result);
            }
        }
    
    public static void main(String[] args) throws Exception {
        // TODO Auto-generated method stub
        Configuration conf=new Configuration();
        String[] otherArgs=new GenericOptionsParser(conf,args).getRemainingArgs();
        if (otherArgs.length!=2) {
            System.err.println("Usage:invertedindex<in><out>");
            System.exit(2);
        }
        Job job=new Job(conf,"InvertedIndex");
        job.setJarByClass(InvertedIndex.class);
        
        job.setMapperClass(InvertedIndexMapper.class);
        
        job.setMapOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        
        job.setCombinerClass(InvertedIndexCombiner.class);
        job.setReducerClass(InvertedIndexReduce.class);
        
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        
        System.exit(job.waitForCompletion(true)?0:1);

    }

}

然后配置run configurations。