Mr倒排索引

最新推荐文章于 2025-06-02 16:15:33 发布

It_sucks

最新推荐文章于 2025-06-02 16:15:33 发布

阅读量160

点赞数

文章标签： mr hadoop mapreduce

本文链接：https://blog.youkuaiyun.com/weixin_71132847/article/details/130797961

版权

该代码示例展示了如何使用 Hadoop MapReduce 框架来构建倒排索引。InvertedIndexMapper 类负责从输入文件中提取字段并生成键值对，InvertedIndexCombiner 进行局部词频统计，而 InvertedIndexReducer 则整合所有中间结果，最终输出每个词及其关联的文件列表。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

InvertedIndexMapper

package invertedIndex;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/**
 * Map phase of the inverted-index job.
 *
 * <p>For every input line, splits the text on the space character and emits
 * one {@code <"token:fileName", "1">} pair per token, where {@code fileName}
 * is the name of the file backing the current input split.
 */
public class InvertedIndexMapper extends Mapper<LongWritable, Text, Text, Text> {

    /**
     * Reusable output key. Kept per instance (not static) so that several
     * mapper instances living in the same JVM never share mutable state.
     */
    private final Text keyInfo = new Text();

    /** Constant output value: each emitted pair stands for one occurrence. */
    private static final Text valueInfo = new Text("1");

    /**
     * Emits {@code <"token:fileName", "1">} for each token of the line.
     *
     * @param key     input key supplied by the framework (not used here)
     * @param value   one line of input text
     * @param context task context; must expose a {@link FileSplit} input split
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] fields = StringUtils.split(line, " ");

        // The file name becomes part of the key so that occurrences can be
        // counted per (token, file) pair downstream.
        FileSplit fileSplit = (FileSplit) context.getInputSplit();
        String fileName = fileSplit.getPath().getName();

        for (String field : fields) {
            keyInfo.set(field + ":" + fileName);
            context.write(keyInfo, valueInfo);
        }
    }
}

InvertedIndexCombiner

package invertedIndex;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Combiner of the inverted-index job: performs the per-file term-frequency count.
 *
 * <p>Input:  {@code <"MapReduce:file3", {"1", "1", ...}>}<br>
 * Output: {@code <"MapReduce", "file3:2">}
 *
 * <p>Because this combiner rewrites both key and value, its output no longer
 * has the shape of its input. Hadoop does not guarantee how many times a
 * combiner runs, so records that were already combined (key without
 * {@code ':'}) are forwarded unchanged instead of being parsed again.
 */
public class InvertedIndexCombiner extends Reducer<Text, Text, Text, Text> {

    /**
     * Reusable output value. Kept per instance (not static) so that several
     * instances living in the same JVM never share mutable state.
     */
    private final Text info = new Text();

    /**
     * Sums the occurrence counts of one {@code "word:file"} key and re-emits
     * them as {@code <word, "file:count">}.
     *
     * @param key     either {@code "word:file"} (raw mapper output) or a bare
     *                {@code "word"} (output of an earlier combiner pass)
     * @param values  numeric counts for a raw key, or {@code "file:count"}
     *                entries for an already-combined key
     * @param context task context used to write the output
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if the task is interrupted while writing
     * @throws NumberFormatException if a value of a raw key is not an integer
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        String compositeKey = key.toString();
        // Split at the FIRST ':' so that the emitted word never contains one;
        // that is what makes the "already combined" check below reliable.
        int splitIndex = compositeKey.indexOf(':');

        if (splitIndex < 0) {
            // Already in <word, "file:count"> form: pass through untouched.
            for (Text value : values) {
                context.write(key, value);
            }
            return;
        }

        int sum = 0; // term frequency for this (word, file) pair
        for (Text value : values) {
            sum += Integer.parseInt(value.toString());
        }

        // New value: file name plus term frequency.
        info.set(compositeKey.substring(splitIndex + 1) + ":" + sum);
        // New key: the word alone.
        key.set(compositeKey.substring(0, splitIndex));

        context.write(key, info);
    }
}

InvertedIndexReducer

package invertedIndex;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reduce phase of the inverted-index job.
 *
 * <p>Concatenates every value received for a key into a single
 * {@code ';'}-terminated list and writes {@code <key, list>}.
 * NOTE(review): the values are presumably the {@code "file:count"} entries
 * produced by the combiner; confirm against the job configuration.
 */
public class InvertedIndexReducer extends Reducer<Text, Text, Text, Text> {

    /**
     * Reusable output value. Kept per instance (not static) so that several
     * instances living in the same JVM never share mutable state.
     */
    private final Text result = new Text();

    /**
     * Joins all values of {@code key}, each followed by {@code ';'}.
     *
     * @param key     the grouping key, written through unchanged
     * @param values  the entries to concatenate, in iteration order
     * @param context task context used to write the output
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // StringBuilder avoids re-copying the whole list on every append.
        StringBuilder fileList = new StringBuilder();
        for (Text value : values) {
            fileList.append(value.toString()).append(';');
        }

        result.set(fileList.toString());
        context.write(key, result);
    }
}

InvertedIndexRunner

package invertedIndex;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver that configures and submits the inverted-index job.
 *
 * <p>Usage: {@code InvertedIndexRunner [inputPath outputPath]}. When fewer
 * than two arguments are given, the built-in default paths are used.
 */
public class InvertedIndexRunner {

    // NOTE(review): "intput" looks like a typo for "input", but it is a
    // runtime path and is therefore kept exactly as it was; confirm.
    private static final String DEFAULT_INPUT_PATH = "D:\\mr\\intput";

    private static final String DEFAULT_OUTPUT_PATH = "D:\\mr\\output";

    /**
     * Builds the job, waits for it to finish and exits with status 0 on
     * success or 1 on failure.
     *
     * @param args optional {@code [inputPath, outputPath]} overriding the defaults
     * @throws IOException            if the job cannot be created or submitted
     * @throws ClassNotFoundException if a configured job class cannot be loaded
     * @throws InterruptedException   if waiting for completion is interrupted
     */
    public static void main(String[] args) throws IOException,
            ClassNotFoundException, InterruptedException {
        boolean pathsGiven = args != null && args.length >= 2;
        String inputPath = pathsGiven ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = pathsGiven ? args[1] : DEFAULT_OUTPUT_PATH;

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(InvertedIndexRunner.class);

        job.setMapperClass(InvertedIndexMapper.class);
        job.setCombinerClass(InvertedIndexCombiner.class);
        job.setReducerClass(InvertedIndexReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        boolean res = job.waitForCompletion(true);

        System.exit(res ? 0 : 1);
    }
}