Requirement: suppose there are three text files with the following contents:
a.txt          b.txt          c.txt
hello tom      hello jack     hello jerry
hello jerry    hello jim      hello java
hello jim      hello kitty    hello c++
hello kitty    hello rose     hello c++
We need the following result (the number of times each word appears in each file):
hello a.txt-->4 b.txt-->4 c.txt-->4
java c.txt-->1
jerry b.txt-->1 c.txt-->1
....
Approach:
(1) First, write a MapReduce job that counts the total number of times each word appears in each file, producing output such as:
hello-a.txt 4
hello-b.txt 4
jerry-b.txt 1
.......
(2) Then write a second MapReduce job that reads the result above and produces the final output format:
map: split each line on "-", use the word as the key and the rest of the line as the value
reduce: concatenate the pieces in values, use the word as the key and the concatenated string as the value, then emit it (a short illustration of this transformation follows)
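As a quick sanity check of that step-2 map logic, the standalone snippet below (a sketch for illustration only, not part of the two jobs) shows how a single intermediate line such as "hello-a.txt<TAB>4" is turned into the key/value pair the second job works with:

// Standalone illustration of the step-2 map logic.
public class Step2MapDemo {
    public static void main(String[] args) {
        String line = "hello-a.txt\t4";                  // one line of step-1 output
        String[] parts = line.split("-");                 // ["hello", "a.txt\t4"]
        String key = parts[0];                            // "hello"
        String val = parts[1].replaceAll("\t", "-->");    // "a.txt-->4"
        System.out.println(key + " -> " + val);           // prints: hello -> a.txt-->4
    }
}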
The code for step 1 is as follows:
package ldp.index;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class IndexStep1 {

    public static class IndexStep1Mapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // Get the name of the file this split comes from.
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            String name = inputSplit.getPath().getName();
            // Emit <word-filename, 1> for every word in the line.
            String[] words = value.toString().split(" ");
            for (String word : words) {
                context.write(new Text(word + "-" + name), new IntWritable(1));
            }
        }
    }

    public static class IndexStep1Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // Sum the 1s for each word-filename key.
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            context.write(key, new IntWritable(count));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(IndexStep1.class);
        job.setMapperClass(IndexStep1Mapper.class);
        job.setReducerClass(IndexStep1Reducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path("E:\\hadoopdatas\\index_data\\input"));
        FileOutputFormat.setOutputPath(job, new Path("E:\\hadoopdatas\\index_data\\output1"));
        job.setNumReduceTasks(3);
        job.waitForCompletion(true);
    }
}
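Since the step-1 values are just 1s being summed, the reducer can also be reused as a combiner to reduce the amount of data shuffled between map and reduce. This is an optional tweak, not part of the original listing; it would be one extra line in main:

// Optional: reuse the reducer as a combiner so partial sums happen on the map side.
job.setCombinerClass(IndexStep1Reducer.class);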
The code for step 2 is as follows:
package ldp.index;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class IndexStep2 {

    public static class IndexStep2Mapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // A step-1 output line looks like "hello-a.txt<TAB>4".
            // Split on "-": the word becomes the key, "a.txt-->4" becomes the value.
            String[] words = value.toString().split("-");
            context.write(new Text(words[0]), new Text(words[1].replaceAll("\t", "-->")));
        }
    }

    public static class IndexStep2Reducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values,
                Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // Concatenate all "file-->count" pieces belonging to the same word.
            StringBuilder sb = new StringBuilder();
            for (Text value : values) {
                sb.append(value.toString()).append("\t");
            }
            context.write(key, new Text(sb.toString()));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(IndexStep2.class);
        job.setMapperClass(IndexStep2Mapper.class);
        job.setReducerClass(IndexStep2Reducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path("E:\\hadoopdatas\\index_data\\output1"));
        FileOutputFormat.setOutputPath(job, new Path("E:\\hadoopdatas\\index_data\\output2"));
        job.setNumReduceTasks(1);
        job.waitForCompletion(true);
    }
}
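The two jobs must run one after the other, since step 2 reads the directory that step 1 writes. They can simply be submitted as two separate runs, or chained in a small driver. The sketch below is one possible way to do that (the class name IndexDriver is not part of the original listings; it assumes the hard-coded paths above and relies on each job's main blocking in waitForCompletion):

package ldp.index;

// Minimal sketch: run step 1 and step 2 back to back.
// Step 2 only starts after step 1 has finished writing output1,
// because each main blocks in waitForCompletion(true).
public class IndexDriver {
    public static void main(String[] args) throws Exception {
        IndexStep1.main(args);   // writes E:\hadoopdatas\index_data\output1
        IndexStep2.main(args);   // reads output1, writes output2
    }
}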
This article has shown how to count word frequencies across text files with Hadoop MapReduce: two chained MapReduce jobs produce, for each word, the exact number of occurrences in every input file, written out in an easy-to-read format.