Requirement: suppose there are three text files with the following contents:
a.txt          b.txt          c.txt
hello tom      hello jack     hello jerry
hello jerry    hello jim      hello java
hello jim      hello kitty    hello c++
hello kitty    hello rose     hello c++
We need the following result (the number of times each word appears in each file):
hello a.txt-->4 b.txt-->4 c.txt-->4
java c.txt-->1
jerry b.txt-->1 c.txt-->1
....
Approach:
(1) First, write a MapReduce job that counts the total number of times each word appears in each file, producing output such as:
hello-a.txt 4
hello-b.txt 4
jerry-b.txt 1
.......
(2) Then write a second MapReduce job that reads the result above and produces the final output format:
map: split each line on "-", use the word as the key and the rest of the line as the value
reduce: concatenate the pieces in values, use the word as the key and the concatenated string as the value, then emit it (a short illustration of this transformation follows)
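As a quick sanity check of that step-2 map logic, the standalone snippet below (a sketch for illustration only, not part of the two jobs) shows how a single intermediate line such as "hello-a.txt<TAB>4" is turned into the key/value pair the second job works with:

// Standalone illustration of the step-2 map logic.
public class Step2MapDemo {
    public static void main(String[] args) {
        String line = "hello-a.txt\t4";                  // one line of step-1 output
        String[] parts = line.split("-");                 // ["hello", "a.txt\t4"]
        String key = parts[0];                            // "hello"
        String val = parts[1].replaceAll("\t", "-->");    // "a.txt-->4"
        System.out.println(key + " -> " + val);           // prints: hello -> a.txt-->4
    }
}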
The code for step 1 is as follows:
package ldp.index;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class IndexStep1 {

    public static class IndexStep1Mapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // Get the name of the file this split comes from.
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            String name = inputSplit.getPath().getName();
            // Emit <word-filename, 1> for every word in the line.
            String[] words = value.toString().split(" ");
            for (String word : words) {
                context.write(new Text(word + "-" + name), new IntWritable(1));
            }
        }
    }

    public static class IndexStep1Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // Sum the 1s for each word-filename key.
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            context.write(key, new IntWritable(count));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(IndexStep1.class);
        job.setMapperClass(IndexStep1Mapper.class);
        job.setReducerClass(IndexStep1Reducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path("E:\\hadoopdatas\\index_data\\input"));
        FileOutputFormat.setOutputPath(job, new Path("E:\\hadoopdatas\\index_data\\output1"));
        job.setNumReduceTasks(3);
        job.waitForCompletion(true);
    }
}
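Since the step-1 values are just 1s being summed, the reducer can also be reused as a combiner to reduce the amount of data shuffled between map and reduce. This is an optional tweak, not part of the original listing; it would be one extra line in main:

// Optional: reuse the reducer as a combiner so partial sums happen on the map side.
job.setCombinerClass(IndexStep1Reducer.class);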
The code for step 2 is as follows:
package ldp.index;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class IndexStep2 {

    public static class IndexStep2Mapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // A step-1 output line looks like "hello-a.txt<TAB>4".
            // Split on "-": the word becomes the key, "a.txt-->4" becomes the value.
            String[] words = value.toString().split("-");
            context.write(new Text(words[0]), new Text(words[1].replaceAll("\t", "-->")));
        }
    }

    public static class IndexStep2Reducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values,
                Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // Concatenate all "file-->count" pieces belonging to the same word.
            StringBuilder sb = new StringBuilder();
            for (Text value : values) {
                sb.append(value.toString()).append("\t");
            }
            context.write(key, new Text(sb.toString()));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(IndexStep2.class);
        job.setMapperClass(IndexStep2Mapper.class);
        job.setReducerClass(IndexStep2Reducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path("E:\\hadoopdatas\\index_data\\output1"));
        FileOutputFormat.setOutputPath(job, new Path("E:\\hadoopdatas\\index_data\\output2"));
        job.setNumReduceTasks(1);
        job.waitForCompletion(true);
    }
}
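The two jobs must run one after the other, since step 2 reads the directory that step 1 writes. They can simply be submitted as two separate runs, or chained in a small driver. The sketch below is one possible way to do that (the class name IndexDriver is not part of the original listings; it assumes the hard-coded paths above and relies on each job's main blocking in waitForCompletion):

package ldp.index;

// Minimal sketch: run step 1 and step 2 back to back.
// Step 2 only starts after step 1 has finished writing output1,
// because each main blocks in waitForCompletion(true).
public class IndexDriver {
    public static void main(String[] args) throws Exception {
        IndexStep1.main(args);   // writes E:\hadoopdatas\index_data\output1
        IndexStep2.main(args);   // reads output1, writes output2
    }
}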
This article has shown how to count word frequencies across text files with Hadoop MapReduce: two chained MapReduce jobs produce, for each word, the exact number of occurrences in every input file, written out in an easy-to-read format.