1. Introduction

I ran across this example in my old notes today, so I'm sharing it here.

**The requirement: count how many times each distinct word occurs in the files under the /hello directory on HDFS, and output the counts.**

MapReduce is a distributed computing model that splits a job into two phases, Map and Reduce. The user only needs to implement the map() and reduce() functions; data is passed between the two phases as key/value pairs.
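For example, if one input line contains the three tab-separated words hello, world, hello, then map() emits the pairs <hello, 1>, <world, 1>, <hello, 1>; the framework groups them by key, and reduce() sums the values for each key to produce <hello, 2> and <world, 1>.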

2. Code
```java
import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class WordCountApp {

    static final String INPUT_PATH = "hdfs://hadoop1:9000/hello";
    static final String OUT_PATH = "hdfs://hadoop1:9000/out";

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // On Hadoop 2.x and later, Job.getInstance(conf, name) is the preferred factory method.
        final Job job = new Job(conf, WordCountApp.class.getSimpleName());

        // Delete the output directory if it already exists; otherwise the job would fail.
        FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
        Path outPath = new Path(OUT_PATH);
        if (fileSystem.exists(outPath)) {
            fileSystem.delete(outPath, true);
        }

        // 1. Input: read the files under /hello line by line as <offset, line> pairs.
        FileInputFormat.setInputPaths(job, INPUT_PATH);
        job.setInputFormatClass(TextInputFormat.class);

        // 2. Map: split each line into words and emit <word, 1>.
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // 3. Partition/shuffle: group the map output by key; one reduce task is enough here.
        job.setPartitionerClass(HashPartitioner.class);
        job.setNumReduceTasks(1);

        // 4. Reduce: sum the 1s for each word.
        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // 5. Output: write the <word, count> pairs to /out as plain text.
        FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
        job.setOutputFormatClass(TextOutputFormat.class);

        job.waitForCompletion(true);
    }

    /**
     * k1: byte offset of the line within the file
     * v1: text content of the line
     * k2: a single word from the line
     * v2: the count for that word, always the fixed value 1
     * @author mademin
     */
    static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable k1, Text v1,
                Mapper<LongWritable, Text, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            // The sample input separates words with tabs; change the delimiter to match your data.
            String[] splited = v1.toString().split("\t");
            for (String word : splited) {
                context.write(new Text(word), new LongWritable(1L));
            }
        }
    }

    /**
     * k2  a word emitted by the mapper
     * v2s all the 1s emitted for that word
     * k3  a distinct word across the whole input
     * v3  total number of occurrences of that word
     * @author mademin
     */
    static class MyReduce extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text k2, Iterable<LongWritable> v2s,
                Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            long sum = 0L;
            for (LongWritable v2 : v2s) {
                sum += v2.get();
            }
            context.write(k2, new LongWritable(sum));
        }
    }
}
```
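To run the job, the usual workflow is to package the class into a jar, submit it with `hadoop jar`, and then read the single reducer's output file. A minimal sketch (the jar name `wordcount.jar` is just a placeholder):

```bash
# Submit the packaged job to the cluster
hadoop jar wordcount.jar WordCountApp

# With one reduce task and TextOutputFormat, the result is a single text file
hadoop fs -cat /out/part-r-00000
```

Each line of part-r-00000 is a word and its count separated by a tab, i.e. exactly the <k3, v3> pair written by the reducer.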