【转】Hadoop示例程序WordCount详解及实例

最新推荐文章于 2024-06-07 19:41:02 发布

转载最新推荐文章于 2024-06-07 19:41:02 发布 · 155 阅读

0 ·

CC 4.0 BY-SA版权

原文链接：http://www.cnblogs.com/wangsouc/articles/3832692.html

文章标签：

#大数据 #java

本文详细解析了MapReduce的工作原理，通过图解展示了其处理流程，并提供了WordCount的实际代码示例，深入理解分布式计算的基本概念。

【转自】http://blog.csdn.net/xw13106209/article/details/6116323

1.图解MapReduce

2.简要过程：

Input:

Hello World Bye World

Hello Hadoop Bye Hadoop

Bye Hadoop Hello Hadoop

Map:

<Hello,1>

<World,1>

<Bye,1>

<World,1>

<Hello,1>

<Hadoop,1>

<Bye,1>

<Hadoop,1>

<Bye,1>

<Hadoop,1>

<Hello,1>

<Hadoop,1>

Sort:

<Bye,1>

<Bye,1>

<Bye,1>

<Hadoop,1>

<Hadoop,1>

<Hadoop,1>

<Hadoop,1>

<Hello,1>

<Hello,1>

<Hello,1>

<World,1>

<World,1>

Combine:

<Bye,1,1,1>

<Hadoop,1,1,1,1>

<Hello,1,1,1>

<World,1,1>

Reduce:

<Bye,3>

<Hadoop,4>

<Hello,3>

<World,2>

3.代码实例：

  1 package com.felix;
  2 import java.io.IOException;
  3 import java.util.Iterator;
  4 import java.util.StringTokenizer;
  5 import org.apache.hadoop.fs.Path;
  6 import org.apache.hadoop.io.IntWritable;
  7 import org.apache.hadoop.io.LongWritable;
  8 import org.apache.hadoop.io.Text;
  9 import org.apache.hadoop.mapred.FileInputFormat;
 10 import org.apache.hadoop.mapred.FileOutputFormat;
 11 import org.apache.hadoop.mapred.JobClient;
 12 import org.apache.hadoop.mapred.JobConf;
 13 import org.apache.hadoop.mapred.MapReduceBase;
 14 import org.apache.hadoop.mapred.Mapper;
 15 import org.apache.hadoop.mapred.OutputCollector;
 16 import org.apache.hadoop.mapred.Reducer;
 17 import org.apache.hadoop.mapred.Reporter;
 18 import org.apache.hadoop.mapred.TextInputFormat;
 19 import org.apache.hadoop.mapred.TextOutputFormat;
 20 /**
 21  * 
 22  * 描述：WordCount explains by Felix
 23  * @author Hadoop Dev Group
 24  */
 25 public class WordCount
 26 {
 27     /**
 28      * MapReduceBase类:实现了Mapper和Reducer接口的基类（其中的方法只是实现接口，而未作任何事情）
 29      * Mapper接口：
 30      * WritableComparable接口：实现WritableComparable的类可以相互比较。所有被用作key的类应该实现此接口。
 31      * Reporter 则可用于报告整个应用的运行进度，本例中未使用。 
 32      * 
 33      */
 34     public static class Map extends MapReduceBase implements
 35             Mapper<LongWritable, Text, Text, IntWritable>
 36     {
 37         /**
 38          * LongWritable, IntWritable, Text 均是 Hadoop 中实现的用于封装 Java 数据类型的类，这些类实现了WritableComparable接口，
 39          * 都能够被串行化从而便于在分布式环境中进行数据交换，你可以将它们分别视为long,int,String 的替代品。
 40          */
 41         private final static IntWritable one = new IntWritable(1);
 42         private Text word = new Text();
 43         
 44         /**
 45          * Mapper接口中的map方法：
 46          * void map(K1 key, V1 value, OutputCollector<K2,V2> output, Reporter reporter)
 47          * 映射一个单个的输入k/v对到一个中间的k/v对
 48          * 输出对不需要和输入对是相同的类型，输入对可以映射到0个或多个输出对。
 49          * OutputCollector接口：收集Mapper和Reducer输出的<k,v>对。
 50          * OutputCollector接口的collect(k, v)方法:增加一个(k,v)对到output
 51          */
 52         public void map(LongWritable key, Text value,
 53                 OutputCollector<Text, IntWritable> output, Reporter reporter)
 54                 throws IOException
 55         {
 56             String line = value.toString();
 57             StringTokenizer tokenizer = new StringTokenizer(line);
 58             while (tokenizer.hasMoreTokens())
 59             {
 60                 word.set(tokenizer.nextToken());
 61                 output.collect(word, one);
 62             }
 63         }
 64     }
 65     public static class Reduce extends MapReduceBase implements
 66             Reducer<Text, IntWritable, Text, IntWritable>
 67     {
 68         public void reduce(Text key, Iterator<IntWritable> values,
 69                 OutputCollector<Text, IntWritable> output, Reporter reporter)
 70                 throws IOException
 71         {
 72             int sum = 0;
 73             while (values.hasNext())
 74             {
 75                 sum += values.next().get();
 76             }
 77             output.collect(key, new IntWritable(sum));
 78         }
 79     }
 80     public static void main(String[] args) throws Exception
 81     {
 82         /**
 83          * JobConf：map/reduce的job配置类，向hadoop框架描述map-reduce执行的工作
 84          * 构造方法：JobConf()、JobConf(Class exampleClass)、JobConf(Configuration conf)等
 85          */
 86         JobConf conf = new JobConf(WordCount.class);
 87         conf.setJobName("wordcount");           //设置一个用户定义的job名称
 88         conf.setOutputKeyClass(Text.class);    //为job的输出数据设置Key类
 89         conf.setOutputValueClass(IntWritable.class);   //为job输出设置value类
 90         conf.setMapperClass(Map.class);         //为job设置Mapper类
 91         conf.setCombinerClass(Reduce.class);      //为job设置Combiner类
 92         conf.setReducerClass(Reduce.class);        //为job设置Reduce类
 93         conf.setInputFormat(TextInputFormat.class);    //为map-reduce任务设置InputFormat实现类
 94         conf.setOutputFormat(TextOutputFormat.class);  //为map-reduce任务设置OutputFormat实现类
 95         /**
 96          * InputFormat描述map-reduce中对job的输入定义
 97          * setInputPaths():为map-reduce job设置路径数组作为输入列表
 98          * setInputPath()：为map-reduce job设置路径数组作为输出列表
 99          */
100         FileInputFormat.setInputPaths(conf, new Path(args[0]));
101         FileOutputFormat.setOutputPath(conf, new Path(args[1]));
102         JobClient.runJob(conf);         //运行一个job
103     }
104 }