MapReduce的编程规范

最新推荐文章于 2024-03-15 06:30:00 发布

原创最新推荐文章于 2024-03-15 06:30:00 发布 · 444 阅读

6 ·

CC 4.0 BY-SA版权

文章标签：

#hadoop #大数据

Hadoop 专栏收录该内容

21 篇文章

订阅专栏

本文详细介绍了Hadoop MapReduce框架下的WordCount实现过程，包括Mapper、Reducer和Driver三个核心组件的设计与实现，以及完整的代码示例。

用户编写的程序分为 3 个部分：Mapper、Reducer、Driver（提交 MR 程序的客户端）

一、Mapper部分

1. ⾃定义类，继承 Mapper 类型

2. 定义 K1、V1、K2、V2 的泛型（K1、V1 是 Mapper 的输入数据类型，K2、V2 是 Mapper 的输出数据类型）

3. 重写 map ⽅法（处理逻辑）

参考下图：

注意 : map ⽅法，每⼀个 KV 对都会调⽤⼀次。

二、Reducer部分

1. ⾃定义类，继承 Reducer 类型

2. 定义 K2、V2、K3、V3 的泛型（K2、V2 是 Reducer 的输入数据类型，K3、V3 是 Reducer 的输出数据类型）

3. 重写 reduce ⽅法的处理逻辑

参考下图：

注意：reduce 方法默认按 key 分组，每一组（一个 key 及其对应的全部 value）调用一次。

三、Driver部分

整个程序需要⼀个Driver 来进⾏提交，提交的是⼀个描述了各种必要信息的 job 对象，如下

1. 获取 Job 对象

2. 指定驱动类

3. 设置 Mapper 和 Reducer 类型

4. 设置 Mapper 的输出 K2 、 V2 的类型（如果类型和 K3,V3 相同 , 可省略）

5. 设置 Reducer 的输出 K3 、 V3 的类型

6. 设置 ReduceTask 的个数（默认为 1）

7. 设置 Mapper 的输⼊数据的路径

8. 设置 Reducer 的输出数据的路径

9. 提交作业

参考下图：

四、wordcount演示实例

4.1需求：

统计文档中每个单词出现的次数

4.2测试数据：

a.txt

hello qianfeng hello 1999 hello beijing hello

world hello hello java good

b.txt

hello xisanqi hello bingbing

hello chenchen hello

ACMilan hello china

c.txt

hello hadoop hello java hello storm hello spark hello redis

hello zookeeper

hello hive hello hbase hello flume

4.3代码实现

首先注意我们要有一个主类 WordCount，主类里面包含两个静态内部类 WordCountMapper、WordCountReducer，以及 main 方法

下面是 WordCount 类的框架

/**
 * Skeleton of the map-side class of the word-count example.
 *
 * The four type arguments are K1, V1 (input pair: LongWritable, Text) and
 * K2, V2 (output pair: Text, IntWritable).
 */
public static class WordCountMapper extends Mapper<LongWritable,Text, Text, IntWritable>{
    // Called once for every input key/value pair.
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException,InterruptedException {
        // Intentionally empty in this skeleton; the map logic goes here.
    }
}

/**
 * Skeleton of the reduce-side class of the word-count example.
 *
 * The four type arguments are K2, V2 (input pair: Text, IntWritable) and
 * K3, V3 (output pair: Text, IntWritable). V3 is IntWritable so that the
 * skeleton agrees with the output value class the driver registers for the job.
 */
public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    // Called once per key group, with all values that share the key.
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Intentionally empty in this skeleton; the reduce logic goes here.
    }
}

// Note: WordCount itself can serve as the main (driver) class here, so the
// driver's responsibilities reduce to this single main method.
    public static void main(String[] args) throws IOException,ClassNotFoundException, InterruptedException {
        // Intentionally empty in this skeleton; the job is described and submitted here.
}

4.3.1 pom.xml⽂件的配置

<!-- Hadoop artifacts used by the word-count example; all three are pinned to the same version (2.7.6). -->
<dependencies>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.6</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.6</version>
    </dependency>

     <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.7.6</version>
    </dependency>
</dependencies>

4.3.2 定义⼀个mapper内部类

/**
* @Description 写⼀个wordcount程序的mapper类型
*
* 读取块⽂件时，K1是⾏偏移量，使⽤LongWritable类型
* v1是⾏记录， 使⽤Text类型
* 经过map函数处理后，
* k2是单词 使⽤Text类型
* v2是1 使⽤IntWritable类型
*
*/
public static class WordCountMapper extends Mapper<LongWritable,Text,Text, IntWritable> {
    /**
     * 重写Mapper类⾥提供的map⽅法
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException,InterruptedException {
    //map⽅法的key就是k1, ⾏偏移量不需要，因此不需要处理。只需要处理value，因为value就是v1，⾏记录
    //⼀对k1,v1就会调⽤⼀次map函数，因此map⽅法执⾏的次数和⾏记录数有关系
    //1: 将value的类型转为java的String类型
    String line = value.toString();
    //2: 使⽤空格对⾏记录进⾏切分成字符串数组
    String[] words = line.split(" ");
    //3: 遍历数组
    for (String word : words) {
    //要将word类型转为Text类型 ，当成k2 IntWritable类型的1作为value
    Text k2 = new Text(word);
    IntWritable v2 = new IntWritable(1);
    //4 将k2,v2,作为输出数据写出去，写到shuffle流程中的缓存区
    context.write(k2,v2);
    }
}

4.3.3 定义⼀个reducer内部类

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.Iterator;
/**
 * Reducer of the word-count example.
 *
 * <p>The input is the data produced by the map stage after the shuffle stage has
 * fetched, merge-sorted and grouped it by key, with the values of each key
 * gathered into one iterable.
 * K2 is the word ({@code Text}); V2 is the list of counts ({@code IntWritable}).
 * K3 is the same word, so it shares K2's type; V3 is the accumulated count
 * ({@code IntWritable}).
 */
public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    /**
     * Sums all counts of one word and writes the pair (word, total).
     *
     * <p>Example: the group {@code <"hello", <1,1,1>>} is written out as {@code <hello, 3>}.
     *
     * @param key     K2, the word; it is reused unchanged as K3
     * @param values  the V2 values grouped under {@code key}
     * @param context used to write the (K3, V3) pair
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Accumulate the plain int carried by every IntWritable in the group.
        int total = 0;
        for (IntWritable count : values) {
            total += count.get();
        }
        // Wrap the total as V3 and emit it together with the unchanged key.
        context.write(key, new IntWritable(total));
    }
}

4.3.4 定义⼀个Driver类

定义⼀个主类，⽤来描述job并提交job

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Driver of the word-count example: describes the job and submits it.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
 * @throws IOException            if the job cannot be configured or submitted
 * @throws ClassNotFoundException if a class configured on the job cannot be found
 * @throws InterruptedException   if waiting for the job to complete is interrupted
 */
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    // Fail with a readable message instead of an ArrayIndexOutOfBoundsException
    // when the input/output paths are missing.
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <output path>");
        System.exit(2);
    }

    // 1: load the configuration
    Configuration conf = new Configuration();
    // 2: obtain the job object
    Job job = Job.getInstance(conf);
    // Register the driver class. Java class names are case-sensitive, so this must
    // be spelled exactly like the enclosing main class, WordCount.
    job.setJarByClass(WordCount.class);

    // 3: set the mapper and reducer classes
    job.setMapperClass(WordCountMapper.class);
    job.setReducerClass(WordCountReducer.class);

    // 4: the map output types (K2, V2) are not set explicitly because they are the
    //    same as the reduce output types (K3, V3) configured in step 5.

    // 5: set the reduce output types K3 and V3
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // 6: optionally set the number of reduce tasks (the default is 1)
    job.setNumReduceTasks(2);

    // 7: set the input and output paths of the job
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // 8: submit the job and wait; exit with 0 on success, 1 on failure
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}