Hadoop-WordCount

最新推荐文章于 2025-03-16 22:40:22 发布

原创最新推荐文章于 2025-03-16 22:40:22 发布

· 530 阅读

2 ·

版权

本文介绍如何使用Hadoop MapReduce实现WordCount程序，包括Mapper、Reducer和Driver三个核心组件的具体实现过程。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

首先，明确需要3个类WordCountMapper、WordCountReducer、WordCountDriver

一、WordCountMapper

流程：实现map方法
		1、每次获取一行数据，并且需要将Text序列化类型转为字符串类型
		2、对每一行数据进行处理，切割，拿到每一个单词
		3、把单词输出，遍历数组中每个单词

序列化：就是把结构化对象转化为字节流

package cn.kgc.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * <keyin,valuein,keyout,valueout>  (0,hello)   ——>   (hello,1)
 *  输入的key    (0,hello world)    0  文本的偏移量
 *  输入的value    hello world
 *  输出的key   (hello ,1)   hello
 *  输出的value    1
 */

/*string    Text
int       IntWritable
long      LongWritable
float     FloatWritable
double    DoubleWritable
boolean   BooleanWritable
long      LongWritable
map       MapWritable
array     ArrayWritable*/

// Hadoop Writable types are used instead of plain Java types because the data
// has to travel over the network between tasks.
// Mapper and Reducer classes hold the business logic of the job.

/**
 * Map side of the word-count job: turns every input line into (word, 1) pairs.
 *
 * <p>Input key: offset of the line within the text. Input value: the line itself.
 * Output key: a single word. Output value: the constant 1.
 */
public class WordCountMapper extends Mapper<LongWritable, Text,Text, IntWritable> {

    // Both output holders are created once and reused for every record, so that
    // no new object is allocated per word (keeps garbage-collection pressure low).
    Text k = new Text();
    // Every occurrence of a word counts as 1.
    IntWritable v = new IntWritable(1);

    /**
     * Splits one line into words and writes (word, 1) for each of them.
     *
     * @param key     offset of the line (not used)
     * @param value   content of the line
     * @param context context object used to write the output pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Convert the Text value of the current line to a plain String.
        String line = value.toString();

        // 2. Split on runs of whitespace (a regex is more robust than a single space).
        String[] words = line.split("\\s+");

        // 3. Emit (word, 1) for each word.
        for (String word : words) {
            // split() yields an empty first token for lines that start with
            // whitespace, and a single empty token for an empty line; those are
            // not words and must not be counted.
            if (word.isEmpty()) {
                continue;
            }
            k.set(word);
            context.write(k, v);
        }
    }
}

二、WordCountReducer

流程：实现reduce方法
	 定义sum变量，累加同一个key对应的所有value

package cn.kgc.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;


/**
 * Reduce side of the word-count job.
 *
 * <p>Input key/value are the key/value emitted by the map side. Output key is
 * the word and output value is its total count, e.g. (hello, 5).
 * reduce() is invoked once per distinct key.
 */
public class WordCountReducer extends Reducer<Text, IntWritable,Text,IntWritable> {

    // Total for the key handled by the most recent reduce() call.
    int sum;
    // Reused output holder, so no IntWritable is allocated per key.
    IntWritable count = new IntWritable();

    /**
     * Adds up all values grouped under one key and writes (key, total).
     * For example (hello,1) (hello,1) arrives as (hello,(1,1)) and leaves as (hello,2).
     *
     * @param key     the word
     * @param values  the counts emitted for this word
     * @param context context object used to write the result
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Start from zero for every key.
        int total = 0;

        java.util.Iterator<IntWritable> occurrences = values.iterator();
        while (occurrences.hasNext()) {
            // Unwrap the IntWritable to a plain int before adding.
            total = total + occurrences.next().get();
        }
        sum = total;

        // The output value must be an IntWritable, so wrap the plain int again.
        count.set(sum);
        context.write(key, count);
    }
}

三、WordCountDriver

流程：程序的入口
	 获取配置信息以及创建任务
	 设置Driver类所在程序路径
	 指定Mapper和Reducer
	 指定Mapper端的输出类型
	 指定最终的结果输出类型
	 指定输入文件和输出文件路径
	 提交任务代码

package cn.kgc.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Entry point of the word-count job: configures the job, wires the mapper and
 * reducer, declares the key/value types and paths, then submits the job.
 */
public class WordCountDriver {
    /**
     * Configures and runs the word-count job, then exits the JVM with status 0
     * when the job succeeded and 1 otherwise.
     *
     * @param args command-line arguments (not used)
     * @throws IOException            declared by the job setup and submission calls
     * @throws ClassNotFoundException declared by {@code waitForCompletion}
     * @throws InterruptedException   declared by {@code waitForCompletion}
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        // 1. Load the configuration and create the job from it.
        //    The configuration must be passed in, otherwise it is never used.
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // 2. Tell Hadoop which jar to use by pointing at the driver class.
        job.setJarByClass(WordCountDriver.class);

        // 3. Set the mapper and the reducer.
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // 4. Key/value types emitted by the mapper, e.g. (hello, 1).
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 5. Key/value types of the final result.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 6. Input and output paths.
        //    The input may point at a directory; it does not have to name a file.
        FileInputFormat.setInputPaths(job,new Path("file:///D:\\直播课\\codes\\wordcount\\data\\workout"));

        //    The output path must not exist yet, otherwise the job reports an error.
        FileOutputFormat.setOutputPath(job,new Path("file:///D:\\直播课\\codes\\wordcount\\data\\workout\\output"));

        // 7. Submit the job and wait for it to finish.
        boolean result = job.waitForCompletion(true);

        // Exit status: 0 = normal exit, 1 = abnormal exit.
        System.exit(result ? 0 : 1);
    }
}

在这里插入图片描述

crc是校验文件，success是完成的标志，part-r-00000是统计结果
在这里插入图片描述
补充：

mapreduce的编码规范
1、所有数据的入口和出口都为键值对
2、Mapper端
	入口键值对为行偏移量和行内容，每一行内容调用一次map()方法
	在Mapper对象的map()方法中将行的内容解析成N个新的键值对输出
3、Reducer端
	Reducer端从所有Mapper端获取属于该分区的键值对并根据键分组，每个分组调用一次reduce()方法
	在reduce()方法中对该组数据进行聚合后以键值对输出
	
	
传输类型为什么要序列化？常用的序列化类型是什么？
1、支持序列化的传输类型才可以以流的方式在节点间传输和写盘
2、MapReduce常用序列化类型
	string    Text
	int       IntWritable
	long      LongWritable
	float     FloatWritable
	double    DoubleWritable
	boolean   BooleanWritable
	long      LongWritable
	map       MapWritable
	array     ArrayWritable
	

自定义可序列化类有哪些注意事项
1、自定义类型必须实现Writable接口
2、避免使用java的Serializable接口，因为它会使序列化过程太重
3、自定义键类型必须同时实现WritableComparable
	MapReduce过程需要多次根据key排序