I recently started an internship at a big data company. I had never touched big data before, so after joining I began learning from the basics, starting with word count.
1. Counting word occurrences:
The idea is simple. Words are separated by spaces, so the mapper reads the words in split on spaces and writes them to the context as pairs like {am,1}, {hello,1}, {word,1}.... The reducer takes the mapper's output and, for each key, counts how many times that word occurred. Between the map and reduce phases, MapReduce partitions and sorts the intermediate keys by default, which groups identical words together; this is the principle behind word count. The full code is on plenty of blogs, so it isn't repeated here (the code for the variation below is nearly identical), but a minimal mapper sketch follows.
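A minimal sketch of that plain word-count mapper (the class name PlainWordCountMapper is just an illustrative placeholder; the reducer and driver are the same as the ones shown further down for the variation):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

// Hypothetical class name, for illustration only; the key is the whole word.
public class PlainWordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split the line on runs of whitespace and emit {word, 1} for each word.
        for (String word : value.toString().split("\\s+")) {
            if (!word.isEmpty()) {
                context.write(new Text(word), new IntWritable(1));
            }
        }
    }
}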
Variation 1: counting occurrences of each word's first letter
The principle is the same as word count. The difference is that before, the key was the whole word, while now the key is the word's first letter. Words are still read in split on spaces, the first letter is extracted and written to the context, and the mapper's output becomes the reducer's input.
Code:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split on runs of whitespace. Note that String.replace treats its
        // arguments literally (no regex), so the original replace("\\s+", " ")
        // never matched anything; splitting on the regex "\\s+" handles tabs
        // and repeated spaces directly.
        String[] words = value.toString().split("\\s+");
        // Emit {firstLetter, 1} for every non-empty word.
        for (String word : words) {
            if (word.isEmpty()) {
                continue; // skip empty tokens, e.g. from a leading space
            }
            context.write(new Text(word.substring(0, 1)), new IntWritable(1));
        }
    }
}
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text word, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum the values rather than just counting iterations; with all-ones
        // input the result is the same, but summing stays correct if this
        // class is ever reused as a combiner.
        int count = 0;
        for (IntWritable value : values) {
            count += value.get();
        }
        context.write(word, new IntWritable(count));
    }
}
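Because the reducer sums its values, it can also double as a combiner to cut down the data shuffled between map and reduce. This is an optional addition, not part of the original job setup; one extra line in the driver below enables it:

job.setCombinerClass(WordCountReducer.class);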
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class WordCountJob {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Create a job instance.
        Job job = Job.getInstance(conf, "wordcount");
        job.setJarByClass(WordCountJob.class);

        // Mapper and its intermediate output types.
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Reducer and the job's final output types.
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Plain-text input and output; paths come from the command line.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextInputFormat.setInputPaths(job, new Path(args[0]));
        TextOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
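As a quick sanity check (the jar name and the /input and /output paths are placeholders), running the job over a file containing the two lines "hello world" and "am hello" should yield the first-letter counts below; TextOutputFormat separates key and value with a tab by default:

hadoop jar wordcount.jar WordCountJob /input /output

a	1
h	2
w	1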