Hadoop-第一个MapReduce程序（WordCount）开发

最新推荐文章于 2022-08-29 08:18:34 发布

置顶文文鑫

最新推荐文章于 2022-08-29 08:18:34 发布

阅读量266

点赞数 1

CC 4.0 BY-SA版权

分类专栏：笔记 # Hadoop 文章标签： hadoop mapreduce

本文链接：https://blog.youkuaiyun.com/dgssd/article/details/110323039

笔记同时被 2 个专栏收录

14 篇文章

订阅专栏

Hadoop

10 篇文章

订阅专栏

该博客介绍了如何使用Hadoop MapReduce进行词频统计。首先，数据被上传到HDFS，然后通过配置Hadoop环境并开发MapReduce作业，包括Map阶段将文本拆分成单词，Reduce阶段对单词进行求和。最后，作业被提交到YARN集群执行，并通过HDFS命令查看结果。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

1.准备数据文件
aa.log(注意空格)
wenxin xaiowen wangwu
xiaowen xiaoxin wenxin
xiaowen zhangshan lisi
2. 启动Hadoop集群
3. 将数据文件上传到HDFS文件系统中

[root@Cluster00 ~]# hdfs dfs -mkdir /wordcount
[root@Cluster00 ~]# hdfs dfs -put aa.log /wordcount

在这里插入图片描述

4.开发MapReduce

引入相关依赖

    <!-- Single place to pin the Hadoop version; every dependency below references it via ${hadoop.version}. -->
    <properties>
        <hadoop.version>2.7.3</hadoop.version>
    </properties>
    <!-- Hadoop artifacts used to compile the WordCount job; all share the version defined above. -->
    <dependencies>
        <!-- MapReduce core client library. -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- Hadoop common library. -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- HDFS library. -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- MapReduce client common library. -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- MapReduce job client library. -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- NOTE(review): the WordCount code in this article does not import anything from the
             examples artifact; presumably optional - confirm before removing. -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-examples</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
    </dependencies>

开发一个job作业

开发map模块

 //map阶段 hadoop包装 long->LongWritable String->Text
    /**
     * Map phase: tokenizes each input line and emits {@code (word, 1)} for every word.
     *
     * <p>Input key is the offset of the line within the file, input value is the line itself.
     */
    public static class WordCountMap extends Mapper<LongWritable, Text,Text, IntWritable>{

        // Reused across calls: context.write() serializes the contents immediately,
        // so there is no need to allocate a new Writable per token.
        private final Text outKey = new Text();
        private final IntWritable one = new IntWritable(1);

        /**
         * Called once for every record produced by the input format (one line of text).
         *
         * @param key     offset of the start of the line
         * @param value   content of the current line
         * @param context sink for the emitted {@code (word, 1)} pairs
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Split on any run of whitespace so tabs and repeated spaces do not create bogus tokens.
            String[] words = value.toString().split("\\s+");
            for (String word : words) {
                // A blank line or leading whitespace yields an empty token; it is not a word.
                if (word.isEmpty()) {
                    continue;
                }
                outKey.set(word);
                context.write(outKey, one);
            }
        }
    }

开发reduce模块

    /**
     * Reduce phase: adds up all counts received for one word and emits {@code (word, total)}.
     */
    public static class WordCountReduce extends Reducer<Text,IntWritable,Text,IntWritable>{

        /**
         * @param key     the word being aggregated
         * @param values  all counts emitted for that word
         * @param context sink for the single {@code (word, total)} result
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int total = 0;
            java.util.Iterator<IntWritable> counts = values.iterator();
            while (counts.hasNext()) {
                total += counts.next().get();
            }

            // Emit the aggregated count for this word.
            IntWritable result = new IntWritable(total);
            context.write(key, result);
        }
    }

开发job模块

    /**
     * Entry point: runs the job through {@link ToolRunner} and uses the value returned by
     * {@code run} as the process exit code, so shell scripts can detect a failed job.
     *
     * @param args command-line arguments, passed to {@link ToolRunner}
     */
    public static void main(String[] args) throws  Exception{
        // ToolRunner invokes run() on the Tool instance given here.
        int exitCode = ToolRunner.run(new wordcount(), args);
        System.exit(exitCode);
    }

    /**
     * Configures and runs the word-count job, blocking until it finishes.
     *
     * @param strings command-line arguments (currently unused)
     * @return {@code 0} if the job succeeded, {@code 1} if it failed
     */
    public int run(String[] strings) throws Exception{
        // Create the job from the configuration held by this Tool.
        Job job = Job.getInstance(getConf());
        job.setJarByClass(wordcount.class);

        // Input: plain text file.
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("/wordcount/aa.log"));

        // Map stage.
        job.setMapperClass(WordCountMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Shuffle stage: nothing configured.

        // Reduce stage.
        job.setReducerClass(WordCountReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Output: the target directory must NOT exist before the job is submitted.
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("/wordcount/result"));

        // Submit and wait; 'true' prints progress to the console.
        boolean status = job.waitForCompletion(true);
        System.out.println("wordcount"+status);

        // Report the real outcome instead of always signalling success.
        return status ? 0 : 1;
    }

完整代码

package com.wenxin.wordcount;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;



/**
 * Word-count MapReduce job: reads a text file, splits every line into words in the map
 * phase, and sums the occurrences of each word in the reduce phase.
 */
public class wordcount extends Configured implements Tool {

    /** File read as job input. */
    private static final String INPUT_PATH = "/wordcount/aa.log";

    /** Directory the results are written to; it must not exist when the job is submitted. */
    private static final String OUTPUT_PATH = "/wordcount/result";

    /**
     * Entry point: runs the job through {@link ToolRunner} and uses the value returned by
     * {@link #run(String[])} as the process exit code, so shell scripts can detect a failed job.
     *
     * @param args command-line arguments, passed to {@link ToolRunner}
     */
    public static void main(String[] args) throws  Exception{
        // ToolRunner invokes run() on the Tool instance given here.
        int exitCode = ToolRunner.run(new wordcount(), args);
        System.exit(exitCode);
    }

    /**
     * Configures and runs the word-count job, blocking until it finishes.
     *
     * @param strings command-line arguments (currently unused)
     * @return {@code 0} if the job succeeded, {@code 1} if it failed
     */
    @Override
    public int run(String[] strings) throws Exception{
        // Create the job from the configuration held by this Tool.
        Job job = Job.getInstance(getConf());
        job.setJarByClass(wordcount.class);

        // Input: plain text file.
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path(INPUT_PATH));

        // Map stage.
        job.setMapperClass(WordCountMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Shuffle stage: nothing configured.

        // Reduce stage.
        job.setReducerClass(WordCountReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Output: the target directory must NOT exist before the job is submitted.
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));

        // Submit and wait; 'true' prints progress to the console.
        boolean status = job.waitForCompletion(true);
        System.out.println("wordcount"+status);

        // Report the real outcome instead of always signalling success.
        return status ? 0 : 1;
    }

    /**
     * Map phase: tokenizes each input line and emits {@code (word, 1)} for every word.
     *
     * <p>Input key is the offset of the line within the file, input value is the line itself.
     */
    public static class WordCountMap extends Mapper<LongWritable, Text,Text, IntWritable>{

        // Reused across calls: context.write() serializes the contents immediately,
        // so there is no need to allocate a new Writable per token.
        private final Text outKey = new Text();
        private final IntWritable one = new IntWritable(1);

        /**
         * Called once for every record produced by the input format (one line of text).
         *
         * @param key     offset of the start of the line
         * @param value   content of the current line
         * @param context sink for the emitted {@code (word, 1)} pairs
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Split on any run of whitespace so tabs and repeated spaces do not create bogus tokens.
            String[] words = value.toString().split("\\s+");
            for (String word : words) {
                // A blank line or leading whitespace yields an empty token; it is not a word.
                if (word.isEmpty()) {
                    continue;
                }
                outKey.set(word);
                context.write(outKey, one);
            }
        }
    }

    /**
     * Reduce phase: adds up all counts received for one word and emits {@code (word, total)}.
     */
    public static class WordCountReduce extends Reducer<Text,IntWritable,Text,IntWritable>{

        // Reused across calls; context.write() serializes the value immediately.
        private final IntWritable result = new IntWritable();

        /**
         * @param key     the word being aggregated
         * @param values  all counts emitted for that word
         * @param context sink for the single {@code (word, total)} result
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            // Emit the aggregated count for this word.
            result.set(sum);
            context.write(key, result);
        }
    }

}

执行作业
将项目打包成jar后上传到yarn集群中
执行命令：

[root@Cluster00 ~]# yarn jar hadoop-wordcount-1.0-SNAPSHOT.jar com.wenxin.wordcount.wordcount

在这里插入图片描述

执行命令查看结果：

[root@Cluster00 ~]# hdfs dfs -cat /wordcount/result/part-r-00000

在这里插入图片描述