A Simple MapReduce Example: map, reduce, combiner, and partition End to End

This post walks through a MapReduce project that counts emotion words in the text of 《红楼梦》 (Dream of the Red Chamber): it tallies how often 笑 (laugh), 喜 (joy), 哭 (cry), and 怒 (anger) appear, uses a combiner to cut down shuffle IO, and uses a custom partitioner to split the results across two output files.


Requirement: use MapReduce to count the occurrences of 笑, 喜, 哭, and 怒 across the full text of the novel (a UTF-8 TXT file), use a combiner to reduce IO, and split the output into two files with a custom partitioner.
Create a MapReduce project with the MapReduce plugin, so that all the required jars are imported automatically.

Driver (main class):

package com.zhiyou100;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class MyApp {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        Configuration conf = new Configuration();
        // use ":" instead of the default tab between key and value in the output files
        conf.set("mapreduce.output.textoutputformat.separator", ":");

        Path inputPath = new Path("hdfs://master:9000/mark/hlm-utf8.txt");
        Path outputPath = new Path("hdfs://master:9000/result/hml02");
        FileSystem fs = FileSystem.newInstance(conf);
        // delete the output directory if it already exists, otherwise the job fails
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        fs.close();

        // a Job object describes and submits one complete MapReduce application
        Job job = Job.getInstance(conf, "HLM");
        job.setJarByClass(MyApp.class);

        // set the input path
        FileInputFormat.addInputPath(job, inputPath);
        // set the class that parses the input (optional; TextInputFormat is the default)
        job.setInputFormatClass(TextInputFormat.class);
        // set the custom Mapper class
        job.setMapperClass(MyMapper.class);

        // map output <K2,V2> types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // set the custom partitioner (optional; HashPartitioner is the default)
        job.setPartitionerClass(MyPartition.class);
        // set the number of reduce tasks (optional; here it must be 2 to match the partitioner)
        job.setNumReduceTasks(2);

        // set the custom Reducer class
        job.setReducerClass(MyReducer.class);

        // set the combiner, which pre-aggregates map output before the shuffle
        job.setCombinerClass(MyCombiner.class);

        // generic type parameters are erased at compile time, so they must be set explicitly here
        // set the job's final output <K3,V3> types (may be omitted when they match <K2,V2>)
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // set the output path
        FileOutputFormat.setOutputPath(job, outputPath);

        // set the class that formats the output (optional; TextOutputFormat is the default)
        job.setOutputFormatClass(TextOutputFormat.class);

        // submit the job to the cluster and poll until it completes
        System.exit(job.waitForCompletion(true) ? 0 : 1);

    }

}
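
As a hedged sketch (MyAppTool is a hypothetical name, not part of the original project): the same driver can also be written against Hadoop's Tool/ToolRunner API, which parses generic command-line options such as -D key=value before your code runs, instead of hard-coding everything in main:

package com.zhiyou100;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MyAppTool extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        // getConf() already reflects any -D overrides parsed by ToolRunner
        Job job = Job.getInstance(getConf(), "HLM");
        job.setJarByClass(MyAppTool.class);
        // ... wire up the Mapper/Combiner/Partitioner/Reducer and the
        // input/output paths exactly as in MyApp above ...
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new MyAppTool(), args));
    }
}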

Mapper:

package com.zhiyou100;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MyMapper extends Mapper<Object, Text, Text, IntWritable> {
    static {
        // printed when the class is first loaded; used below to trace the call order
        System.out.println("my_-mapper");
    }

    private final IntWritable one = new IntWritable(1);
    private final Text word = new Text();

    @Override
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {

        // split each line into short "statements" on common Chinese punctuation
        StringTokenizer st = new StringTokenizer(value.toString(), "《 》 、 ! , 。 ? :;  “ ” ‘ ’ ");
        while (st.hasMoreTokens()) {
            String text = st.nextToken().trim();
            // user-defined counter, reported alongside the job's built-in counters
            context.getCounter("ZY", "statement").increment(1);

            // emit <word, 1> for each emotion word the statement contains; emitting 1
            // (rather than a running statement number) keeps the final sums correct
            // whether or not the combiner runs
            if (text.contains("笑")) {
                word.set("笑");
                context.write(word, one);
            }
            if (text.contains("喜")) {
                word.set("喜");
                context.write(word, one);
            }
            if (text.contains("哭")) {
                word.set("哭");
                context.write(word, one);
            }
            if (text.contains("怒")) {
                word.set("怒");
                context.write(word, one);
            }
        }

    }

}
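
To see what the delimiter set in MyMapper actually does, here is a tiny standalone sketch; it is not part of the job, and the sample sentence is invented for illustration:

package com.zhiyou100;

import java.util.StringTokenizer;

public class TokenizerDemo {
    public static void main(String[] args) {
        String line = "宝玉笑道:“好妹妹,你先吃。”黛玉喜不自禁。";
        // same delimiter set as MyMapper: full-width punctuation plus spaces
        StringTokenizer st = new StringTokenizer(line, "《 》 、 ! , 。 ? :;  “ ” ‘ ’ ");
        while (st.hasMoreTokens()) {
            // prints 宝玉笑道, 好妹妹, 你先吃, 黛玉喜不自禁, one per line
            System.out.println(st.nextToken());
        }
    }
}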

Reducer:

package com.zhiyou100;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    static {
        System.out.println("my_-reducer");
    }
    private IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

        // sum the partial counts for this key (partially pre-summed by the combiner when it runs)
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }

        result.set(sum);
        context.write(key, result);

    }
}
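
MyMapper also increments a user-defined counter (group "ZY", name "statement") for every statement it scans. A hedged sketch of reading it back in the driver, assuming main is restructured so the counter is read after waitForCompletion returns rather than passing the result straight to System.exit:

// after job.waitForCompletion(true) has returned:
long statements = job.getCounters()
        .findCounter("ZY", "statement")
        .getValue();
System.out.println("statements scanned: " + statements);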

Combiner:

package com.zhiyou100;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MyCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
    static {
        System.out.println("my_-combiner");
    }

    private IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

        // partially sum the 1s emitted by the mapper; summing val.get() rather than
        // counting values keeps the result correct even if Hadoop applies the
        // combiner more than once to the same data
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }

        result.set(sum);
        context.write(key, result);

    }
}
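
A combiner is only an optimization hint: Hadoop may run it zero, one, or several times on the map side, so it must be an aggregation whose output can safely be fed back into itself and into the reducer, with matching input and output types. Since MyCombiner and MyReducer both just sum IntWritable values, the separate class mainly serves to make the call order visible in the logs; in the driver you could equally reuse the reducer:

// summing is associative and the types line up, so the reducer can double as the combiner
job.setCombinerClass(MyReducer.class);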

Partitioner:

package com.zhiyou100;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class MyPartition extends Partitioner<Text, IntWritable>{
    static {
        System.out.println("my_-partition");
    }
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // route 笑 and 喜 to reducer 0 (part-r-00000) and 哭 and 怒 to
        // reducer 1 (part-r-00001); requires setNumReduceTasks(2) in the driver
        if (key.toString().contains("笑") || key.toString().contains("喜")) {
            return 0;
        } else {
            return 1;
        }
    }

}
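
For contrast, when no partitioner is set, Hadoop falls back to HashPartitioner, which spreads keys across reducers by hash code; its getPartition is essentially the following sketch:

// default partitioning: hash the key, clear the sign bit, then take it
// modulo the number of reduce tasks
public int getPartition(Text key, IntWritable value, int numPartitions) {
    return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
}

MyPartition overrides this so the assignment is deterministic instead of hash-based.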

Output:
(Screenshot omitted.) With two reduce tasks, the 笑/喜 counts land in part-r-00000 and the 哭/怒 counts in part-r-00001, each line formatted as word:count because of the separator configured in the driver.

The log lines printed by the static initializer block in each class also show the order in which the job first calls the Mapper, Combiner, Partitioner, and Reducer:
my_-mapper -> my_-combiner -> my_-partition -> my_-reducer
