day02_mapreduce编写，运行原理，案例_mapreduce 高级案例-优快云博客

本文链接：https://blog.youkuaiyun.com/m0_55044643/article/details/118677780

来源：doit教育

1 MR设计思想

任务如何合理的分配？一般来说为了方便管理，DN机器内存大部分是一致的，所以划分任务的时候按照大小划分，DN收到任务后，并行地运算数据。
数据运算结束后，需要将数据发送到存储数据的机器上，可能需要多个机器来存储数据。为了保证数据的分布式存储（同一个key的数据要完整地存储在同一台机器上，例如：a只存储在DN1机器上，而不是分散在DN1和DN2上），引入了分区器。
分区器对map阶段输出数据的key求hash值再%2（这里假设有2个reduce任务），根据得到的0或者1，把相应的数据发送到对应的DN机器上。这个汇总存储的过程就是reduce过程。

2 入门程序

模拟mr在本地测试实现

package com._51doit.mr.day03;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Word-count job: counts how many times each word occurs in the input text.
 *
 * <p>Written for the local test environment, where the job simulates a distributed run
 * and loads its data from the local file system.
 */
public class MR1_Demo {

    // 1. Map phase
    /**
     * Splits every input line into words and emits {@code (word, 1)} for each of them.
     *
     * <p>MapReduce processes data as key/value pairs, hence the four type parameters:
     * <ul>
     *   <li>KIN  - offset of the line (Long -> {@link LongWritable})</li>
     *   <li>VIN  - content of the line (String -> {@link Text})</li>
     *   <li>KOUT - a word (String -> {@link Text})</li>
     *   <li>VOUT - the constant 1 (Integer -> {@link IntWritable})</li>
     * </ul>
     *
     * <p>The map output is handed to the reduce phase over the network, so the output key and
     * value must be serializable. JDK serialization carries the full class structure and is too
     * bulky for this purpose, so Hadoop's own Writable types are used instead.
     */
    static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        // Reused for every record so that no new objects are allocated per word.
        private final Text kout = new Text();
        private final IntWritable vout = new IntWritable(1);

        /**
         * Called once per input line; emits {@code (word, 1)} for every non-empty word.
         *
         * @param key     offset of the line in the input
         * @param line    content of the line
         * @param context connects the map phase with the reduce phase; receives the output pairs
         * @throws IOException          if writing an output pair fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text line, Context context) throws IOException, InterruptedException {
            String[] words = line.toString().split("\\s+");
            for (String word : words) {
                // split() yields an empty first token for blank lines and for lines that start
                // with whitespace; skip it so that "" is never counted as a word.
                if (word.isEmpty()) {
                    continue;
                }
                kout.set(word);
                context.write(kout, vout); // a 1    a 1
            }
        }
    }

    // 2. Reduce phase
    /**
     * Adds up the values emitted for each word and writes {@code (word, total)}.
     */
    static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        private final IntWritable total = new IntWritable();

        /**
         * Called once per distinct word with all values emitted for it.
         *
         * @param key     the word
         * @param values  the values emitted for the word
         * @param context receives the {@code (word, total)} pair
         * @throws IOException          if writing the result fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int cnt = 0;
            for (IntWritable value : values) {
                // Sum the values rather than counting elements: identical while every value
                // is 1, and still correct if partial sums are ever fed in.
                cnt += value.get();
            }
            total.set(cnt);
            context.write(key, total);
        }
    }

    // 3. Driver
    /**
     * Configures and submits the word-count job, then waits for it to finish.
     *
     * @param args unused
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // 1 create the job
        Job job = Job.getInstance(conf, "wordcount");
        // 2 mapper class
        job.setMapperClass(WordCountMapper.class);
        // 3 reducer class
        job.setReducerClass(WordCountReducer.class);

        // 4 output types of the map side
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // output types of the final result
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // number of reduce tasks
        job.setNumReduceTasks(2);
        // 5 input path
        FileInputFormat.setInputPaths(job, new Path("d://wc/"));
        // 6 output path
        FileOutputFormat.setOutputPath(job, new Path("d://wc_res"));
        // 7 submit the job and wait for completion; report a failed job through the exit code
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }

}

2.1内部实现细节

本地中首先根据数据和文件数量和大小划分任务切片 split ，分别封装成不同的maptask
调度maptask到不同的机器运行
反射生成类，调用Reader方法得到kin vin，也就是偏移量和每行的数据
循环执行map程序，执行完毕，得到kout vout，本例中(a,1)(a,1)(b,1)(c,1)(d,1)(d,1)
分区器根据kout的哈希值取模得到区号，进行区内排序(a,1,1)(a,1,1) (c,1,1) ||| (b,1,0)(d,1,0)(d,1,0)
溢出，将排序好的数据以分区为单位，写到本地磁盘
启动reduceTask，不同的机器拉取属于自己分区的数据（这里如果map和reduce不是同一台机器，kout和vout就一定要实现序列化才可以拉取到）根据得到的kout分组，然后对对应的迭代器操作(本例中就是统计数量)
将数据合并，排序，分组
调用reduce方法，聚合输出。本案例中就是：(a,2) (b,1) (c,1) (d,2) 最终输出到磁盘。

2 线段案例

需求：求线段的交集例如： [1,2] [2,3] [1,4] —>>>[1,2] [2,3] [1,2,3,4] 计算出每个数字出现了几次

思路：本质还是求单词出现的次数。

package com._51doit.mr.day03;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Author:   Hang.Z
 * Date:     21/06/23
 * Description:
 * Counts, for every integer point, how many input segments cover it.
 * Each input line describes one segment as {@code start,end} (both inclusive), e.g. {@code 2,6}.
 */
public class MR2_Line {

    /** Counter group used for the diagnostics of this job. */
    private static final String COUNTER_GROUP = "MR2_Line";

    /**
     * Expands every segment {@code start,end} into the pairs {@code (start, 1) ... (end, 1)}.
     */
    static class LineMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {

        // Reused for every record so that no new objects are allocated per point.
        private final IntWritable k = new IntWritable();
        private final IntWritable v = new IntWritable(1);

        /**
         * Called once per input line. Lines that are not of the form {@code int,int}
         * (blank lines, missing comma, non-numeric tokens) are skipped and counted in the
         * {@code MALFORMED_LINES} counter instead of failing the task.
         *
         * @param key     offset of the line in the input
         * @param value   content of the line
         * @param context receives one {@code (point, 1)} pair per covered point
         * @throws IOException          if writing an output pair fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] split = value.toString().split(",");
            if (split.length < 2) {
                context.getCounter(COUNTER_GROUP, "MALFORMED_LINES").increment(1);
                return;
            }
            int x1Int;
            int x2Int;
            try {
                x1Int = Integer.parseInt(split[0].trim()); // e.g. 2
                x2Int = Integer.parseInt(split[1].trim()); // e.g. 6
            } catch (NumberFormatException e) {
                context.getCounter(COUNTER_GROUP, "MALFORMED_LINES").increment(1);
                return;
            }
            // A long loop variable keeps the loop finite even when the end point is
            // Integer.MAX_VALUE (an int counter would wrap around and never terminate).
            for (long i = x1Int; i <= x2Int; i++) {
                k.set((int) i);
                context.write(k, v);
            }
        }
    }

    /**
     * Adds up the values emitted for each point and writes {@code (point, total)}.
     */
    static class LineReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {

        private final IntWritable v = new IntWritable();

        /**
         * Called once per distinct point with all values emitted for it.
         *
         * @param key     the point
         * @param values  the values emitted for the point
         * @param context receives the {@code (point, total)} pair
         * @throws IOException          if writing the result fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int cnt = 0;
            for (IntWritable value : values) {
                // Sum the values rather than counting elements: identical while every value is 1.
                cnt += value.get();
            }
            v.set(cnt);
            context.write(key, v);
        }
    }

    /**
     * Configures and submits the job, then waits for it to finish.
     *
     * @param args unused
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "line");
        job.setMapperClass(LineMapper.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setReducerClass(LineReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path("data/line.txt"));
        FileOutputFormat.setOutputPath(job, new Path("line_res"));
        // Report a failed job through the exit code instead of silently ignoring it.
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }
}

3 电影案例

3.1 电影均分

自定义Bean

package com._51doit.beans;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.Serializable;

/**
 * Author:   Hang.Z
 * Date:     21/06/23
 * Description:
 * {"movie":"2288","rate":"4","timeStamp":"978160616","uid":"17"}
 * 这个类要放在Map端的VALUE的位置输出  序列化
 * 序列化本质
 *    内存对象--->二进制  写
 *    二进制--->内存对象  读
 */
public class MovieWritable implements Writable {

    private String movie ;
    private double rate ;
    private String timeStamp ;
    private String uid ;

    public String getMovie() {
        return movie;
    }

    public void setMovie(String movie) {
        this.movie = movie;
    }

    public double getRate() {
        return rate;
    }

    public void setRate(double rate) {
        this.rate = rate;
    }

    public String getTimeStamp() {
        return timeStamp;
    }

    public void setTimeStamp(String timeStamp) {
        this.timeStamp = timeStamp;
    }

    public String getUid() {
        return uid;
    }

    public void setUid(String uid) {
        this.uid = uid;
    }

    @Override
    public String toString() {
        return "MovieWritable{" +
                "movie='" + movie + '\'' +
                ", rate=" + rate +
                ", timeStamp='" + timeStamp + '\'' +
                ", uid='" + uid + '\'' +
                '}';
    }
// 序列化
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(movie);
        dataOutput.writeDouble(rate);
        dataOutput.writeUTF(timeStamp);
        dataOutput.writeUTF(uid);

    }
// 反序列化
    public void readFields(DataInput dataInput) throws IOException {
      movie =  dataInput.readUTF();
      rate = dataInput.readDouble() ;
      timeStamp = dataInput.readUTF() ;
        uid = dataInput.readUTF() ;
    }
}

package com._51doit.mr.day03;

import com._51doit.beans.MovieWritable;
import com.alibaba.fastjson.JSON;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Author:   Hang.Z
 * Date:     21/06/23
 * Description:
 * Computes the average rating of every movie.
 *    K  movie id
 *    V  custom class (a JavaBean wrapping the whole record) that is serializable
 * JavaBean
 *     1 private fields
 *     2 public getters / setters
 *     3 toString
 *     4 serialization
 */
public class MR3_Movie_AvgRate {

    /** Counter group used for the diagnostics of this job. */
    private static final String COUNTER_GROUP = "MR3_Movie_AvgRate";

    /**
     * Parses every input line as a JSON rating record and emits {@code (movieId, record)}.
     */
    static class MovieAvgRateMapper extends Mapper<LongWritable, Text, Text, MovieWritable> {

        // Reused for every record so that no new key object is allocated per line.
        private final Text k = new Text();

        /**
         * Called once per input line. Lines that cannot be parsed, or whose record lacks one
         * of its String fields, are skipped and counted in the {@code MALFORMED_RECORDS}
         * counter. Only the parsing is guarded: failures of {@code context.write} propagate
         * to the framework instead of being swallowed.
         *
         * @param key     offset of the line in the input
         * @param value   content of the line (one JSON object)
         * @param context receives the {@code (movieId, record)} pair
         * @throws IOException          if writing the output pair fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            MovieWritable mb;
            try {
                // Parsing throws an unchecked exception when the line is not valid JSON.
                mb = JSON.parseObject(value.toString(), MovieWritable.class);
            } catch (RuntimeException e) {
                context.getCounter(COUNTER_GROUP, "MALFORMED_RECORDS").increment(1);
                return;
            }
            // Records with a missing String field were skipped before as well (they failed
            // inside the old catch-all block); keep skipping them, but explicitly.
            if (mb == null || mb.getMovie() == null || mb.getTimeStamp() == null || mb.getUid() == null) {
                context.getCounter(COUNTER_GROUP, "MALFORMED_RECORDS").increment(1);
                return;
            }
            k.set(mb.getMovie());
            context.write(k, mb);
        }
    }

    /**
     * Averages the ratings of each movie and writes {@code (movieId, average)}.
     */
    static class MovieAvgRateReducer extends Reducer<Text, MovieWritable, Text, DoubleWritable> {

        private final DoubleWritable v = new DoubleWritable();

        /**
         * Called once per movie with all of its rating records.
         *
         * @param key     the movie id
         * @param values  the rating records of the movie
         * @param context receives the {@code (movieId, average)} pair
         * @throws IOException          if writing the result fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(Text key, Iterable<MovieWritable> values, Context context) throws IOException, InterruptedException {
            double sumRate = 0d;
            int cnt = 0;
            for (MovieWritable mb : values) {
                sumRate += mb.getRate();
                cnt++;
            }
            v.set(sumRate / cnt);
            context.write(key, v);
        }
    }

    /**
     * Configures and submits the job, then waits for it to finish.
     *
     * @param args unused
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "avg");
        job.setMapperClass(MovieAvgRateMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(MovieWritable.class);
        job.setReducerClass(MovieAvgRateReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        FileInputFormat.setInputPaths(job, new Path("D:\\mrdata\\movie\\input"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\mrdata\\movie\\avg_out"));
        // Report a failed job through the exit code instead of silently ignoring it.
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }

}

回顾 :

元数据管理机制 checkpoint

MR核心设计思想 ,使用多台节点并行计算 , 任务的划分 ,处理数据->结果–分区器–>获取全局结果

物理切块:HDFS文件存储的物理块 128M

任务切片:Maptask处理任务的数据大小 128M

尽量做到MRTASK运行的节点是DN节点

案例 : 1) xxMapper extends Mapper<kin , vin , kout,vout> :

map() 一对INKV执行一次 , 默认的文本文件,一行执行一次

2) xxReducer extends Reducer<kin , vin , kout,vout>

reduce() 一组执行一次

context.write(K,V)

MR内部处理数据简单流程

输入路径/Mapper类/Reducer类/方法/逻辑
计算任务切片/初始化maptask任务[处理的数据信息, 封装Mapper类的map方法逻辑]
运行task读取数据 ;读取数据封装成kin vin --> map方法
在map方法中调用一次处理一行数据 , 一个maptask任务可能会执行多次map方法
map方法输出Kout Vout ----->分区器–[Kout,Vout,int partition] —> 缓存中[区排序]----->>>输出到磁盘
reduce启动拉取属于自己任务的数据, 合并,排序,分组
一组数据执行一次reduce方法进行数据聚合保存结果以分区任务为单位

3.2 评论次数最多的前n部电影

每部电影的评论次数-

package com._51doit.mr.day03;

import com._51doit.beans.MovieWritable;
import com.alibaba.fastjson.JSON;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.*;


/**
 * Author:   Hang.Z
 * Date:     21/06/24
 * Description:
 * Finds the N movies with the most ratings (efficient top-N).
 *
 * <p>N is read from the job configuration key {@code movie.topn} and defaults to 3.
 */
public class MR4_Movie_TopN {

    /** Configuration key holding how many movies to output. */
    private static final String TOP_N_KEY = "movie.topn";
    /** Number of movies to output when {@link #TOP_N_KEY} is not set. */
    private static final int DEFAULT_TOP_N = 3;
    /** Counter group used for the diagnostics of this job. */
    private static final String COUNTER_GROUP = "MR4_Movie_TopN";

    /**
     * Parses every input line as a JSON rating record and emits {@code (movieId, 1)}.
     */
    static class MR4_Movie_TopN_Mapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        // Reused for every record so that no new objects are allocated per line.
        private final Text k = new Text();
        private final IntWritable v = new IntWritable(1);

        /**
         * Called once per input line. Lines that cannot be parsed or carry no movie id are
         * skipped and counted in the {@code MALFORMED_RECORDS} counter. Only the parsing is
         * guarded: failures of {@code context.write} propagate instead of being swallowed.
         *
         * @param key     offset of the line in the input
         * @param value   content of the line (one JSON object)
         * @param context receives the {@code (movieId, 1)} pair
         * @throws IOException          if writing the output pair fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            MovieWritable mb;
            try {
                // Parsing throws an unchecked exception when the line is not valid JSON.
                mb = JSON.parseObject(value.toString(), MovieWritable.class);
            } catch (RuntimeException e) {
                context.getCounter(COUNTER_GROUP, "MALFORMED_RECORDS").increment(1);
                return;
            }
            if (mb == null || mb.getMovie() == null) {
                context.getCounter(COUNTER_GROUP, "MALFORMED_RECORDS").increment(1);
                return;
            }
            k.set(mb.getMovie());
            context.write(k, v);
        }
    }

    /**
     * Collects the rating count of every movie in memory and, once all groups of the task
     * have been processed, writes the N most-rated movies.
     */
    static class MR4_Movie_TopN_Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        // In-memory table: movie id -> number of ratings seen by this task.
        private final Map<String, Integer> map = new HashMap<String, Integer>();
        private final IntWritable v = new IntWritable();

        /**
         * Called once per movie; only records the total, nothing is written yet.
         *
         * @param key     the movie id
         * @param values  the values emitted for the movie
         * @param context unused here; output happens in {@link #cleanup}
         * @throws IOException          declared by the overridden method
         * @throws InterruptedException declared by the overridden method
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int cnt = 0;
            for (IntWritable value : values) {
                // Sum the values rather than counting elements: identical while every value is 1.
                cnt += value.get();
            }
            // key.toString() copies the id into a String before it is stored.
            map.put(key.toString(), cnt);
        }

        /**
         * Runs once after the last {@code reduce} call of the task: sorts all collected
         * {@code (movieId, count)} pairs by count, descending, and writes the first N.
         * Ties are broken by movie id so that the output is deterministic.
         *
         * @param context supplies the configuration and receives the result pairs
         * @throws IOException          if writing a result fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            int topN = context.getConfiguration().getInt(TOP_N_KEY, DEFAULT_TOP_N);

            List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(map.entrySet());
            Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
                @Override
                public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                    // compareTo instead of subtraction: no overflow, and the intent is explicit.
                    int byCount = o2.getValue().compareTo(o1.getValue());
                    if (byCount != 0) {
                        return byCount;
                    }
                    return o1.getKey().compareTo(o2.getKey());
                }
            });

            // A non-positive N yields no output rather than an exception.
            int limit = Math.min(Math.max(topN, 0), list.size());
            Text k = new Text();
            for (int i = 0; i < limit; i++) {
                Map.Entry<String, Integer> entry = list.get(i);
                k.set(entry.getKey());
                v.set(entry.getValue());
                context.write(k, v);
            }
        }
    }

    /**
     * Configures and submits the job, then waits for it to finish.
     *
     * @param args unused
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        // 1 create the job
        Job job = Job.getInstance(conf, "topN");
        // 2 mapper class
        job.setMapperClass(MR4_Movie_TopN_Mapper.class);
        // 3 reducer class
        job.setReducerClass(MR4_Movie_TopN_Reducer.class);

        // 4 output types of the map side
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // output types of the final result
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // One reduce task: each reduce task writes its own top N, so a single task
        // is what makes the result a global top N.
        job.setNumReduceTasks(1);
        // 5 input path
        FileInputFormat.setInputPaths(job, new Path("D:\\mrdata\\movie\\input"));
        // 6 output path
        FileOutputFormat.setOutputPath(job, new Path("D:\\mrdata\\movie\\topn2"));
        // 7 submit the job and wait for completion; report a failed job through the exit code
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }


}