MapReduce
Overview
I. MapReduce
1. What is MapReduce
- MapReduce is a distributed computing framework.
- It breaks a large data-processing job into individual tasks that run in parallel across a cluster of servers.
- It originated at Google.
- It is suited to large-scale data processing.
- Each node processes the data stored on that node.
- Each job consists of a Map part and a Reduce part.
2. Design ideas behind MapReduce
- Divide and conquer
- A simplified programming model for parallel computation
- Two abstractions: Map and Reduce
  - Developers only need to implement the Mapper and Reducer functions
- System-level details are hidden
  - Developers can focus on the business logic
3. Characteristics of MapReduce
- Strengths
  - Easy to program
  - Scalable
  - Highly fault tolerant
  - High throughput
- Poor fit for
  - Real-time computation
  - Stream processing
II. Implementing WordCount with MapReduce
1. Design approach
2. Execution process
- Data formats
  - map: (K1, V1) → list(K2, V2)
  - reduce: (K2, list(V2)) → list(K3, V3)
- Stages of a MapReduce job (a sketch of wiring in an optional Combiner follows this list)
  - Mapper
  - Combiner
  - Partitioner
  - Shuffle and Sort
  - Reducer
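Because summing counts is associative and commutative, the WordCount reducer can also serve as the Combiner, pre-aggregating counts on the map side and shrinking the shuffle. A minimal sketch of the extra driver line, reusing the WCReducer defined in 3.1:

// Optional: pre-aggregate counts on the map side before the shuffle.
job.setCombinerClass(WCReducer.class);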
3. Implementation
3.1 Write the Java code
- Mapper
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

/**
 * LongWritable  input key: the byte offset of the line within the file (not a line number)
 * Text          input value: one line of text (a String)
 * Text          output key: a single word (a String)
 * IntWritable   output value: the count emitted for each word, always 1 here
 */
public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // key: input key, value: input value, context: used to emit output key/value pairs
        String line = value.toString();
        String[] words = line.split(" ");
        for (String word : words) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
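One common refinement, shown here only as an optional sketch (the class name WCMapperReuse is made up; it uses the same imports as WCMapper), is to allocate the output objects once and reuse them instead of creating a new Text and IntWritable for every word:

public class WCMapperReuse extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Reused output objects; context.write serializes their contents immediately, so reuse is safe.
    private final Text outKey = new Text();
    private final IntWritable one = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        for (String word : value.toString().split(" ")) {
            outKey.set(word);
            context.write(outKey, one);
        }
    }
}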
- Reducer
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;

/**
 * Text         input key type: the word (String)
 * IntWritable  input value type: a partial count (int)
 * Text         output key type: the word (String)
 * IntWritable  output value type: the total frequency of the word (int)
 */
public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // key: the word, values: all counts emitted for this word, context: used to emit the result
        int total = 0;
        for (IntWritable value : values) {
            total += value.get();
        }
        context.write(key, new IntWritable(total));
    }
}
- Partitioner
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class WCPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text text, IntWritable intWritable, int numPartitions) {
        // Route each word to a reducer by hashing it; the result must fall in [0, numPartitions).
        return Math.abs(text.hashCode() % numPartitions);
    }
}
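This partitioner mirrors what Hadoop's default HashPartitioner already does, so a custom class is only needed when a different routing rule is wanted. One caveat with Math.abs: Integer.MIN_VALUE stays negative, so Hadoop's own implementation masks the sign bit instead. A safer body for getPartition would be:

// Same idea as org.apache.hadoop.mapreduce.lib.partition.HashPartitioner:
// masking with Integer.MAX_VALUE clears the sign bit, so the result is always in [0, numPartitions).
return (text.hashCode() & Integer.MAX_VALUE) % numPartitions;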
- Job
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WCDriver {
    public static void main(String[] args) throws Exception {
        // 1. Set up the job
        Configuration cfg = new Configuration();
        Job job = Job.getInstance(cfg, "job_wc");
        job.setJarByClass(WCDriver.class);
        // 2. Specify the mapper and reducer
        job.setMapperClass(WCMapper.class);
        job.setReducerClass(WCReducer.class);
        // Mapper output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Partitioner and number of reduce tasks
        job.setNumReduceTasks(4);
        job.setPartitionerClass(WCPartitioner.class);
        // Reducer output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // the output directory must not exist before the run
        // 3. Run the job
        boolean result = job.waitForCompletion(true);
        System.out.println(result ? "success" : "failure");
        System.exit(result ? 0 : 1);
    }
}
3.2 Build the jar and upload it to Linux
Building the jar was covered in detail in an earlier post; follow the link below if needed.
Link: link.
3.3 Run the M/R job
hadoop jar testown.jar cn.kgc.kb09.mr.WCDriver /test/a.txt /test/haha
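Once the job finishes, the output can be inspected in HDFS. The directory below matches the second argument of the command above; since the driver sets four reduce tasks, the results are spread across part-r-00000 through part-r-00003:

hdfs dfs -ls /test/haha
hdfs dfs -cat /test/haha/part-r-00000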
III. Implementing a join with MapReduce
Files needed for this example: link (extraction code: wvy4). This is a reduce-side join: the mapper tags each record with the table it came from and emits the customer id as the key, so the reducer receives a customer record together with all of that customer's orders and can merge them.
1. Writable
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class CustomOrder implements Writable {
    private String customId;
    private String customName;
    private String orderId;
    private String orderStatus;
    private String tableFlag; // "0" marks a customer record, "1" marks an order record

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(customId);
        out.writeUTF(customName);
        out.writeUTF(orderId);
        out.writeUTF(orderStatus);
        out.writeUTF(tableFlag);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.customId = in.readUTF();
        this.customName = in.readUTF();
        this.orderId = in.readUTF();
        this.orderStatus = in.readUTF();
        this.tableFlag = in.readUTF();
    }

    public String getCustomId() {
        return customId;
    }
    public void setCustomId(String customId) {
        this.customId = customId;
    }
    public String getCustomName() {
        return customName;
    }
    public void setCustomName(String customName) {
        this.customName = customName;
    }
    public String getOrderId() {
        return orderId;
    }
    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }
    public String getOrderStatus() {
        return orderStatus;
    }
    public void setOrderStatus(String orderStatus) {
        this.orderStatus = orderStatus;
    }
    public String getTableFlag() {
        return tableFlag;
    }
    public void setTableFlag(String tableFlag) {
        this.tableFlag = tableFlag;
    }

    @Override
    public String toString() {
        return "customId='" + customId + '\'' +
                ", customName='" + customName + '\'' +
                ", orderId='" + orderId + '\'' +
                ", orderStatus='" + orderStatus + '\'';
    }
}
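A Writable only survives the shuffle if write() and readFields() agree field for field. A quick local round-trip check can catch mismatches before running the job; this is only a sketch, and the CustomOrderRoundTrip class and the sample field values are made up for illustration:

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class CustomOrderRoundTrip {
    public static void main(String[] args) throws Exception {
        CustomOrder original = new CustomOrder();
        original.setCustomId("1");
        original.setCustomName("John·Doe"); // hypothetical sample values
        original.setOrderId("");
        original.setOrderStatus("");
        original.setTableFlag("0");

        // Serialize with write(), then read the same bytes back with readFields().
        DataOutputBuffer out = new DataOutputBuffer();
        original.write(out);

        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        CustomOrder copy = new CustomOrder();
        copy.readFields(in);

        System.out.println(copy); // should print the same customId and customName
    }
}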
2.Mapper
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

public class COMapperJoin extends Mapper<LongWritable, Text, Text, CustomOrder> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] columns = line.split(",");
        // Every field is wrapped in double quotes; strip them.
        for (int i = 0; i < columns.length; i++) {
            columns[i] = columns[i].split("\"")[1];
        }
        CustomOrder co = new CustomOrder();
        if (columns.length == 4) { // order table
            co.setCustomId(columns[2]);
            co.setCustomName("");
            co.setOrderId(columns[0]);
            co.setOrderStatus(columns[3]);
            co.setTableFlag("1");
        } else if (columns.length == 9) { // customer table
            co.setCustomId(columns[0]);
            co.setCustomName(columns[1] + "·" + columns[2]);
            co.setOrderId("");
            co.setOrderStatus("");
            co.setTableFlag("0");
        }
        // Emit the customer id as the join key.
        context.write(new Text(co.getCustomId()), co);
    }
}
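To make the tagging concrete, here is the mapper's behavior on one hypothetical line from each file; the field values are invented for illustration, and only the column positions follow the code above:

// Hypothetical order line (4 quoted fields: orderId, date, customerId, status):
//   "101","2020-11-01","7","CLOSED"
//   → key = 7, value: orderId=101, orderStatus=CLOSED, customName="", tableFlag=1
// Hypothetical customer line (9 quoted fields, the first three being customerId, first name, last name):
//   "7","Mary","Jones",...
//   → key = 7, value: customName=Mary·Jones, orderId="", orderStatus="", tableFlag=0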
3.Reducer
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;

public class COReducerJoin extends Reducer<Text, CustomOrder, CustomOrder, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<CustomOrder> values, Context context) throws IOException, InterruptedException {
        StringBuffer orderIds = new StringBuffer();
        StringBuffer statuses = new StringBuffer();
        CustomOrder customOrder = new CustomOrder();
        for (CustomOrder co : values) {
            if (co.getCustomName().equals("")) {
                // Order record: collect its id and status.
                orderIds.append(co.getOrderId() + "|");
                statuses.append(co.getOrderStatus() + "|");
            } else {
                // Customer record: copy the fields out, since Hadoop reuses the value object across iterations.
                customOrder.setCustomId(co.getCustomId());
                customOrder.setCustomName(co.getCustomName());
            }
        }
        // Strip the trailing "|" separator.
        String orderId = "";
        String status = "";
        if (orderIds.length() > 0) {
            orderId = orderIds.substring(0, orderIds.length() - 1);
        }
        if (statuses.length() > 0) {
            status = statuses.substring(0, statuses.length() - 1);
        }
        customOrder.setOrderId(orderId);
        customOrder.setOrderStatus(status);
        context.write(customOrder, NullWritable.get());
    }
}
4.Job
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CODriver {
    public static void main(String[] args) throws Exception {
        Configuration cfg = new Configuration();
        Job job = Job.getInstance(cfg, "co_job");
        job.setJarByClass(CODriver.class);
        job.setMapperClass(COMapperJoin.class);
        job.setReducerClass(COReducerJoin.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(CustomOrder.class);
        job.setOutputKeyClass(CustomOrder.class);
        job.setOutputValueClass(NullWritable.class);
        // Local-filesystem paths for running inside the IDE; the output directory must not exist beforehand.
        FileInputFormat.setInputPaths(job, new Path("file:///D:/IDEA/Data/newdata/testown/data"));
        FileOutputFormat.setOutputPath(job, new Path("file:///E:/test/coResult1/"));
        boolean result = job.waitForCompletion(true);
        System.out.println(result ? "success" : "failure");
        System.exit(result ? 0 : 1);
    }
}
5. Results