Join operations in MapReduce
The orders table:
+-------------------+-------------+------+-----+---------+----------------+
| Field             | Type        | Null | Key | Default | Extra          |
+-------------------+-------------+------+-----+---------+----------------+
| order_id          | int(11)     | NO   | PRI | NULL    | auto_increment |
| order_date        | datetime    | NO   |     | NULL    |                |
| order_customer_id | int(11)     | NO   |     | NULL    |                |
| order_status      | varchar(45) | NO   |     | NULL    |                |
+-------------------+-------------+------+-----+---------+----------------+
The customers table:
+-------------------+--------------+------+-----+---------+----------------+
| Field             | Type         | Null | Key | Default | Extra          |
+-------------------+--------------+------+-----+---------+----------------+
| customer_id       | int(11)      | NO   | PRI | NULL    | auto_increment |
| customer_fname    | varchar(45)  | NO   |     | NULL    |                |
| customer_lname    | varchar(45)  | NO   |     | NULL    |                |
| customer_email    | varchar(45)  | NO   |     | NULL    |                |
| customer_password | varchar(45)  | NO   |     | NULL    |                |
| customer_street   | varchar(255) | NO   |     | NULL    |                |
| customer_city     | varchar(45)  | NO   |     | NULL    |                |
| customer_state    | varchar(45)  | NO   |     | NULL    |                |
| customer_zipcode  | varchar(45)  | NO   |     | NULL    |                |
+-------------------+--------------+------+-----+---------+----------------+
Both tables contain a customer id, so customer_id is the natural join key.
Join in the reduce phase
In the map phase, the mapper inspects the path attribute of its InputSplit to tell which source file the current split comes from, and emits intermediate records tagged accordingly.
In the reduce phase, the intermediate records that share the same key are merged into the final output object.
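As a rough illustration (every sample value below is invented, not taken from the real data files):

// orders line    "101,2013-07-25 00:00:00.0,5,COMPLETE"  ->  key "5", value {orderId=101, orderStatus=COMPLETE, flag="1"}
// customers line "5,Alice,..."                           ->  key "5", value {customer_id=5, name=Alice, flag="0"}
// reduce(key "5") then merges the two sides into one output record: 5  101  Alice  COMPLETE  1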
Implementation:
The entity class CustomerOrders is used as the map output (intermediate) value and also carries the final output record.
package com.orderAndCustomer.entry;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class CustomerOrders implements Writable {
    private String customer_id;
    private String orderId;
    private String name;
    private String orderStatus;
    // Tag used to tell which source file a record came from
    private String flag;

    public String getCustomer_id() {
        return customer_id;
    }

    public CustomerOrders setCustomer_id(String customer_id) {
        this.customer_id = customer_id;
        return this;
    }

    public String getOrderId() {
        return orderId;
    }

    public CustomerOrders setOrderId(String orderId) {
        this.orderId = orderId;
        return this;
    }

    public String getName() {
        return name;
    }

    public CustomerOrders setName(String name) {
        this.name = name;
        return this;
    }

    public String getOrderStatus() {
        return orderStatus;
    }

    public CustomerOrders setOrderStatus(String orderStatus) {
        this.orderStatus = orderStatus;
        return this;
    }

    public String getFlag() {
        return flag;
    }

    public CustomerOrders setFlag(String flag) {
        this.flag = flag;
        return this;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.customer_id);
        out.writeUTF(this.orderId);
        out.writeUTF(this.name);
        out.writeUTF(this.orderStatus);
        out.writeUTF(this.flag);
    }

    @Override
    public String toString() {
        return customer_id + '\t' +
                orderId + '\t' +
                name + '\t' +
                orderStatus + '\t' +
                flag;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Fields must be read back in exactly the order they were written
        this.customer_id = in.readUTF();
        this.orderId = in.readUTF();
        this.name = in.readUTF();
        this.orderStatus = in.readUTF();
        this.flag = in.readUTF();
    }
}
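Because readFields must mirror write field-for-field, a quick local round-trip check can catch ordering mistakes before the job runs. The sketch below is only an illustration: WritableRoundTripCheck is a hypothetical helper class and all values are made up.

import com.orderAndCustomer.entry.CustomerOrders;

import java.io.*;

// Sketch only: verify the Writable round-trip outside of MapReduce.
public class WritableRoundTripCheck {
    public static void main(String[] args) throws IOException {
        CustomerOrders src = new CustomerOrders()
                .setCustomer_id("5").setOrderId("101")
                .setName("Alice").setOrderStatus("COMPLETE").setFlag("1");

        // Serialize with write(), then read back with readFields()
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        src.write(new DataOutputStream(bos));

        CustomerOrders copy = new CustomerOrders();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
        System.out.println(copy); // should print the same tab-separated fields as src
    }
}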
mapper:
Note that FileSplit must be imported from org.apache.hadoop.mapreduce.lib.input.FileSplit (not the older org.apache.hadoop.mapred package).
package com.orderAndCustomer.mapper;

import com.orderAndCustomer.entry.CustomerOrders;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class ReduceJoinMapper extends Mapper<LongWritable, Text, Text, CustomerOrders> {
    // Name of the source file this split comes from
    String name = "";
    // Output value
    CustomerOrders customerOrders = new CustomerOrders();
    // Output key
    Text text = new Text();

    // Each map task runs setup() once before looping over the records of its split
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the InputSplit assigned to this map task
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        // Get the name of the source file
        name = inputSplit.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Both source files are comma-separated
        String[] split = value.toString().split(",");
        if (name.startsWith("order")) {
            // Records from the orders file: tag "1"
            customerOrders.setOrderId(split[0]);
            customerOrders.setOrderStatus(split[3]);
            customerOrders.setCustomer_id(split[2]);
            customerOrders.setFlag("1");
            customerOrders.setName("");
        } else {
            // Records from the customers file: tag "0"
            customerOrders.setCustomer_id(split[0]);
            customerOrders.setName(split[1]);
            customerOrders.setFlag("0");
            customerOrders.setOrderId("");
            customerOrders.setOrderStatus("");
        }
        // customer_id is the join key
        text.set(customerOrders.getCustomer_id());
        context.write(text, customerOrders);
    }
}
reducer:
package com.orderAndCustomer.reduce;

import com.orderAndCustomer.entry.CustomerOrders;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.List;

// CustomerOrders is the output key and carries the final content; the value does not
// matter here, so NullWritable is used.
public class ReduceJoinReducer extends Reducer<Text, CustomerOrders, CustomerOrders, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<CustomerOrders> values, Context context) throws IOException, InterruptedException {
        // One customer record corresponds to many order records.
        // Holds the single customer-side record for this key
        CustomerOrders cuBean = new CustomerOrders();
        // Holds all order-side records for this key
        List<CustomerOrders> orderBeans = new ArrayList<>();
        // Iterate over the values; records tagged "1" (orders) are copied into orderBeans
        for (CustomerOrders bean : values) {
            if ("1".equals(bean.getFlag())) {
                CustomerOrders orderBean = new CustomerOrders();
                try {
                    // Copy the record, since Hadoop reuses the object behind the values iterator
                    BeanUtils.copyProperties(orderBean, bean);
                    orderBeans.add(orderBean);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
            } else {
                try {
                    BeanUtils.copyProperties(cuBean, bean);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
            }
        }
        // Attach the customer's name to every order that shares this customer_id
        for (CustomerOrders orderBean : orderBeans) {
            orderBean.setName(cuBean.getName());
            context.write(orderBean, NullWritable.get());
        }
    }
}
driver:
package com.orderAndCustomer.driver;

import com.orderAndCustomer.entry.CustomerOrders;
import com.orderAndCustomer.mapper.ReduceJoinMapper;
import com.orderAndCustomer.reduce.ReduceJoinReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class ReduceJoinDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(ReduceJoinDriver.class);

        job.setMapperClass(ReduceJoinMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(CustomerOrders.class);

        job.setReducerClass(ReduceJoinReducer.class);
        job.setOutputKeyClass(CustomerOrders.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.addInputPath(job, new Path("/home/jarvis/Desktop/data"));

        Path path = new Path("/home/jarvis/Desktop/orderAndCus");
        // If the output directory already exists, delete it
        FileSystem fs = FileSystem.get(new URI(path.toString()), configuration);
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        FileOutputFormat.setOutputPath(job, path);

        job.waitForCompletion(true);
    }
}
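With the default TextOutputFormat and a NullWritable value, each output line is just CustomerOrders.toString(), i.e. the tab-separated fields customer_id, orderId, name, orderStatus, flag. For example (made-up values):

5	101	Alice	COMPLETE	1
5	102	Alice	PENDING	1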
Conclusion: this approach pushes a large volume of data through the shuffle between the map and reduce sides, which is inefficient. Since the merge itself happens in the reduce phase, the reducers bear most of the load while the map nodes do little work, so resource utilization is poor and the reduce phase is highly prone to data skew.
Solution: perform the merge on the map side.
Join in the map phase
The orders table has 68,883 rows and the customers table has 12,435 rows.
Cache the smaller table on the map side and apply the join logic there: this moves work into the mappers, relieves the pressure on the reduce side, and minimizes data skew.
Concretely:
- In the driver, add the file to the distributed cache so it is shipped to every task node:
  job.addCacheFile(new URI("")); // cache an ordinary file onto the task nodes
- In the mapper's setup() phase, read the cached file into an in-memory collection:
  URI[] cacheFiles = context.getCacheFiles();
mapper:
package com.orderAndCustomer.mapper;

import com.orderAndCustomer.entry.CustomerOrders;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.*;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

public class MapperJoinMapper extends Mapper<LongWritable, Text, NullWritable, CustomerOrders> {
    // customer_id -> customer first name, loaded from the cached customers file
    Map<String, String> cusMap = new HashMap<>();

    // The output value object
    CustomerOrders customerOrders = new CustomerOrders();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Look up the files placed in the distributed cache
        URI[] cacheFiles = context.getCacheFiles();
        // The cached file is localized into the task's working directory under its own name
        String filename = new Path(cacheFiles[0]).getName();
        // Read the cached customers file line by line
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
        String line;
        while (StringUtils.isNotEmpty(line = bufferedReader.readLine())) {
            String[] split = line.split(",");
            // Store customer_id -> name in cusMap
            cusMap.put(split[0], split[1]);
        }
        // Close the stream
        bufferedReader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Every value here is an orders record; split it and attach the customer name
        // looked up in cusMap. The join is finished here, so no reduce phase is needed.
        String[] fields = value.toString().split(",");
        customerOrders.setCustomer_id(fields[2]);
        customerOrders.setOrderId(fields[0]);
        customerOrders.setOrderStatus(fields[3]);
        customerOrders.setName(cusMap.get(fields[2]));
        customerOrders.setFlag("1");
        context.write(NullWritable.get(), customerOrders);
    }
}
reducer:
None; this is a map-only job, so no reducer class is defined.
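Since there is no reducer, it is also worth making the job explicitly map-only; otherwise Hadoop still runs a default identity reduce task and shuffles the map output for nothing. A one-line suggested addition to the driver (not shown in the listing below) would be:

// Suggested addition: make the job map-only so mapper output is written
// directly by the OutputFormat, skipping the shuffle/sort entirely.
job.setNumReduceTasks(0);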
driver:
package com.orderAndCustomer.driver;

import com.orderAndCustomer.entry.CustomerOrders;
import com.orderAndCustomer.mapper.MapperJoinMapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class MapJoinDriver {
    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(MapJoinDriver.class);

        job.setMapperClass(MapperJoinMapper.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(CustomerOrders.class);

        // Ship the customers file to every task node via the distributed cache
        job.addCacheFile(new URI("/home/jarvis/Desktop/data/customers.csv"));

        // Input: only the orders file goes through the mapper
        FileInputFormat.setInputPaths(job, new Path("/home/jarvis/Desktop/data/orders.csv"));

        Path path = new Path("/home/jarvis/Desktop/orderAndCus");
        // If the output directory already exists, delete it
        FileSystem fs = FileSystem.get(new URI(path.toString()), configuration);
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        // Output
        FileOutputFormat.setOutputPath(job, path);

        job.waitForCompletion(true);
    }
}