Joins in MapReduce

This post walks through the two join strategies in Hadoop MapReduce: the reduce-side join (ReduceJoin) and the map-side join (MapJoin). In a ReduceJoin, the map phase reads both tables and tags every record with the name of the table it came from, and the reduce phase performs the actual join, which incurs a shuffle. A MapJoin has only a map phase: the smaller table is distributed to every map node through the cache and the join is completed directly on the map side, so there is no shuffle, but this only works when that table is small. Sample data and the code for each module are given below.

Suppose we want to compute A join B.

  • ReduceJoin: the map phase reads both A and B. In the mapper's setup method we obtain the name of the file currently being read and store that table name in the Bean that is emitted. In the reduce phase, records from A and B that share the same key land in the same reducer; using the table name added in the map phase, the reducer first splits the values into an array of A records and an array of B records, and iterating over the two arrays completes the join.
  • MapJoin: there is only a map phase and no reduce phase. B is first shipped to every map node through the cache, and the join is completed directly on the map side.
  • Analysis
  1. In a ReduceJoin each map task only sees part of B, so a reduce phase is required to complete the join, and going through reduce produces a shuffle.
  2. In a MapJoin each map task holds all of B, so the join can be finished in the map phase alone; with no reduce phase there is no shuffle.
  3. Because every map task has to load all of B, a MapJoin is only appropriate when B is a small table.
1.ReduceJoin
  • 数据
    order.txt
    orderId,prodId,saleVol
    1,001,123
    2,001,222
    3,003,12
    4,002,34
    5,009,32
    6,001,12
    7,009,2
    8,007,123
    9,007,123
    10,008,33
    11,008,32

    proInfo.txt
    prodId,prodName,prodPrice
    001,小米1,1234
    002,小米2,222
    003,小米3,333
    004,华为1,111
    005,华为2,222
    006,华为3,444
    007,华为4,333
    008,华为5,321
    009,一加1,222

  • Bean

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class OrderBean implements Writable {
    private String orderId;
    private String prodId;
    private int saleVol;
    private String prodName;
    private float prodPrice;
    private String tag;

    public float getProdPrice() {
        return prodPrice;
    }

    public int getSaleVol() {
        return saleVol;
    }

    public String getOrderId() {
        return orderId;
    }

    public String getProdId() {
        return prodId;
    }

    public String getProdName() {
        return prodName;
    }

    public String getTag() {
        return tag;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public void setProdId(String prodId) {
        this.prodId = prodId;
    }

    public void setProdName(String prodName) {
        this.prodName = prodName;
    }

    public void setProdPrice(float prodPrice) {
        this.prodPrice = prodPrice;
    }

    public void setSaleVol(int saleVol) {
        this.saleVol = saleVol;
    }

    public void setTag(String tag) {
        this.tag = tag;
    }

    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(this.orderId);
        dataOutput.writeUTF(this.prodId);
        dataOutput.writeInt(this.saleVol);
        dataOutput.writeUTF(this.prodName);
        dataOutput.writeFloat(this.prodPrice);
        dataOutput.writeUTF(this.tag);
    }

    public void readFields(DataInput dataInput) throws IOException {
        this.orderId = dataInput.readUTF();
        this.prodId = dataInput.readUTF();
        this.saleVol = dataInput.readInt();
        this.prodName = dataInput.readUTF();
        this.prodPrice = dataInput.readFloat();
        this.tag = dataInput.readUTF();
    }

    @Override
    public String toString() {
        return  "orderId='" + orderId + '\'' +
                ", prodId='" + prodId + '\'' +
                ", prodName='" + prodName + '\'' +
                ", prodPrice=" + prodPrice +
                ", tag=" + tag;
    }
}
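The field order in write and readFields must match exactly, otherwise deserialization silently reads the wrong bytes. A minimal round-trip check (a hypothetical standalone snippet, not part of the original job) could look like this:

import java.io.*;

public class OrderBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        OrderBean in = new OrderBean();
        in.setOrderId("1");
        in.setProdId("001");
        in.setSaleVol(123);
        in.setProdName("");
        in.setProdPrice(0);
        in.setTag("order");

        // Serialize with write(), then read the bytes back with readFields().
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        in.write(new DataOutputStream(bos));

        OrderBean out = new OrderBean();
        out.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
        System.out.println(out);  // should print the same field values that were set above
    }
}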
  • Mapper
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class RJMapper extends Mapper<LongWritable,Text, Text,OrderBean> {
    private String tableName;
    private Text mapKey = new Text();
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        tableName = inputSplit.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] fields = line.split(",");
        OrderBean orderBean = new OrderBean();
        if(tableName.startsWith("order")){
            orderBean.setOrderId(fields[0]);
            orderBean.setProdId(fields[1]);
            orderBean.setSaleVol(Integer.parseInt(fields[2]));
            orderBean.setProdName("");
            orderBean.setProdPrice(0);
            orderBean.setTag("order");

        } else if(tableName.startsWith("proInfo")){
            orderBean.setProdId(fields[0]);
            orderBean.setProdName(fields[1]);
            orderBean.setProdPrice(Float.parseFloat(fields[2]));
            orderBean.setOrderId("");
            orderBean.setSaleVol(0);
            orderBean.setTag("proInfo");
        }
        mapKey.set(orderBean.getProdId());
        context.write(mapKey,orderBean);
    }
}
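For the sample data, an order.txt line such as 1,001,123 is emitted as key 001 with a bean tagged order (prodName and prodPrice left empty), while the proInfo.txt line 001,小米1,1234 is emitted as key 001 with a bean tagged proInfo (orderId and saleVol left empty), so both sides of the join meet in the same reduce group.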
  • Reducer
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
public class RJReducer extends Reducer<Text,OrderBean,Text,OrderBean> {

    @Override
    protected void reduce(Text key, Iterable<OrderBean> values, Context context) throws IOException, InterruptedException {
        ArrayList<OrderBean> orderList = new ArrayList<OrderBean>();
        ArrayList<OrderBean> infoList = new ArrayList<OrderBean>();

        for(OrderBean orderBean:values){
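            // Hadoop reuses the same OrderBean instance while iterating over values,
            // so each record must be copied into a fresh object before it is cached in a list.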
            OrderBean tmpOrderBean = new OrderBean();
            try {
                BeanUtils.copyProperties(tmpOrderBean,orderBean);
            } catch (IllegalAccessException e) {
                e.printStackTrace();
            } catch (InvocationTargetException e) {
                e.printStackTrace();
            }

            if(orderBean.getTag().equals("order")){
                orderList.add(tmpOrderBean);
            } else if(orderBean.getTag().equals("proInfo")){
                infoList.add(tmpOrderBean);
            }else{
                System.out.println("errrrrrrrrrrr");
            }
        }

        String prodId;
        for(OrderBean orderBean:orderList){
            prodId = orderBean.getProdId();
            for(OrderBean infoBean:infoList){
                if(infoBean.getProdId().equals(prodId)){
                    orderBean.setProdName(infoBean.getProdName());
                    orderBean.setProdPrice(infoBean.getProdPrice());
                    break;
                }
            }
            context.write(key,orderBean);
        }
    }
}
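Note that because the reduce key is already prodId, infoList normally holds at most one record, so the inner loop over infoList is effectively a single lookup; the same structure would also handle joins where several info records share a key.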
  • Driver
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.BasicConfigurator;

import java.io.IOException;

public class RJDriver {
    private static String HDFS_HOST = "hdfs://dong:9000";
    private static String INPUT_PATH = "hdfs:///data/joinTest";
    private static String OUTPUT_PATH = "hdfs:///data/result/joinResult";

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        BasicConfigurator.configure();
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS",HDFS_HOST);

        Job job = Job.getInstance(conf);
        job.setJarByClass(RJDriver.class);
        job.setMapperClass(RJMapper.class);
        job.setReducerClass(RJReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(OrderBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(OrderBean.class);

        Path inputPath = new Path(INPUT_PATH);
        Path outputPath = new Path(OUTPUT_PATH);

        FileInputFormat.setInputPaths(job,inputPath);
        FileOutputFormat.setOutputPath(job,outputPath);

        FileSystem fileSystem = FileSystem.get(conf);
        if(fileSystem.exists(outputPath)){
            fileSystem.delete(outputPath,true);
        }

        boolean result = job.waitForCompletion(true);
        System.exit(result?0:1);
    }
}
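INPUT_PATH points at a directory, so both order.txt and proInfo.txt are expected to sit under /data/joinTest and are read by the same mapper. With the sample data, the reducer output (key, then the bean's toString) would look roughly like:

001	orderId='1', prodId='001', prodName='小米1', prodPrice=1234.0, tag=order
003	orderId='3', prodId='003', prodName='小米3', prodPrice=333.0, tag=order
...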
2.MapJoin
  • Bean
import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class OrderBean implements Writable {
    private String orderId;
    private String prodId;
    private int saleVol;
    private String prodName;
    private float prodPrice;


    public float getProdPrice() {
        return prodPrice;
    }

    public int getSaleVol() {
        return saleVol;
    }

    public String getOrderId() {
        return orderId;
    }

    public String getProdId() {
        return prodId;
    }

    public String getProdName() {
        return prodName;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public void setProdId(String prodId) {
        this.prodId = prodId;
    }

    public void setProdName(String prodName) {
        this.prodName = prodName;
    }

    public void setProdPrice(float prodPrice) {
        this.prodPrice = prodPrice;
    }

    public void setSaleVol(int saleVol) {
        this.saleVol = saleVol;
    }

    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(this.orderId);
        dataOutput.writeUTF(this.prodId);
        dataOutput.writeInt(this.saleVol);
        dataOutput.writeUTF(this.prodName);
        dataOutput.writeFloat(this.prodPrice);
    }

    public void readFields(DataInput dataInput) throws IOException {
        this.orderId = dataInput.readUTF();
        this.prodId = dataInput.readUTF();
        this.saleVol = dataInput.readInt();
        this.prodName = dataInput.readUTF();
        this.prodPrice = dataInput.readFloat();
    }

    @Override
    public String toString() {
        return  "orderId='" + orderId + '\'' +
                ", prodId='" + prodId + '\'' +
                ", prodName='" + prodName + '\'' +
                ", prodPrice=" + prodPrice;
    }
}
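This bean is the same as the ReduceJoin version except that the tag field is gone: since the join is completed on the map side, there is no need to mark which table a record came from.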

  • Mapper
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;

public class MJMapper extends Mapper<LongWritable, Text,Text,OrderBean> {
    HashMap<String, String> hashMap = new HashMap<String, String>();
    Text mapKey = new Text();
    @Override
    protected void setup(Context context) throws IOException {
        // proInfo.txt was added to the distributed cache by the driver and is symlinked
        // into the task's working directory, so it can be opened with a relative path.
        // Read it as UTF-8 explicitly because the product names contain Chinese characters.
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream("proInfo.txt"), StandardCharsets.UTF_8));
        String line;
        String[] fields;
        // e.g. 001,小米1,1234
        while(StringUtils.isNotEmpty(line = reader.readLine())){
            fields = line.split(",");
            String val = fields[1] + "," + fields[2];
            hashMap.put(fields[0],val);
        }
        IOUtils.closeStream(reader);
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //1,001,123
        String line = value.toString();
        String[] fields = line.split(",");
        OrderBean orderBean = new OrderBean();
        orderBean.setOrderId(fields[0]);
        orderBean.setProdId(fields[1]);
        orderBean.setSaleVol(Integer.parseInt(fields[2]));
        String tmpProdInfo = hashMap.get(orderBean.getProdId());
        String[] prodInfo = tmpProdInfo.split(",");
        orderBean.setProdName(prodInfo[0]);
        orderBean.setProdPrice(Float.parseFloat(prodInfo[1]));
        mapKey.set(orderBean.getProdId());
        context.write(mapKey,orderBean);
    }
}
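If an order line referenced a prodId that is missing from proInfo.txt, hashMap.get would return null and the subsequent split call would throw a NullPointerException; the sample data covers every prodId, but a null check here would make the mapper more robust.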
  • Driver
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.BasicConfigurator;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class MJDriver {
    private static String HDFS_HOST = "hdfs://dong:9000";
    private static String INPUT_PATH = "hdfs:///data/joinTest";
    private static String OUTPUT_PATH = "hdfs:///data/result/joinResult";

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        BasicConfigurator.configure();
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS",HDFS_HOST);

        Job job = Job.getInstance(conf);
        job.setJarByClass(MJDriver.class);
        job.setMapperClass(MJMapper.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(OrderBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(OrderBean.class);

        Path inputPath = new Path(INPUT_PATH);
        Path outputPath = new Path(OUTPUT_PATH);

        FileInputFormat.setInputPaths(job,inputPath);
        FileOutputFormat.setOutputPath(job,outputPath);
        job.addCacheFile(new URI( "hdfs://dong:9000/data/proInfo.txt"));
        // The older DistributedCache API (e.g. DistributedCache.addCacheFile) is deprecated in favor of job.addCacheFile.
        job.setNumReduceTasks(0);
        FileSystem fileSystem = FileSystem.get(conf);
        if(fileSystem.exists(outputPath)){
            fileSystem.delete(outputPath,true);
        }

        boolean result = job.waitForCompletion(true);
        System.exit(result?0:1);
    }
}
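Because setNumReduceTasks(0) is set, the mapper output is written straight to the output files by the map tasks: there is no reducer and therefore no shuffle. The cache file URI must be reachable from every node, which is why it points at HDFS rather than a local path.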
