How to fix data skew in a Hadoop join

Key points:
1: The map output key must be the join key (the foreign key shared by the two tables).
2: The combined info bean needs a flag field to indicate which table the current bean's data comes from.
The implementation code below has been run and verified.

package join;


import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class InfoBean implements Writable {

    public InfoBean() {
    }

    public void set(String pid, String data, String price, String amount, String productName, String orderId, String categoryId,String flag) {
        this.pid = pid;
        this.data = data;
        this.price = price;
        this.amount = amount;
        this.productName = productName;
        this.orderId = orderId;
        this.categoryId = categoryId;
        this.flag = flag;
    }

    private String pid;
    private String data;
    private String price;
    private String amount;
    private String productName;
    private String orderId;
    private String categoryId;

    //flag: "0" = order record, "1" = product record
    private String flag;

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(pid);
        out.writeUTF(data);
        out.writeUTF(price);
        out.writeUTF(amount);
        out.writeUTF(productName);
        out.writeUTF(orderId);
        out.writeUTF(categoryId);
        out.writeUTF(flag);

    }

    @Override
    public void readFields(DataInput input) throws IOException {
        this.pid = input.readUTF();
        this.data = input.readUTF();
        this.price = input.readUTF();
        this.amount = input.readUTF();
        this.productName = input.readUTF();
        this.orderId = input.readUTF();
        this.categoryId = input.readUTF();
        this.flag = input.readUTF();
    }

    public String getPid() {
        return pid;
    }

    public void setPid(String pid) {
        this.pid = pid;
    }

    public String getData() {
        return data;
    }

    public void setData(String data) {
        this.data = data;
    }

    public String getPrice() {
        return price;
    }

    public void setPrice(String price) {
        this.price = price;
    }

    public String getAmount() {
        return amount;
    }

    public void setAmount(String amount) {
        this.amount = amount;
    }

    public String getProductName() {
        return productName;
    }

    public void setProductName(String productName) {
        this.productName = productName;
    }

    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public String getCategoryId() {
        return categoryId;
    }

    public void setCategoryId(String categoryId) {
        this.categoryId = categoryId;
    }

    public String getFlag() {
        return flag;
    }

    public void setFlag(String flag) {
        this.flag = flag;
    }

    @Override
    public String toString() {
        return "InfoBean{" +
                "pid='" + pid + '\'' +
                ", data='" + data + '\'' +
                ", price='" + price + '\'' +
                ", amount='" + amount + '\'' +
                ", productName='" + productName + '\'' +
                ", orderId='" + orderId + '\'' +
                ", categoryId='" + categoryId + '\'' +
                ", flag='" + flag + '\'' +
                '}';
    }
}

This bean holds every field we want in the output, plus a flag field indicating which table the record came from.
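
As a quick sanity check on the Writable contract (write() and readFields() must handle the fields in the same order), a small local test like the following, with hypothetical field values, should print the same bean back out:

package join;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

//local round-trip check for the Writable contract; the field values are hypothetical
public class InfoBeanCheck {
    public static void main(String[] args) throws Exception {
        InfoBean in = new InfoBean();
        in.set("p001", "20240501", "6999", "3", "iphone", "1001", "c01", "0");

        DataOutputBuffer out = new DataOutputBuffer();
        in.write(out);                                 //serialize via write()

        InfoBean back = new InfoBean();
        DataInputBuffer buf = new DataInputBuffer();
        buf.reset(out.getData(), out.getLength());
        back.readFields(buf);                          //deserialize via readFields()

        System.out.println(back);                      //prints the same field values
    }
}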

The MapReduce program:

package join;



import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class DataJoin {

    //the map output key is the foreign key that links the two tables
    static class JoinMapper extends Mapper<LongWritable,Text,Text,InfoBean> {

        @Override
        protected void map(LongWritable number, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] fileds = line.split(",");
            InfoBean info = new InfoBean();
            Text key = new Text();
            //InfoBean holds fields from both tables, so first check which input file this record comes from
            FileSplit split  = (FileSplit) context.getInputSplit();
            String filename = split.getPath().getName();
            if (filename.startsWith("order")){
                //order line: orderId, date, pid, amount
                info.set(fileds[2],fileds[1],"",fileds[3],"",fileds[0],"","0");
            }else {
                //product line: pid, productName, categoryId, price
                info.set(fileds[0],"",fileds[3],"",fileds[1],"",fileds[2],"1");
            }
            String pid = info.getPid();
            key.set(pid);
            context.write(key,info);
        }
    }
    static class JoinReducer extends Reducer<Text,InfoBean,InfoBean,NullWritable>{
        @Override
        protected void reduce(Text key, Iterable<InfoBean> values, Context context) throws IOException, InterruptedException {
            List<InfoBean> infoBeans = new ArrayList<>();
            InfoBean productBean = new InfoBean();
            for (InfoBean value:values){
                String flag = value.getFlag();
                if ("0".equals(flag)){
                    InfoBean bean = new InfoBean();
                    try {
                        BeanUtils.copyProperties(bean,value);
                        infoBeans.add(bean);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }else {
                    try {
                        BeanUtils.copyProperties(productBean,value);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            }
            infoBeans.forEach(
                    infoBean -> {
                        infoBean.setProductName(productBean.getProductName());
                        infoBean.setCategoryId(productBean.getCategoryId());
                        infoBean.setPrice(productBean.getPrice());
                        try {
                            context.write(infoBean,NullWritable.get());
                        } catch (Exception e) {
                            e.printStackTrace();
                        }
                    }
            );
        }
    }

    public static void main(String[] args) throws Exception{
        Configuration conf = new Configuration();

        //local debug mode
        conf.set("mapreduce.framework.name","local");
        conf.set("fs.defaultFS","file:///");

        //pass conf to the job so the settings above actually take effect
        Job job = Job.getInstance(conf);

        //specify the jar that contains this program
        job.setJarByClass(DataJoin.class);

        //set the mapper and reducer classes
        job.setMapperClass(JoinMapper.class);
        job.setReducerClass(JoinReducer.class);

        //key/value types of the map output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(InfoBean.class);

        //key/value types of the final output
        job.setOutputKeyClass(InfoBean.class);
        job.setOutputValueClass(NullWritable.class);


        //input and output directories for the job
//        FileInputFormat.setInputPaths(job,new Path("/wordcount/input"));
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        Boolean res = job.waitForCompletion(true);

        System.exit(res ? 0 : 1);
    }

}
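
For reference, here is a hypothetical pair of input lines matching the split indices used above (order.txt columns: orderId, date, pid, amount; product.txt columns: pid, productName, categoryId, price), together with the joined record the reducer would then emit:

order.txt:   1001,20240501,p001,3
product.txt: p001,iphone,c01,6999

reducer output: InfoBean{pid='p001', data='20240501', price='6999', amount='3', productName='iphone', orderId='1001', categoryId='c01', flag='0'}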

In the example above:
when one product sells extremely well, the order table contains a huge number of rows for it. Because the reducer for each record is chosen from the hash code of pid, this causes data skew: one reducer handles most of the records while the others handle very few, as the sketch below illustrates.
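
To make the skew concrete, here is a standalone sketch (not part of the job above; the pid values and reducer count are hypothetical) of how the default HashPartitioner turns a key into a reducer index, so every record with the same pid lands on the same reducer:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

//standalone illustration: HashPartitioner computes
//(key.hashCode() & Integer.MAX_VALUE) % numReduceTasks, so all records with the
//same pid go to the same reducer and a hot pid overloads that one reducer
public class SkewDemo {
    public static void main(String[] args) {
        int numReduceTasks = 4; //hypothetical reducer count
        HashPartitioner<Text, Object> partitioner = new HashPartitioner<>();
        for (String pid : new String[]{"p001", "p001", "p001", "p001", "p002", "p003"}) {
            int reducer = partitioner.getPartition(new Text(pid), null, numReduceTasks);
            System.out.println(pid + " -> reducer " + reducer);
        }
    }
}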

Solution:
Use a map-side join (no reducer is needed; only the map phase runs).
Load the product table (which is much smaller than the order table) through the distributed cache;
Hadoop already provides this mechanism (the distributed cache) for us,
so the code changes are as follows:
1: Create a Product class

public class Product {
    private String pid;
    private String productName;
    private String categoryId;
    private String price;
    //plus a set(pid, price, categoryId, productName) method and getters, written the same way as in InfoBean
}

2: The map side:

    //map-side join: the (small) product table is loaded into memory once in setup(),
    //so each order record is joined directly in map() and no reducer is needed
    static class JoinMapper extends Mapper<LongWritable,Text,Text,NullWritable> {

        //product table keyed by pid, read from the distributed-cache file
        //(needs java.util.Map / java.util.HashMap imports)
        Map<String, Product> products = new HashMap<>();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            //product.txt was registered with job.addCacheFile(), so it is available in the task's working directory
            BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(new File("product.txt"))));
            String line;
            while ((line = br.readLine()) != null) {
                if (StringUtils.isNotEmpty(line)) {
                    String[] fileds = line.split(",");
                    Product p = new Product();
                    p.set(fileds[0], fileds[3], fileds[2], fileds[1]);
                    products.put(p.getPid(), p);
                }
            }
            br.close();
        }

        @Override
        protected void map(LongWritable number, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] fileds = line.split(",");
            InfoBean info = new InfoBean();
            Text key = new Text();
            //look up the product for this order's pid; assumes every pid in order.txt exists in product.txt
            Product p = products.get(fileds[2]);
            info.set(fileds[2], fileds[1], p.getPrice(), fileds[3], p.getProductName(), fileds[0], p.getCategoryId(), "0");
            String pid = info.getPid();
            key.set(pid + "," + p.getProductName() + "," + p.getPrice());
            context.write(key, NullWritable.get());
        }
    }

3: The job driver:

         Configuration conf = new Configuration();


        //local debug mode
        conf.set("mapreduce.framework.name","local");
        conf.set("fs.defaultFS","file:///");

//        local submission mode, with HDFS online
//        conf.set("mapreduce.framework.name","local");
//        conf.set("fs.defaultFS","hdfs://master:9000");



        //pass conf to the job so the settings above actually take effect
        Job job = Job.getInstance(conf);

        //for cluster submission
        job.setJarByClass(DataJoin.class);
        //debug mode (submit an explicitly built jar)
//        job.setJar("/home/willian/Desktop/project/java/hadoop/out/jar/word.jar");

        //set the mapper (the reducer is not used in a map-side join)
        job.setMapperClass(JoinMapper.class);
//        job.setReducerClass(JoinReducer.class);

//        //key/value types of the map output
//        job.setMapOutputKeyClass(Text.class);
//        job.setMapOutputValueClass(InfoBean.class);

        //key/value types of the final output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        job.setNumReduceTasks(0); //map-only job: the join is done entirely in the mapper

//
//        job.addArchiveToClassPath(); //cache a jar archive onto the classpath of the task nodes
//        job.addFileToClassPath(); //cache a plain file onto the classpath of the task nodes
//        job.addCacheFile(); //cache a plain file into the working directory of the task nodes


        job.addCacheFile(new URI("file:///home/willian/Desktop/project/java/hadoop/product.txt"));//or an HDFS path

        //input and output directories for the job
//        FileInputFormat.setInputPaths(job,new Path("/wordcount/input"));
        FileInputFormat.setInputPaths(job, new Path("/home/willian/Desktop/project/java/hadoop/mrlocal/order.txt"));
        FileOutputFormat.setOutputPath(job, new Path("/home/willian/Desktop/project/java/hadoop/mrlocal/out"));

        Boolean res = job.waitForCompletion(true);

        System.exit(res ? 0 : 1);
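
The mapper's setup() above opens "product.txt" by its bare file name; this works because files registered with job.addCacheFile() are localized into (and, on Hadoop 2.x, symlinked from) the task's working directory. If you prefer not to hard-code the name, a minimal alternative sketch for setup() resolves it from the job context; the index 0 below assumes product.txt is the only cached file:

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            //URIs registered via job.addCacheFile(); index 0 assumes product.txt is the only cache file
            java.net.URI[] cacheFiles = context.getCacheFiles();
            String fileName = new org.apache.hadoop.fs.Path(cacheFiles[0]).getName();
            BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(new File(fileName))));
            //...read the product lines into the in-memory map as before, then br.close()
        }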