------------------------------------ Reduce-side join -------------------------------
package demo06.reducejoin;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class ReduceJoinMap extends Mapper<LongWritable, Text, Text, Text> {

    Text k2 = new Text();

    /**
     * Tag each record: the product id becomes k2 and the whole line becomes v2.
     * The file name of the split feeding k1/v1 tells the two tables apart:
     * a name starting with "p" is the product file, otherwise it is the order file.
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException {
        // Get the name of the file this split comes from
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        Path path = inputSplit.getPath();
        String pathName = path.getName();
        String line = value.toString();

        if (pathName.startsWith("p")) {
            // Product table: the product id is the first field
            String[] splits = line.split(",");
            k2.set(splits[0]);
            context.write(k2, value);
        } else {
            // Order table: the product id is the third field
            String[] splits = line.split(",");
            k2.set(splits[2]);
            context.write(k2, value);
        }
    }
}
package demo06.reducejoin;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class ReduceJoinReduce extends Reducer<Text, Text, Text, NullWritable> {

    /**
     * The join relies on Hadoop grouping all values with the same key into one collection,
     * so the product record and the order records for one product id arrive together in a
     * single reduce call. Order data goes into orderLine, product data goes into productLine,
     * and concatenating the two completes a simple join.
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
        String orderLine = "";
        String productLine = "";
        for (Text text : values) {
            // Product ids start with "p", so a value beginning with "p" is a product record.
            // Note: if several orders share one product id, this simple version keeps only the last order line.
            if (text.toString().startsWith("p")) {
                productLine = text.toString();
            } else {
                orderLine = text.toString();
            }
        }
        context.write(new Text(orderLine + "\t" + productLine), NullWritable.get());
    }
}
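The original post does not show the driver for the reduce-side join. Below is a minimal sketch of one, assuming a hypothetical class name ReduceJoinMain and hypothetical HDFS input/output paths; the map output key and value types must both be Text to match ReduceJoinMap.

package demo06.reducejoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class ReduceJoinMain {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "reduceJoin");
        job.setJarByClass(ReduceJoinMain.class);

        // Product and order files sit under the same input directory (hypothetical path),
        // so the mapper can tell them apart by file name
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://node01:8020/reducejoin/input"));

        job.setMapperClass(ReduceJoinMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(ReduceJoinReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("hdfs://node01:8020/reducejoin/output"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Because both tables are shuffled to the reducers by product id, the reduce-side join carries all the data over the network, which is its main cost compared with the map-side join below.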
----------------------------------------------------- Map-side join -------------------------------------------
Add the cache file in the driver (main) program:
//Add our cache file to the distributed cache
DistributedCache.addCacheFile(new URI("hdfs://node01:8020/cachefile/pdts.txt"), configuration);
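For context, here is a minimal driver sketch showing where that call sits. The class name MapJoinMain, the job name, and all paths except the cache file are assumptions; since the join completes in the mapper, the number of reduce tasks is set to zero.

package demo07.mapjoin;

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class MapJoinMain {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        // Register the small product table as a cache file before the job is created
        DistributedCache.addCacheFile(new URI("hdfs://node01:8020/cachefile/pdts.txt"), configuration);

        Job job = Job.getInstance(configuration, "mapJoin");
        job.setJarByClass(MapJoinMain.class);

        // Only the (large) order table is read as normal job input (hypothetical path)
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://node01:8020/mapjoin/input"));

        job.setMapperClass(MapJoinMap.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // The join is finished in the mapper, so no reducers (and no shuffle) are needed
        job.setNumReduceTasks(0);

        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("hdfs://node01:8020/mapjoin/output"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The mapper below loads the cached product file in setup() and performs the lookup in map().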
package demo07.mapjoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;

public class MapJoinMap extends Mapper<LongWritable, Text, Text, NullWritable> {

    // In-memory map filled in setup(): product id -> product line
    HashMap<String, String> map;

    /**
     * Override setup() to fetch the cache file and load it into the map: the file is read
     * through the FileSystem API and the products are stored as an in-memory collection.
     * map() then matches each order record against that collection.
     */
    @Override
    protected void setup(Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
        map = new HashMap<String, String>();
        // Get the configuration from the context
        Configuration configuration = context.getConfiguration();
        // Only one cache file was registered in the driver
        URI[] caches = DistributedCache.getCacheFiles(configuration);
        URI cacheFile = caches[0];
        // cacheFile looks like hdfs://node01:8020/.....
        // Get a FileSystem for that URI
        FileSystem fileSystem = FileSystem.get(cacheFile, configuration);
        // Open an input stream on the cached file
        FSDataInputStream fsDataInputStream = fileSystem.open(new Path(cacheFile));
        // Read the stream as text, line by line
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(fsDataInputStream));
        String line;
        while ((line = bufferedReader.readLine()) != null) {
            String[] lineArray = line.split(",");
            // Key the product table by its product id (first field)
            map.put(lineArray[0], line);
        }
        bufferedReader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
        String[] splits = value.toString().split(",");
        // Look up the product record by the order's product id (third field)
        String product = map.get(splits[2]);
        // Concatenate the order line with its matching product line
        context.write(new Text(value.toString() + "\t" + product), NullWritable.get());
    }
}
This article looked at the two join strategies in Hadoop MapReduce: the map-side join and the reduce-side join. It focused on how the map-side join uses cached files to combine the two datasets, as well as the uses, advantages, and drawbacks of both approaches in big-data processing.