Requirement:
Find all grandfather-grandson pairs.
Source data:
Son Father
Tom Lucy
Tom Jack
Jone Lucy
Jone Jack
Lucy Mary
Lucy Ben
Jack Alice
Jack Jesse
Terry Alice
Terry Jesse
Philip Terry
Philip Alma
Mark Terry
Mark Alma
reduce-join result:
Grandson Father Grandfather
Tom Jack Alice
Tom Jack Jesse
Jone Jack Alice
Jone Jack Jesse
Tom Lucy Ben
Tom Lucy Mary
Jone Lucy Ben
Jone Lucy Mary
Philip Terry Alice
Philip Terry Jesse
Mark Terry Alice
Mark Terry Jesse
map-join result:
Tom Lucy Ben
Tom Jack Jesse
Jone Lucy Ben
Jone Jack Jesse
Lucy Mary null
Lucy Ben null
Jack Alice null
Jack Jesse null
Terry Alice null
Terry Jesse null
Philip Terry Jesse
Philip Alma null
Mark Terry Jesse
Mark Alma null
reduce-join analysis:
Finding three-generation relationships is essentially a self-join of one table, with the father as the join key. Each input record therefore has to be emitted twice, producing the two copies of the table being joined; one copy must also be flipped, keyed on the other column, so that matching records share the same map key and the self-join takes effect. Note, too, that every value sent to the reducer needs a tag marking which side it came from; otherwise the reducer cannot tell a grandson record from a grandfather record.
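For example, the mapper shown below turns the single record Tom\tLucy into two outputs:
(Lucy, "s#\tTom"), keyed on the father, the value being his son
(Tom, "g#\tLucy"), keyed on the son, the value being his father, i.e. a candidate grandfather
The record Lucy\tMary likewise contributes (Lucy, "g#\tMary"), so the key Lucy groups s#\tTom together with g#\tMary, and the reducer can emit Tom Lucy Mary.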
reduce-join implementation:
package mr.day04;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.LinkedList;
/**
* @ClassName: SonDriver
* @Description: grandfather-grandson query, reduce-side join version
* @Author: xuezhouyi
* @Version: V1.0
**/
public class SonDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(SonDriver.class);
job.setMapperClass(Mapper1.class);
job.setReducerClass(Reducer1.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
FileSystem fileSystem = FileSystem.get(conf);
if (fileSystem.exists(new Path(args[1]))) {
fileSystem.delete(new Path(args[1]), true);
}
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
private static class Mapper1 extends Mapper<LongWritable, Text, Text, Text> {
Text k = new Text();
Text v = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] split = value.toString().split("\t");
/* skip malformed lines */
if (split.length != 2) {
return;
}
/* values prefixed with s# mark a son; the key is his father */
k.set(split[1].trim());
v.set("s#\t" + split[0]);
context.write(k, v);
/* values prefixed with g# mark a parent of the key person, i.e. a candidate grandfather */
k.set(split[0].trim());
v.set("g#\t" + split[1]);
context.write(k, v);
}
}
private static class Reducer1 extends Reducer<Text, Text, Text, NullWritable> {
Text k = new Text();
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
/* two buffers that temporarily hold the two sides of the join */
LinkedList<String> son = new LinkedList<>();
LinkedList<String> grand = new LinkedList<>();
for (Text value : values) {
String[] split = value.toString().split("\t");
/* collect sons */
if ("s#".equals(split[0]))
son.add(split[1].trim());
/* collect grandfathers */
if ("g#".equals(split[0]))
grand.add(split[1].trim());
}
/* a three-generation relationship exists only when both buffers are non-empty */
if (son.size() > 0 && grand.size() > 0) {
for (String s : son) {
for (String g : grand) {
/* nested loop: expand every son x grandfather pair */
k.set(s + "\t" + key.toString() + "\t" + g);
context.write(k, NullWritable.get());
}
}
}
}
}
}
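A hypothetical invocation of the job (the jar name and paths are placeholders, not from the original post):

hadoop jar mr-demo.jar mr.day04.SonDriver /data/relations.txt /data/out-reduce-join

args[0] is the relation file and args[1] the output directory; the driver deletes the output directory first if it already exists.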
map-join analysis:
Strictly speaking this problem is a poor fit for a map-join, because the cached table has no primary key: every child appears twice, once per parent, so the HashMap used below keeps only the last parent read for each child. That is why the map-join result above shows at most one grandfather per record, and null where the father has no entry in the cache. In real business scenarios a small dimension table does have a primary key, and a map-join then speeds the job up considerably; the code here is purely for demonstration.
map-join implementation:
package mr.day04;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
/**
* @ClassName: SonDriver
* @Description: grandfather-grandson query, map-side join version (same fully-qualified class name as the reduce-join driver, so rename one or keep them in separate source trees)
* @Author: xuezhouyi
* @Version: V1.0
**/
public class SonDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(SonDriver.class);
job.setMapperClass(Mapper1.class);
/* no separate map-output types needed, since there is no reducer */
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
/* register the cached table (third command-line argument) */
job.addCacheFile(new URI(args[2]));
/* map-only job: no reduce tasks */
job.setNumReduceTasks(0);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
FileSystem fileSystem = FileSystem.get(conf);
if (fileSystem.exists(new Path(args[1]))) {
fileSystem.delete(new Path(args[1]), true);
}
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
private static class Mapper1 extends Mapper<LongWritable, Text, Text, NullWritable> {
/* in-memory container for the cached table */
HashMap<String, String> tmp = new HashMap<>();
Text k = new Text();
@Override
protected void setup(Context context) throws IOException, InterruptedException {
URI[] cacheFiles = context.getCacheFiles();
/* only one cached table in this job */
String path = cacheFiles[0].getPath();
FileSystem fileSystem = FileSystem.get(context.getConfiguration());
FSDataInputStream is = fileSystem.open(new Path(path));
BufferedReader br = new BufferedReader(new InputStreamReader(is));
String line = "";
/* load the table into the map; note that duplicate keys overwrite each other, which a real primary key would never produce */
while ((line = br.readLine()) != null) {
String[] split = line.split("\t");
tmp.put(split[0].trim(), split[1].trim());
}
br.close();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] split = value.toString().split("\t");
if (split.length != 2) {
return;
}
/* look up the grandfather through the father */
String son = split[0].trim();
String father = split[1].trim();
String grand = tmp.get(father);
/* assemble and emit */
k.set(son + "\t" + father + "\t" + grand);
context.write(k, NullWritable.get());
}
}
}
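A hypothetical invocation; because this demo self-joins a single table, the cached file (args[2]) is simply the relation file again (jar name and paths are placeholders):

hadoop jar mr-demo.jar mr.day04.SonDriver /data/relations.txt /data/out-map-join /data/relations.txt

If the map-join had to reproduce the full reduce-join output, the cache would need to keep every parent of a child rather than only the last one read. A minimal standalone sketch of such a cache (class and method names are illustrative, not part of the original code):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

public class MultiParentCache {
    /* a child can have several parents, so keep a list per key */
    private final HashMap<String, List<String>> tmp = new HashMap<>();

    /* called once per line of the cached table, e.g. "Tom\tLucy" */
    public void addLine(String line) {
        String[] split = line.split("\t");
        if (split.length != 2) {
            return;
        }
        tmp.computeIfAbsent(split[0].trim(), x -> new ArrayList<>()).add(split[1].trim());
    }

    /* every known parent of a person (empty list if none) */
    public List<String> parentsOf(String name) {
        return tmp.getOrDefault(name, new ArrayList<>());
    }
}

The map method would then loop over parentsOf(father) and write one record per grandfather, matching the reduce-join output and avoiding the null rows.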
Extension:
When joining a large table with a small table, a map-join is usually the recommended choice, because it avoids the data-skew problems a reduce-join can run into.
If several small tables need to go into the cache, register them like this:
/* register two cache files */
URI[] uris = {new URI(args[2]), new URI(args[3])};
job.setCacheFiles(uris);
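(Note: setCacheFiles replaces the job's whole cache-file list in one call, whereas the addCacheFile used earlier appends a single entry.)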
Then load them in the Mapper's setup method. The snippets below are assumed to live in a Mapper that declares two lookup fields, e.g. HashMap<Integer, String> users and HashMap<Integer, String> movies, plus a reusable Text k for the output key:
@Override
protected void setup(Context context) throws IOException, InterruptedException {
String userPath = "";
String moviePath = "";
URI[] cacheFiles = context.getCacheFiles();
/* assign each cached path by matching the file name */
for (URI cacheFile : cacheFiles) {
String path = cacheFile.getPath();
if (path.contains("user")) {
userPath = path;
} else if (path.contains("movie")) {
moviePath = path;
}
}
/* open buffered readers over both cached files */
FileSystem fileSystem = FileSystem.get(context.getConfiguration());
FSDataInputStream userIS = fileSystem.open(new Path(userPath));
BufferedReader userBR = new BufferedReader(new InputStreamReader(userIS));
FSDataInputStream movieIS = fileSystem.open(new Path(moviePath));
BufferedReader movieBR = new BufferedReader(new InputStreamReader(movieIS));
String line = "";
/* cache the users table into the map */
while ((line = userBR.readLine()) != null) {
String[] split = line.split("::");
users.put(Integer.parseInt(split[0].trim()), split[1].trim() + "\t" + split[2].trim() + "\t" + split[3].trim() + "\t" + split[4].trim());
}
userBR.close();
/* cache the movies table into the map */
while ((line = movieBR.readLine()) != null) {
String[] split = line.split("::");
movies.put(Integer.parseInt(split[0].trim()), split[1].trim() + "\t" + split[2].trim());
}
movieBR.close();
}
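Because setup runs once per map task, each small table is read from HDFS once per task instead of once per input record.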
Finally, the map-join itself takes place in the map method:
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] split = value.toString().split("::");
if (split.length != 4) {
return;
}
int userId = Integer.parseInt(split[0]);
int movieId = Integer.parseInt(split[1]);
/* look up the small-table rows for these ids */
String user = users.get(userId);
String movie = movies.get(movieId);
/* assemble and emit */
k.set(userId + "\t" + movieId + "\t" + user + "\t" + movie);
context.write(k, NullWritable.get());
}
This moves the join from the reduce side up to the map side, which removes the join from the shuffle entirely and improves execution efficiency considerably.
Summary: this post answered a grandfather-grandson query with MapReduce in two ways, a reduce-join and a map-join. The reduce-join implements a self-join by emitting each record twice with flipped keys and a source tag; the map-join suits joins against small tables that have a primary key, runs faster, and, when joining a large table with a small one, avoids the data skew a reduce-join can suffer.