Requirement:
Find all grandfather-grandson pairs.
Source data:
Son Father
Tom Lucy
Tom Jack
Jone Lucy
Jone Jack
Lucy Mary
Lucy Ben
Jack Alice
Jack Jesse
Terry Alice
Terry Jesse
Philip Terry
Philip Alma
Mark Terry
Mark Alma
reduce-join result:
Grandson Father Grandfather
Tom Jack Alice
Tom Jack Jesse
Jone Jack Alice
Jone Jack Jesse
Tom Lucy Ben
Tom Lucy Mary
Jone Lucy Ben
Jone Lucy Mary
Philip Terry Alice
Philip Terry Jesse
Mark Terry Alice
Mark Terry Jesse
map-join result:
Tom Lucy Ben
Tom Jack Jesse
Jone Lucy Ben
Jone Jack Jesse
Lucy Mary null
Lucy Ben null
Jack Alice null
Jack Jesse null
Terry Alice null
Terry Jesse null
Philip Terry Jesse
Philip Alma null
Mark Terry Jesse
Mark Alma null
reduce-join analysis:
Finding three-generation relationships is essentially a self-join of one table, with the father as the join key. Each input record therefore has to be emitted twice, producing the two copies of the table being joined; one copy must also be flipped, keyed on the other column, so that matching records share the same map key and the self-join takes effect. Note, too, that every value sent to the reducer needs a tag marking which side it came from; otherwise the reducer cannot tell a grandson record from a grandfather record.
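For example, the mapper shown below turns the single record Tom\tLucy into two outputs:
(Lucy, "s#\tTom"), keyed on the father, the value being his son
(Tom, "g#\tLucy"), keyed on the son, the value being his father, i.e. a candidate grandfather
The record Lucy\tMary likewise contributes (Lucy, "g#\tMary"), so the key Lucy groups s#\tTom together with g#\tMary, and the reducer can emit Tom Lucy Mary.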
reduce-join implementation:
package mr.day04;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.LinkedList;
/**
* @ClassName: SonDriver
* @Description: grandfather-grandson query, reduce-side join version
* @Author: xuezhouyi
* @Version: V1.0
**/
public class SonDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(SonDriver.class);
job.setMapperClass(Mapper1.class);
job.setReducerClass(Reducer1.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
FileSystem fileSystem = FileSystem.get(conf);
if (fileSystem.exists(new Path(args[1]))) {
fileSystem.delete(new Path(args[1]), true);
}
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
private static class Mapper1 extends Mapper<LongWritable, Text, Text, Text> {
Text k = new Text();
Text v = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] split = value.toString().split("\t");
/* skip malformed lines */
if (split.length != 2) {
return;
}
/* values prefixed with s# mark a son; the key is his father */
k.set(split[1].trim());
v.set("s#\t" + split[0]);
context.write(k, v);
/* values prefixed with g# mark a parent of the key person, i.e. a candidate grandfather */
k.set(split[0].trim());
v.set("g#\t" + split[1]);
context.write(k, v);
}
}
private static class Reducer1 extends Reducer<Text, Text, Text, NullWritable> {
Text k = new Text();
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
/* two buffers that temporarily hold the two sides of the join */
LinkedList<String> son = new LinkedList<>();
LinkedList<String> grand = new LinkedList<>();
for (Text value : values) {
String[] split = value.toString().split("\t");
/* collect sons */
if ("s#".equals(split[0]))
son.add(split[1].trim());
/* collect grandfathers */
if ("g#".equals(split[0]))
grand.add(split[1].trim());
}
/* a three-generation relationship exists only when both buffers are non-empty */
if (son.size() > 0 && grand.size() > 0) {
for (String s : son) {
for (String g : grand) {
/* nested loop: expand every son x grandfather pair */
k.set(s + "\t" + key.toString() + "\t" + g);
context.write(k, NullWritable.get());
}
}
}
}
}
}
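A hypothetical invocation of the job (the jar name and paths are placeholders, not from the original post):

hadoop jar mr-demo.jar mr.day04.SonDriver /data/relations.txt /data/out-reduce-join

args[0] is the relation file and args[1] the output directory; the driver deletes the output directory first if it already exists.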
map-join analysis:
Strictly speaking this problem is a poor fit for a map-join, because the cached table has no primary key: every child appears twice, once per parent, so the HashMap used below keeps only the last parent read for each child. That is why the map-join result above shows at most one grandfather per record, and null where the father has no entry in the cache. In real business scenarios a small dimension table does have a primary key, and a map-join then speeds the job up considerably; the code here is purely for demonstration.
map-join implementation:
package mr.day04;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
/**
* @ClassName: SonDriver
* @Description: grandfather-grandson query, map-side join version (same fully-qualified class name as the reduce-join driver, so rename one or keep them in separate source trees)
* @Author: xuezhouyi
* @Version: V1.0
**/
public class SonDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(SonDriver.class);
job.setMapperClass(Mapper1.class);
/* no separate map-output types needed, since there is no reducer */
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
/* register the cached table (third command-line argument) */
job.addCacheFile(new URI(args[2]));
/* map-only job: no reduce tasks */
job.setNumReduceTasks(0);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
FileSystem fileSystem = FileSystem.get(conf);
if (fileSystem.exists(new Path(args[1]))) {
fileSystem.delete(new Path(args[1]), true);
}
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
private static class Mapper1 extends Mapper<LongWritable, Text, Text, NullWritable> {
/* in-memory container for the cached table */
HashMap<String, String> tmp = new HashMap<>();
Text k = new Text();
@Override
protected void setup(Context context) throws IOException, InterruptedException {
URI[] cacheFiles = context.getCacheFiles();
/* only one cached table in this job */
String path = cacheFiles[0].getPath();
FileSystem fileSystem = FileSystem.get(context.getConfiguration());
FSDataInputStream is = fileSystem.open(new Path(path));
BufferedReader br = new BufferedReader(new InputStreamReader(is));
String line = "";
/* load the table into the map; note that duplicate keys overwrite each other, which a real primary key would never produce */
while ((line = br.readLine()) != null) {
String[] split = line.split("\t");
tmp.put(split[0].trim(), split[1].trim());
}
br.close();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] split = value.toString().split("\t");
if (split.length != 2) {
return;
}
/* look up the grandfather through the father */
String son = split[0].trim();
String father = split[1].trim();
String grand = tmp.get(father);
/* assemble and emit */
k.set(son + "\t" + father + "\t" + grand);
context.write(k, NullWritable.get());
}
}
}
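A hypothetical invocation; because this demo self-joins a single table, the cached file (args[2]) is simply the relation file again (jar name and paths are placeholders):

hadoop jar mr-demo.jar mr.day04.SonDriver /data/relations.txt /data/out-map-join /data/relations.txt

If the map-join had to reproduce the full reduce-join output, the cache would need to keep every parent of a child rather than only the last one read. A minimal standalone sketch of such a cache (class and method names are illustrative, not part of the original code):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

public class MultiParentCache {
    /* a child can have several parents, so keep a list per key */
    private final HashMap<String, List<String>> tmp = new HashMap<>();

    /* called once per line of the cached table, e.g. "Tom\tLucy" */
    public void addLine(String line) {
        String[] split = line.split("\t");
        if (split.length != 2) {
            return;
        }
        tmp.computeIfAbsent(split[0].trim(), x -> new ArrayList<>()).add(split[1].trim());
    }

    /* every known parent of a person (empty list if none) */
    public List<String> parentsOf(String name) {
        return tmp.getOrDefault(name, new ArrayList<>());
    }
}

The map method would then loop over parentsOf(father) and write one record per grandfather, matching the reduce-join output and avoiding the null rows.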
Extension:
When joining a large table with a small table, a map-join is usually the recommended choice, because it avoids the data-skew problems a reduce-join can run into.
If several small tables need to go into the cache, register them like this:
/* register two cache files */
URI[] uris = {new URI(args[2]), new URI(args[3])};
job.setCacheFiles(uris);
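(Note: setCacheFiles replaces the job's whole cache-file list in one call, whereas the addCacheFile used earlier appends a single entry.)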
Then load them in the Mapper's setup method. The snippets below are assumed to live in a Mapper that declares two lookup fields, e.g. HashMap<Integer, String> users and HashMap<Integer, String> movies, plus a reusable Text k for the output key:
@Override
protected void setup(Context context) throws IOException, InterruptedException {
String userPath = "";
String moviePath = "";
URI[] cacheFiles = context.getCacheFiles();
/* assign each cached path by matching the file name */
for (URI cacheFile : cacheFiles) {
String path = cacheFile.getPath();
if (path.contains("user")) {
userPath = path;
} else if (path.contains("movie")) {
moviePath = path;
}
}
/* open buffered readers over both cached files */
FileSystem fileSystem = FileSystem.get(context.getConfiguration());
FSDataInputStream userIS = fileSystem.open(new Path(userPath));
BufferedReader userBR = new BufferedReader(new InputStreamReader(userIS));
FSDataInputStream movieIS = fileSystem.open(new Path(moviePath));
BufferedReader movieBR = new BufferedReader(new InputStreamReader(movieIS));
String line = "";
/* cache the users table into the map */
while ((line = userBR.readLine()) != null) {
String[] split = line.split("::");
users.put(Integer.parseInt(split[0].trim()), split[1].trim() + "\t" + split[2].trim() + "\t" + split[3].trim() + "\t" + split[4].trim());
}
userBR.close();
/* cache the movies table into the map */
while ((line = movieBR.readLine()) != null) {
String[] split = line.split("::");
movies.put(Integer.parseInt(split[0].trim()), split[1].trim() + "\t" + split[2].trim());
}
movieBR.close();
}
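Because setup runs once per map task, each small table is read from HDFS once per task instead of once per input record.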
Finally, the map-join itself takes place in the map method:
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] split = value.toString().split("::");
if (split.length != 4) {
return;
}
int userId = Integer.parseInt(split[0]);
int movieId = Integer.parseInt(split[1]);
/* look up the small-table rows for these ids */
String user = users.get(userId);
String movie = movies.get(movieId);
/* assemble and emit */
k.set(userId + "\t" + movieId + "\t" + user + "\t" + movie);
context.write(k, NullWritable.get());
}
This moves the join from the reduce side up to the map side, which removes the join from the shuffle entirely and improves execution efficiency considerably.
Summary: this post answered a grandfather-grandson query with MapReduce in two ways, a reduce-join and a map-join. The reduce-join implements a self-join by emitting each record twice with flipped keys and a source tag; the map-join suits joins against small tables that have a primary key, runs faster, and, when joining a large table with a small one, avoids the data skew a reduce-join can suffer.