Join operations in MapReduce
The orders table:
+-------------------+-------------+------+-----+---------+----------------+
| Field             | Type        | Null | Key | Default | Extra          |
+-------------------+-------------+------+-----+---------+----------------+
| order_id          | int(11)     | NO   | PRI | NULL    | auto_increment |
| order_date        | datetime    | NO   |     | NULL    |                |
| order_customer_id | int(11)     | NO   |     | NULL    |                |
| order_status      | varchar(45) | NO   |     | NULL    |                |
+-------------------+-------------+------+-----+---------+----------------+
The customers table:
+-------------------+--------------+------+-----+---------+----------------+
| Field             | Type         | Null | Key | Default | Extra          |
+-------------------+--------------+------+-----+---------+----------------+
| customer_id       | int(11)      | NO   | PRI | NULL    | auto_increment |
| customer_fname    | varchar(45)  | NO   |     | NULL    |                |
| customer_lname    | varchar(45)  | NO   |     | NULL    |                |
| customer_email    | varchar(45)  | NO   |     | NULL    |                |
| customer_password | varchar(45)  | NO   |     | NULL    |                |
| customer_street   | varchar(255) | NO   |     | NULL    |                |
| customer_city     | varchar(45)  | NO   |     | NULL    |                |
| customer_state    | varchar(45)  | NO   |     | NULL    |                |
| customer_zipcode  | varchar(45)  | NO   |     | NULL    |                |
+-------------------+--------------+------+-----+---------+----------------+
Both tables contain a customer id, so customer_id is the natural join key.
Join in the reduce phase
In the map phase, the mapper inspects the path attribute of its InputSplit to tell which source file the current split comes from, and emits intermediate records tagged accordingly.
In the reduce phase, the intermediate records that share the same key are merged into the final output object.
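As a rough illustration (every sample value below is invented, not taken from the real data files):

// orders line    "101,2013-07-25 00:00:00.0,5,COMPLETE"  ->  key "5", value {orderId=101, orderStatus=COMPLETE, flag="1"}
// customers line "5,Alice,..."                           ->  key "5", value {customer_id=5, name=Alice, flag="0"}
// reduce(key "5") then merges the two sides into one output record: 5  101  Alice  COMPLETE  1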
Implementation:
The entity class CustomerOrders is used as the map output (intermediate) value and also carries the final output record.
package com.orderAndCustomer.entry;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class CustomerOrders implements Writable {
    private String customer_id;
    private String orderId;
    private String name;
    private String orderStatus;
    // Tag used to tell which source file a record came from
    private String flag;

    public String getCustomer_id() {
        return customer_id;
    }

    public CustomerOrders setCustomer_id(String customer_id) {
        this.customer_id = customer_id;
        return this;
    }

    public String getOrderId() {
        return orderId;
    }

    public CustomerOrders setOrderId(String orderId) {
        this.orderId = orderId;
        return this;
    }

    public String getName() {
        return name;
    }

    public CustomerOrders setName(String name) {
        this.name = name;
        return this;
    }

    public String getOrderStatus() {
        return orderStatus;
    }

    public CustomerOrders setOrderStatus(String orderStatus) {
        this.orderStatus = orderStatus;
        return this;
    }

    public String getFlag() {
        return flag;
    }

    public CustomerOrders setFlag(String flag) {
        this.flag = flag;
        return this;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.customer_id);
        out.writeUTF(this.orderId);
        out.writeUTF(this.name);
        out.writeUTF(this.orderStatus);
        out.writeUTF(this.flag);
    }

    @Override
    public String toString() {
        return customer_id + '\t' +
                orderId + '\t' +
                name + '\t' +
                orderStatus + '\t' +
                flag;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Fields must be read back in exactly the order they were written
        this.customer_id = in.readUTF();
        this.orderId = in.readUTF();
        this.name = in.readUTF();
        this.orderStatus = in.readUTF();
        this.flag = in.readUTF();
    }
}
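Because readFields must mirror write field-for-field, a quick local round-trip check can catch ordering mistakes before the job runs. The sketch below is only an illustration: WritableRoundTripCheck is a hypothetical helper class and all values are made up.

import com.orderAndCustomer.entry.CustomerOrders;

import java.io.*;

// Sketch only: verify the Writable round-trip outside of MapReduce.
public class WritableRoundTripCheck {
    public static void main(String[] args) throws IOException {
        CustomerOrders src = new CustomerOrders()
                .setCustomer_id("5").setOrderId("101")
                .setName("Alice").setOrderStatus("COMPLETE").setFlag("1");

        // Serialize with write(), then read back with readFields()
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        src.write(new DataOutputStream(bos));

        CustomerOrders copy = new CustomerOrders();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
        System.out.println(copy); // should print the same tab-separated fields as src
    }
}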
mapper:
Note that FileSplit must be imported from org.apache.hadoop.mapreduce.lib.input.FileSplit (not the older org.apache.hadoop.mapred package).
package com.orderAndCustomer.mapper;

import com.orderAndCustomer.entry.CustomerOrders;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class ReduceJoinMapper extends Mapper<LongWritable, Text, Text, CustomerOrders> {
    // Name of the source file this split comes from
    String name = "";
    // Output value
    CustomerOrders customerOrders = new CustomerOrders();
    // Output key
    Text text = new Text();

    // Each map task runs setup() once before looping over the records of its split
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the InputSplit assigned to this map task
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        // Get the name of the source file
        name = inputSplit.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Both source files are comma-separated
        String[] split = value.toString().split(",");
        if (name.startsWith("order")) {
            // Records from the orders file: tag "1"
            customerOrders.setOrderId(split[0]);
            customerOrders.setOrderStatus(split[3]);
            customerOrders.setCustomer_id(split[2]);
            customerOrders.setFlag("1");
            customerOrders.setName("");
        } else {
            // Records from the customers file: tag "0"
            customerOrders.setCustomer_id(split[0]);
            customerOrders.setName(split[1]);
            customerOrders.setFlag("0");
            customerOrders.setOrderId("");
            customerOrders.setOrderStatus("");
        }
        // customer_id is the join key
        text.set(customerOrders.getCustomer_id());
        context.write(text, customerOrders);
    }
}
reducer:
package com.orderAndCustomer.reduce;

import com.orderAndCustomer.entry.CustomerOrders;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.List;

// CustomerOrders is the output key and carries the final content; the value does not
// matter here, so NullWritable is used.
public class ReduceJoinReducer extends Reducer<Text, CustomerOrders, CustomerOrders, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<CustomerOrders> values, Context context) throws IOException, InterruptedException {
        // One customer record corresponds to many order records.
        // Holds the single customer-side record for this key
        CustomerOrders cuBean = new CustomerOrders();
        // Holds all order-side records for this key
        List<CustomerOrders> orderBeans = new ArrayList<>();
        // Iterate over the values; records tagged "1" (orders) are copied into orderBeans
        for (CustomerOrders bean : values) {
            if ("1".equals(bean.getFlag())) {
                CustomerOrders orderBean = new CustomerOrders();
                try {
                    // Copy the record, since Hadoop reuses the object behind the values iterator
                    BeanUtils.copyProperties(orderBean, bean);
                    orderBeans.add(orderBean);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
            } else {
                try {
                    BeanUtils.copyProperties(cuBean, bean);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
            }
        }
        // Attach the customer's name to every order that shares this customer_id
        for (CustomerOrders orderBean : orderBeans) {
            orderBean.setName(cuBean.getName());
            context.write(orderBean, NullWritable.get());
        }
    }
}
driver:
package com.orderAndCustomer.driver;

import com.orderAndCustomer.entry.CustomerOrders;
import com.orderAndCustomer.mapper.ReduceJoinMapper;
import com.orderAndCustomer.reduce.ReduceJoinReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class ReduceJoinDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(ReduceJoinDriver.class);

        job.setMapperClass(ReduceJoinMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(CustomerOrders.class);

        job.setReducerClass(ReduceJoinReducer.class);
        job.setOutputKeyClass(CustomerOrders.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.addInputPath(job, new Path("/home/jarvis/Desktop/data"));

        Path path = new Path("/home/jarvis/Desktop/orderAndCus");
        // If the output directory already exists, delete it
        FileSystem fs = FileSystem.get(new URI(path.toString()), configuration);
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        FileOutputFormat.setOutputPath(job, path);

        job.waitForCompletion(true);
    }
}
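With the default TextOutputFormat and a NullWritable value, each output line is just CustomerOrders.toString(), i.e. the tab-separated fields customer_id, orderId, name, orderStatus, flag. For example (made-up values):

5	101	Alice	COMPLETE	1
5	102	Alice	PENDING	1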
Conclusion: this approach pushes a large volume of data through the shuffle between the map and reduce sides, which is inefficient. Since the merge itself happens in the reduce phase, the reducers bear most of the load while the map nodes do little work, so resource utilization is poor and the reduce phase is highly prone to data skew.
Solution: perform the merge on the map side.
Join in the map phase
The orders table has 68,883 rows and the customers table has 12,435 rows.
Cache the smaller table on the map side and apply the join logic there: this moves work into the mappers, relieves the pressure on the reduce side, and minimizes data skew.
Concretely:
- In the driver, add the file to the distributed cache so it is shipped to every task node:
  job.addCacheFile(new URI("")); // cache an ordinary file onto the task nodes
- In the mapper's setup() phase, read the cached file into an in-memory collection:
  URI[] cacheFiles = context.getCacheFiles();
mapper:
package com.orderAndCustomer.mapper;

import com.orderAndCustomer.entry.CustomerOrders;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.*;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

public class MapperJoinMapper extends Mapper<LongWritable, Text, NullWritable, CustomerOrders> {
    // customer_id -> customer first name, loaded from the cached customers file
    Map<String, String> cusMap = new HashMap<>();

    // The output value object
    CustomerOrders customerOrders = new CustomerOrders();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Look up the files placed in the distributed cache
        URI[] cacheFiles = context.getCacheFiles();
        // The cached file is localized into the task's working directory under its own name
        String filename = new Path(cacheFiles[0]).getName();
        // Read the cached customers file line by line
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
        String line;
        while (StringUtils.isNotEmpty(line = bufferedReader.readLine())) {
            String[] split = line.split(",");
            // Store customer_id -> name in cusMap
            cusMap.put(split[0], split[1]);
        }
        // Close the stream
        bufferedReader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Every value here is an orders record; split it and attach the customer name
        // looked up in cusMap. The join is finished here, so no reduce phase is needed.
        String[] fields = value.toString().split(",");
        customerOrders.setCustomer_id(fields[2]);
        customerOrders.setOrderId(fields[0]);
        customerOrders.setOrderStatus(fields[3]);
        customerOrders.setName(cusMap.get(fields[2]));
        customerOrders.setFlag("1");
        context.write(NullWritable.get(), customerOrders);
    }
}
reducer:
None; this is a map-only job, so no reducer class is defined.
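Since there is no reducer, it is also worth making the job explicitly map-only; otherwise Hadoop still runs a default identity reduce task and shuffles the map output for nothing. A one-line suggested addition to the driver (not shown in the listing below) would be:

// Suggested addition: make the job map-only so mapper output is written
// directly by the OutputFormat, skipping the shuffle/sort entirely.
job.setNumReduceTasks(0);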
driver:
package com.orderAndCustomer.driver;

import com.orderAndCustomer.entry.CustomerOrders;
import com.orderAndCustomer.mapper.MapperJoinMapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class MapJoinDriver {
    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(MapJoinDriver.class);

        job.setMapperClass(MapperJoinMapper.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(CustomerOrders.class);

        // Ship the customers file to every task node via the distributed cache
        job.addCacheFile(new URI("/home/jarvis/Desktop/data/customers.csv"));

        // Input: only the orders file goes through the mapper
        FileInputFormat.setInputPaths(job, new Path("/home/jarvis/Desktop/data/orders.csv"));

        Path path = new Path("/home/jarvis/Desktop/orderAndCus");
        // If the output directory already exists, delete it
        FileSystem fs = FileSystem.get(new URI(path.toString()), configuration);
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        // Output
        FileOutputFormat.setOutputPath(job, path);

        job.waitForCompletion(true);
    }
}