Suppose we want to compute A join B.
- ReduceJoin: the map phase reads both A and B. In the mapper's setup method we obtain the name of the file (table) the current split comes from and store it as a tag inside the bean. In the reduce phase, records from A and B that share the same key arrive at the same reducer; the tag added in the map phase is used to split the values into a list from A and a list from B, and iterating over the two lists completes the join.
- MapJoin: there is only a map phase and no reduce phase. Table B is first shipped to every map task via the distributed cache, and the join is then performed directly on the map side.
- Analysis
  - In ReduceJoin, each map task only sees part of B, so the join can only be completed in the reduce phase, and going through reduce means a shuffle.
  - In MapJoin, each map task holds all of B, so the join finishes in the map phase alone; with no reduce phase there is no shuffle.
  - Because every map task must load all of B into memory, MapJoin is only suitable when B is a small table (one way to check this before submitting the job is sketched below).
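A minimal, illustrative sketch (not from the original post): before submitting the job, check the size of the candidate "small" table on HDFS and only pick the map-side join when it fits some memory budget. The class name, path argument, and 64 MB threshold are my assumptions, purely for illustration.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.IOException;

public class JoinStrategyChooser {
    // rough per-map-task memory budget for the cached table; illustrative only
    private static final long SMALL_TABLE_LIMIT = 64L * 1024 * 1024;

    public static boolean useMapJoin(Configuration conf, String smallTablePath) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        long len = fs.getFileStatus(new Path(smallTablePath)).getLen();
        // a map join is only safe when every map task can hold the whole table in memory
        return len < SMALL_TABLE_LIMIT;
    }
}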
1. ReduceJoin
- Data
order.txt
orderId,prodId,saleVol
1,001,123
2,001,222
3,003,12
4,002,34
5,009,32
6,001,12
7,009,2
8,007,123
9,007,123
10,008,33
11,008,32
proInfo.txt
prodId,prodName,prodPrice
001,小米1,1234
002,小米2,222
003,小米3,333
004,华为1,111
005,华为2,222
006,华为3,444
007,华为4,333
008,华为5,321
009,一加1,222
- Bean
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// Join bean carrying the union of the order and product fields, plus a "tag"
// recording which table a record came from.
public class OrderBean implements Writable {
    private String orderId;
    private String prodId;
    private int saleVol;
    private String prodName;
    private float prodPrice;
    private String tag;

    public float getProdPrice() {
        return prodPrice;
    }
    public int getSaleVol() {
        return saleVol;
    }
    public String getOrderId() {
        return orderId;
    }
    public String getProdId() {
        return prodId;
    }
    public String getProdName() {
        return prodName;
    }
    public String getTag() {
        return tag;
    }
    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }
    public void setProdId(String prodId) {
        this.prodId = prodId;
    }
    public void setProdName(String prodName) {
        this.prodName = prodName;
    }
    public void setProdPrice(float prodPrice) {
        this.prodPrice = prodPrice;
    }
    public void setSaleVol(int saleVol) {
        this.saleVol = saleVol;
    }
    public void setTag(String tag) {
        this.tag = tag;
    }

    // readFields must read the fields in exactly the order write wrote them.
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(this.orderId);
        dataOutput.writeUTF(this.prodId);
        dataOutput.writeInt(this.saleVol);
        dataOutput.writeUTF(this.prodName);
        dataOutput.writeFloat(this.prodPrice);
        dataOutput.writeUTF(this.tag);
    }

    public void readFields(DataInput dataInput) throws IOException {
        this.orderId = dataInput.readUTF();
        this.prodId = dataInput.readUTF();
        this.saleVol = dataInput.readInt();
        this.prodName = dataInput.readUTF();
        this.prodPrice = dataInput.readFloat();
        this.tag = dataInput.readUTF();
    }

    @Override
    public String toString() {
        return "orderId='" + orderId + '\'' +
                ", prodId='" + prodId + '\'' +
                ", prodName='" + prodName + '\'' +
                ", prodPrice=" + prodPrice +
                ", tag=" + tag;
    }
}
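Since a Writable only works if readFields consumes the fields in exactly the order write produced them, a quick local round-trip check can catch ordering mistakes before the job runs. The test harness below is my own sketch (not part of the original post); it only uses the OrderBean defined above.
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class OrderBeanRoundTripTest {
    public static void main(String[] args) throws IOException {
        OrderBean in = new OrderBean();
        in.setOrderId("1");
        in.setProdId("001");
        in.setSaleVol(123);
        in.setProdName("");
        in.setProdPrice(0);
        in.setTag("order");

        // serialize to a byte buffer, just as Hadoop does during the shuffle
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        in.write(new DataOutputStream(bytes));

        // deserialize into a fresh bean and compare by eye
        OrderBean out = new OrderBean();
        out.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(out); // should show the same field values that were written
    }
}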
- Mapper
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;

public class RJMapper extends Mapper<LongWritable, Text, Text, OrderBean> {
    private String tableName;
    private Text mapKey = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // The file name of the current split tells us which table this map task is reading.
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        tableName = inputSplit.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] fields = line.split(",");
        OrderBean orderBean = new OrderBean();
        if (tableName.startsWith("order")) {
            // order.txt: orderId,prodId,saleVol
            orderBean.setOrderId(fields[0]);
            orderBean.setProdId(fields[1]);
            orderBean.setSaleVol(Integer.parseInt(fields[2]));
            orderBean.setProdName("");
            orderBean.setProdPrice(0);
            orderBean.setTag("order");
        } else if (tableName.startsWith("proInfo")) {
            // proInfo.txt: prodId,prodName,prodPrice
            orderBean.setProdId(fields[0]);
            orderBean.setProdName(fields[1]);
            orderBean.setProdPrice(Float.parseFloat(fields[2]));
            orderBean.setOrderId("");
            orderBean.setSaleVol(0);
            orderBean.setTag("proInfo");
        }
        // The join key is the product id, so records of both tables for one product meet in one reduce call.
        mapKey.set(orderBean.getProdId());
        context.write(mapKey, orderBean);
    }
}
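One caveat: if the header rows shown in the sample listings above (orderId,prodId,saleVol and prodId,prodName,prodPrice) are actually stored in the files rather than just documenting the schema, Integer.parseInt / Float.parseFloat will throw NumberFormatException on the first record. A hypothetical guard at the top of map(), assuming those exact header tokens, could be:
        // skip a header row if the input files contain one (my addition, not in the original code)
        if (line.startsWith("orderId,") || line.startsWith("prodId,")) {
            return;
        }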
- Reducer
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

public class RJReducer extends Reducer<Text, OrderBean, Text, OrderBean> {
    @Override
    protected void reduce(Text key, Iterable<OrderBean> values, Context context) throws IOException, InterruptedException {
        ArrayList<OrderBean> orderList = new ArrayList<OrderBean>();
        ArrayList<OrderBean> infoList = new ArrayList<OrderBean>();
        for (OrderBean orderBean : values) {
            // Hadoop reuses the value object while iterating, so copy it before buffering.
            OrderBean tmpOrderBean = new OrderBean();
            try {
                BeanUtils.copyProperties(tmpOrderBean, orderBean);
            } catch (IllegalAccessException e) {
                e.printStackTrace();
            } catch (InvocationTargetException e) {
                e.printStackTrace();
            }
            // Split the values into the two source tables using the tag set in the mapper.
            if (orderBean.getTag().equals("order")) {
                orderList.add(tmpOrderBean);
            } else if (orderBean.getTag().equals("proInfo")) {
                infoList.add(tmpOrderBean);
            } else {
                System.out.println("unexpected tag: " + orderBean.getTag());
            }
        }
        // Nested-loop join of the two buffered lists.
        String prodId;
        for (OrderBean orderBean : orderList) {
            prodId = orderBean.getProdId();
            for (OrderBean infoBean : infoList) {
                if (infoBean.getProdId().equals(prodId)) {
                    orderBean.setProdName(infoBean.getProdName());
                    orderBean.setProdPrice(infoBean.getProdPrice());
                    break;
                }
            }
            context.write(key, orderBean);
        }
    }
}
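Note that the map output key is already prodId, so every call to reduce() only sees records for a single product, and infoList normally holds at most one bean; the inner prodId comparison is therefore redundant. A condensed sketch of the same reduce body (my rewrite, assuming proInfo.txt really has one row per prodId) could look like this:
        OrderBean info = null;
        ArrayList<OrderBean> orders = new ArrayList<OrderBean>();
        for (OrderBean b : values) {
            OrderBean copy = new OrderBean();
            try {
                // copy because Hadoop reuses the value instance across iterations
                BeanUtils.copyProperties(copy, b);
            } catch (IllegalAccessException | InvocationTargetException e) {
                throw new IOException(e);
            }
            if ("proInfo".equals(b.getTag())) {
                info = copy;        // the single product-info record for this prodId
            } else {
                orders.add(copy);   // all order records for this prodId
            }
        }
        for (OrderBean order : orders) {
            if (info != null) {
                order.setProdName(info.getProdName());
                order.setProdPrice(info.getProdPrice());
            }
            context.write(key, order);
        }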
- Driver
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.BasicConfigurator;
import java.io.IOException;

public class RJDriver {
    private static String HDFS_HOST = "hdfs://dong:9000";
    private static String INPUT_PATH = "hdfs:///data/joinTest";
    private static String OUTPUT_PATH = "hdfs:///data/result/joinResult";

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        BasicConfigurator.configure();
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", HDFS_HOST);
        Job job = Job.getInstance(conf);
        job.setJarByClass(RJDriver.class);
        job.setMapperClass(RJMapper.class);
        job.setReducerClass(RJReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(OrderBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(OrderBean.class);
        Path inputPath = new Path(INPUT_PATH);
        Path outputPath = new Path(OUTPUT_PATH);
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);
        // Remove a leftover output directory so the job does not fail on an existing path.
        FileSystem fileSystem = FileSystem.get(conf);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
2. MapJoin
- Bean
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// Same bean as in the ReduceJoin version, minus the tag field: with a map-side join
// there is no reduce phase, so records no longer need to carry their source table.
public class OrderBean implements Writable {
    private String orderId;
    private String prodId;
    private int saleVol;
    private String prodName;
    private float prodPrice;

    public float getProdPrice() {
        return prodPrice;
    }
    public int getSaleVol() {
        return saleVol;
    }
    public String getOrderId() {
        return orderId;
    }
    public String getProdId() {
        return prodId;
    }
    public String getProdName() {
        return prodName;
    }
    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }
    public void setProdId(String prodId) {
        this.prodId = prodId;
    }
    public void setProdName(String prodName) {
        this.prodName = prodName;
    }
    public void setProdPrice(float prodPrice) {
        this.prodPrice = prodPrice;
    }
    public void setSaleVol(int saleVol) {
        this.saleVol = saleVol;
    }

    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(this.orderId);
        dataOutput.writeUTF(this.prodId);
        dataOutput.writeInt(this.saleVol);
        dataOutput.writeUTF(this.prodName);
        dataOutput.writeFloat(this.prodPrice);
    }

    public void readFields(DataInput dataInput) throws IOException {
        this.orderId = dataInput.readUTF();
        this.prodId = dataInput.readUTF();
        this.saleVol = dataInput.readInt();
        this.prodName = dataInput.readUTF();
        this.prodPrice = dataInput.readFloat();
    }

    @Override
    public String toString() {
        return "orderId='" + orderId + '\'' +
                ", prodId='" + prodId + '\'' +
                ", prodName='" + prodName + '\'' +
                ", prodPrice=" + prodPrice;
    }
}
- Mapper
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.*;
import java.util.HashMap;

public class MJMapper extends Mapper<LongWritable, Text, Text, OrderBean> {
    // prodId -> "prodName,prodPrice", loaded from the cached small table
    private HashMap<String, String> hashMap = new HashMap<String, String>();
    private Text mapKey = new Text();

    @Override
    protected void setup(Context context) throws IOException {
        // proInfo.txt is available in the task's working directory via the distributed cache.
        FileReader fr = new FileReader("proInfo.txt");
        BufferedReader reader = new BufferedReader(fr);
        String line;
        String[] fields;
        // e.g. 001,小米1,1234
        while (StringUtils.isNotEmpty(line = reader.readLine())) {
            fields = line.split(",");
            String val = fields[1] + "," + fields[2];
            hashMap.put(fields[0], val);
        }
        IOUtils.closeStream(reader);
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // e.g. 1,001,123
        String line = value.toString();
        String[] fields = line.split(",");
        OrderBean orderBean = new OrderBean();
        orderBean.setOrderId(fields[0]);
        orderBean.setProdId(fields[1]);
        orderBean.setSaleVol(Integer.parseInt(fields[2]));
        // Probe the in-memory product table: the join happens right here on the map side.
        String tmpProdInfo = hashMap.get(orderBean.getProdId());
        String[] prodInfo = tmpProdInfo.split(",");
        orderBean.setProdName(prodInfo[0]);
        orderBean.setProdPrice(Float.parseFloat(prodInfo[1]));
        mapKey.set(orderBean.getProdId());
        context.write(mapKey, orderBean);
    }
}
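Opening "proInfo.txt" by its bare name relies on the distributed cache exposing the file under that name in the task's working directory. If that assumption does not hold in a given environment, the cached file can be located explicitly through the job context. The alternative setup() below is a sketch of that approach for the same class (it additionally needs java.net.URI, org.apache.hadoop.fs.FileSystem, org.apache.hadoop.fs.Path, java.io.InputStreamReader and java.nio.charset.StandardCharsets imports):
    @Override
    protected void setup(Context context) throws IOException {
        // Resolve the file added via job.addCacheFile(...) instead of relying on a local symlink.
        URI[] cacheFiles = context.getCacheFiles();
        Path cachePath = new Path(cacheFiles[0]);
        FileSystem fs = FileSystem.get(context.getConfiguration());
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(fs.open(cachePath), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null && !line.isEmpty()) {
                String[] fields = line.split(",");
                hashMap.put(fields[0], fields[1] + "," + fields[2]);
            }
        }
    }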
- Driver
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.BasicConfigurator;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class MJDriver {
    private static String HDFS_HOST = "hdfs://dong:9000";
    private static String INPUT_PATH = "hdfs:///data/joinTest";
    private static String OUTPUT_PATH = "hdfs:///data/result/joinResult";

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        BasicConfigurator.configure();
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", HDFS_HOST);
        Job job = Job.getInstance(conf);
        job.setJarByClass(MJDriver.class);
        job.setMapperClass(MJMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(OrderBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(OrderBean.class);
        Path inputPath = new Path(INPUT_PATH);
        Path outputPath = new Path(OUTPUT_PATH);
        // Note: the mapper parses every input line as an order record, so the input
        // directory here should contain only order.txt.
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);
        // Ship the small table to every map task through the distributed cache.
        job.addCacheFile(new URI("hdfs://dong:9000/data/proInfo.txt"));
        //DistributedCache.addCacheArchive(new URI( "hdfs:///data/proInfo.txt"),conf);
        // Map-only job: no reducers, hence no shuffle.
        job.setNumReduceTasks(0);
        FileSystem fileSystem = FileSystem.get(conf);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
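The mapper above opens the cached file by the bare name proInfo.txt. The symlink created by the distributed cache normally takes the file's own name, but it can also be pinned explicitly with a URI fragment; a one-line variant of the driver call (same HDFS path as above, the fragment name is the one the mapper expects):
        job.addCacheFile(new URI("hdfs://dong:9000/data/proInfo.txt#proInfo.txt"));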
In summary, this post covered the two join strategies in Hadoop MapReduce. ReduceJoin tags records with their source table in the map phase and completes the join in the reduce phase, which incurs a shuffle; MapJoin has only a map phase, loading the small table into memory through the distributed cache and joining directly on the map side, so there is no shuffle, but it is only suitable for small tables. Sample data and the code for both variants are given above.