MapReduce Optimization: Map-Side Join and Reduce-Side Join

This post shows how to join multiple files with Hadoop MapReduce, implementing the join both on the map side and on the reduce side, with complete code examples.

If you spot any mistakes in the code, please point them out and I will fix them promptly!
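
Both versions join the same two tab-separated files (full sample contents are listed at the end of this post). My reading of the columns, which the code below assumes, is:

shop.txt              goodsId \t goodsPrice              e.g.  4107    4
shop_information.txt  goodsName \t goodsId \t goodsSales e.g.  冰淇淋  4107    2000

The map-side version broadcasts the small shop table to every mapper through the distributed cache and needs no reduce phase; the reduce-side version tags each record with its source file and lets the reducer stitch matching records together.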

import java.io.BufferedReader;

import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;


import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


/**
 * @author 吕梁彪
 * The table broadcast to every mapper (via the distributed cache) is the shop table;
 * the table streamed through the mappers and joined against it is the information table.
 */
public class Mutli_File_MapperMeger extends Configured implements Tool {
    private static String CUSTOMER_CACHE_URL = "file:/D:/IOoperation/MapReducer/input/shop.txt";

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "Mutli_File_Meger");
        job.setJarByClass(Mutli_File_MapperMeger.class);
        // The cache file URI must use this format (scheme plus absolute path)
        job.addCacheFile(URI.create(CUSTOMER_CACHE_URL));
        job.setMapperClass(Mutli_File_MegerMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // Map-side join: no reducers needed
        job.setNumReduceTasks(0);
        Path path = new Path("D:\\IOoperation\\MapReducer\\output\\out1");
        // Delete the output directory if it already exists, otherwise the job fails
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        FileInputFormat.setInputPaths(job, "D:\\IOoperation\\MapReducer\\input\\shop_information.txt");
        FileOutputFormat.setOutputPath(job, path);
        return job.waitForCompletion(true) ? 0 : 1;
    }
    public static class Mutli_File_MegerMap extends Mapper<LongWritable, Text, Text, Text> {
        // Lookup table built from the broadcast shop.txt: goodsId -> goodsPrice
        HashMap<String, String> hm = new HashMap<String, String>();
        Text k = new Text();
        Text v = new Text();
        StringBuffer sb;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Read the cached shop.txt once per map task and load it into memory
            FileSystem fs = FileSystem.get(URI.create(CUSTOMER_CACHE_URL), context.getConfiguration());
            FSDataInputStream fdis = fs.open(new Path(CUSTOMER_CACHE_URL));
            BufferedReader br = new BufferedReader(new InputStreamReader(fdis));
            String str;
            while ((str = br.readLine()) != null) {
                if (StringUtils.isEmpty(str)) {
                    continue;
                }
                String[] fields = str.split("\t");
                hm.put(fields[0], fields[1]);
            }
            br.close();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // shop_information.txt line: goodsName \t goodsId \t goodsSales
            String[] split = StringUtils.split(value.toString(), "\t");
            // Join the record against the in-memory shop table;
            // the whole joined line goodsId|goodsName|goodsPrice|goodsSales is emitted as the key, v stays empty
            sb = new StringBuffer();
            sb.append(split[1]).append("|").append(split[0])
              .append("|").append(hm.get(split[1])).append("|").append(split[2]);
            k.set(sb.toString());
            context.write(k, v);
        }
    }
    public static void main(String[] args) {
        try {
            System.out.println(ToolRunner.run(new Mutli_File_MapperMeger(), args));
        } catch (Exception e) {
            System.out.println(e.getMessage());
            e.printStackTrace();
        }
    }
}
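
For reference, this version is a map-only job (setNumReduceTasks(0)) reading from the local file system, so it can be launched straight from main() without a cluster. Tracing the code against the sample data at the end of this post, each output line should look roughly like goodsId|goodsName|goodsPrice|goodsSales, for example:

4107|冰淇淋|4|2000

(This is my own trace of the code, not output captured from the original run.)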

----------------------------------- Divider: the reduce-side join version follows -----------------------------------

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;


import org.apache.commons.beanutils.BeanUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;




/**
 * @author 吕梁彪
 * This time the join is performed in the reducer.
 * There are multiple input paths.
 * This program is meant to be packaged as a jar and run on the cluster.
 */
public class Mutil_File_ReducerMeger extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "Mutil_File_ReducerMeger");
        job.setJarByClass(Mutil_File_ReducerMeger.class);
        job.setMapperClass(Mutil_File_ReducerMeger_Mapper.class);
        job.setReducerClass(Mutil_File_ReducerMeger_Reducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LLBWritable.class);
        job.setOutputKeyClass(LLBWritable.class);
        job.setOutputValueClass(NullWritable.class);
        Path path = new Path("D:\\IOoperation\\MapReducer\\output\\out1");
        // Delete the output directory if it already exists, otherwise the job fails
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        // Both input files, comma-separated
        FileInputFormat.setInputPaths(job,
                "D:\\IOoperation\\MapReducer\\input\\shop_information.txt,D:\\IOoperation\\MapReducer\\input\\shop.txt");
        FileOutputFormat.setOutputPath(job, path);
        return job.waitForCompletion(true) ? 0 : 1;
    }
    // The bean carried between map and reduce.
    // goodsFlag marks which file a record came from: 1 = shop.txt, 2 = shop_information.txt.
    public static class LLBWritable implements Writable, Serializable {
        private static final long serialVersionUID = 1L;
        private String goodsName;
        private int goodsId;
        private int goodsPrice;
        private int goodsSales;
        private int goodsFlag;

        public void set(String goodsName, int goodsId, int goodsPrice, int goodsSales, int goodsFlag) {
            this.goodsName = goodsName;
            this.goodsId = goodsId;
            this.goodsPrice = goodsPrice;
            this.goodsSales = goodsSales;
            this.goodsFlag = goodsFlag;
        }

        public int getGoodsFlag() {
            return goodsFlag;
        }

        public void setGoodsFlag(int goodsFlag) {
            this.goodsFlag = goodsFlag;
        }

        public String getGoodsName() {
            return goodsName;
        }

        public void setGoodsName(String goodsName) {
            this.goodsName = goodsName;
        }

        public int getGoodsId() {
            return goodsId;
        }

        public void setGoodsId(int goodsId) {
            this.goodsId = goodsId;
        }

        public int getGoodsPrice() {
            return goodsPrice;
        }

        public void setGoodsPrice(int goodsPrice) {
            this.goodsPrice = goodsPrice;
        }

        public int getGoodsSales() {
            return goodsSales;
        }

        public void setGoodsSales(int goodsSales) {
            this.goodsSales = goodsSales;
        }

        @Override
        public void write(DataOutput out) throws IOException {
            out.writeUTF(goodsName);
            out.writeInt(goodsId);
            out.writeInt(goodsPrice);
            out.writeInt(goodsSales);
            out.writeInt(goodsFlag);
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            goodsName = in.readUTF();
            goodsId = in.readInt();
            goodsPrice = in.readInt();
            goodsSales = in.readInt();
            goodsFlag = in.readInt();
        }

        @Override
        public String toString() {
            return "Goods_Information [goodsName=" + goodsName + ", goodsId=" + goodsId + ", goodsPrice=" + goodsPrice
                    + ", goodsSales=" + goodsSales + "]";
        }
    }


    public static class Mutil_File_ReducerMeger_Mapper extends Mapper<LongWritable, Text, Text, LLBWritable> {
        Text k = new Text();
        LLBWritable llb = new LLBWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Use the input file name to tell the two source tables apart
            FileSplit fs = (FileSplit) context.getInputSplit();
            String name = fs.getPath().getName();
            String[] values = StringUtils.split(value.toString(), "\t");
            if (name.contains("information")) {
                // shop_information.txt line: goodsName \t goodsId \t goodsSales
                llb.set(values[0], Integer.parseInt(values[1]), 0, Integer.parseInt(values[2]), 2);
                k.set(values[1]);
            } else {
                // shop.txt line: goodsId \t goodsPrice
                llb.set("", Integer.parseInt(values[0]), Integer.parseInt(values[1]), 0, 1);
                k.set(values[0]);
            }
            // Key by goodsId so that matching records from both files meet in the same reduce call
            context.write(k, llb);
        }
    }


    public static class Mutil_File_ReducerMeger_Reducer extends Reducer<Text, LLBWritable, LLBWritable, NullWritable> {
        @Override
        protected void reduce(Text goodsId, Iterable<LLBWritable> values, Context context)
                throws IOException, InterruptedException {
            // For one goodsId we expect at most one shop record (flag 1)
            // plus the matching shop_information records (flag 2)
            ArrayList<LLBWritable> list = new ArrayList<>();
            LLBWritable llb1 = new LLBWritable();
            for (LLBWritable value : values) {
                try {
                    if (value.getGoodsFlag() == 1) {
                        // shop record: remember its price
                        BeanUtils.copyProperties(llb1, value);
                    } else {
                        // shop_information record: keep a copy, since the framework reuses the value object
                        LLBWritable llb2 = new LLBWritable();
                        BeanUtils.copyProperties(llb2, value);
                        list.add(llb2);
                    }
                } catch (Exception e) {
                    System.out.println(e.getMessage());
                    e.printStackTrace();
                }
            }
            // Fill in the price from the shop table and emit the joined record
            for (LLBWritable llb : list) {
                llb.setGoodsPrice(llb1.getGoodsPrice());
                context.write(llb, NullWritable.get());
            }
        }
    }


    public static void main(String[] args) {
        try {
            System.out.println(ToolRunner.run(new Mutil_File_ReducerMeger(), args));
        } catch (Exception e) {
            System.out.println(e.getMessage());
            e.printStackTrace();
        }
    }
}
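
Since this version is meant to run on the cluster, the D:\ paths above would normally be swapped for HDFS paths (or read from args), the project packaged as a jar, and the job submitted with something like the line below; the jar name is just a placeholder, not from the original post:

hadoop jar mr-join-demo.jar Mutil_File_ReducerMeger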

------------------------------------- shop.txt contents -------------------------------------

4107 4
5236 3
1025 1
1597 7
3649 11
5789 5
2635 8

------------------------------------- shop_information.txt contents -------------------------------------

冰淇淋 4107 2000
蛋糕 5236 3000
阿尔卑斯 1025 5000
笔记本 1597 700
台灯 3649 110
钢笔 5789 600
水印画 2635 120
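
For reference, joining the two sample files above on goodsId should yield one record per product with the price filled in from shop.txt. With the reduce-side version the output lines come from LLBWritable.toString(), roughly as follows (my own trace of the code, not captured output):

Goods_Information [goodsName=冰淇淋, goodsId=4107, goodsPrice=4, goodsSales=2000]
Goods_Information [goodsName=蛋糕, goodsId=5236, goodsPrice=3, goodsSales=3000]
...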
