MapReduce Optimization: Map-Side Join and Reduce-Side Join

This post shows how to join multiple files with Hadoop MapReduce, implementing the join both on the map side and on the reduce side, with complete code examples.

If you spot any mistakes in the code, please point them out and I will fix them promptly!
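
Both versions join the same two tab-separated files (full sample contents are listed at the end of this post). My reading of the columns, which the code below assumes, is:

shop.txt              goodsId \t goodsPrice              e.g.  4107    4
shop_information.txt  goodsName \t goodsId \t goodsSales e.g.  冰淇淋  4107    2000

The map-side version broadcasts the small shop table to every mapper through the distributed cache and needs no reduce phase; the reduce-side version tags each record with its source file and lets the reducer stitch matching records together.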

import java.io.BufferedReader;

import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;


import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


/**
 * @author 吕梁彪
 * The table broadcast to every mapper (via the distributed cache) is the shop table;
 * the table streamed through the mappers and joined against it is the information table.
 */
public class Mutli_File_MapperMeger extends Configured implements Tool {
    private static String CUSTOMER_CACHE_URL = "file:/D:/IOoperation/MapReducer/input/shop.txt";

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "Mutli_File_Meger");
        job.setJarByClass(Mutli_File_MapperMeger.class);
        // The cache file URI must use this format (scheme plus absolute path)
        job.addCacheFile(URI.create(CUSTOMER_CACHE_URL));
        job.setMapperClass(Mutli_File_MegerMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // Map-side join: no reducers needed
        job.setNumReduceTasks(0);
        Path path = new Path("D:\\IOoperation\\MapReducer\\output\\out1");
        // Delete the output directory if it already exists, otherwise the job fails
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        FileInputFormat.setInputPaths(job, "D:\\IOoperation\\MapReducer\\input\\shop_information.txt");
        FileOutputFormat.setOutputPath(job, path);
        return job.waitForCompletion(true) ? 0 : 1;
    }
    public static class Mutli_File_MegerMap extends Mapper<LongWritable, Text, Text, Text> {
        // Lookup table built from the broadcast shop.txt: goodsId -> goodsPrice
        HashMap<String, String> hm = new HashMap<String, String>();
        Text k = new Text();
        Text v = new Text();
        StringBuffer sb;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Read the cached shop.txt once per map task and load it into memory
            FileSystem fs = FileSystem.get(URI.create(CUSTOMER_CACHE_URL), context.getConfiguration());
            FSDataInputStream fdis = fs.open(new Path(CUSTOMER_CACHE_URL));
            BufferedReader br = new BufferedReader(new InputStreamReader(fdis));
            String str;
            while ((str = br.readLine()) != null) {
                if (StringUtils.isEmpty(str)) {
                    continue;
                }
                String[] fields = str.split("\t");
                hm.put(fields[0], fields[1]);
            }
            br.close();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // shop_information.txt line: goodsName \t goodsId \t goodsSales
            String[] split = StringUtils.split(value.toString(), "\t");
            // Join the record against the in-memory shop table;
            // the whole joined line goodsId|goodsName|goodsPrice|goodsSales is emitted as the key, v stays empty
            sb = new StringBuffer();
            sb.append(split[1]).append("|").append(split[0])
              .append("|").append(hm.get(split[1])).append("|").append(split[2]);
            k.set(sb.toString());
            context.write(k, v);
        }
    }
    public static void main(String[] args) {
        try {
            System.out.println(ToolRunner.run(new Mutli_File_MapperMeger(), args));
        } catch (Exception e) {
            System.out.println(e.getMessage());
            e.printStackTrace();
        }
    }
}
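
For reference, this version is a map-only job (setNumReduceTasks(0)) reading from the local file system, so it can be launched straight from main() without a cluster. Tracing the code against the sample data at the end of this post, each output line should look roughly like goodsId|goodsName|goodsPrice|goodsSales, for example:

4107|冰淇淋|4|2000

(This is my own trace of the code, not output captured from the original run.)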

----------------------------------- Divider: the reduce-side join version follows -----------------------------------

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;


import org.apache.commons.beanutils.BeanUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;




/**
 * @author 吕梁彪
 * This time the join is performed in the reducer.
 * There are multiple input paths.
 * This program is meant to be packaged as a jar and run on the cluster.
 */
public class Mutil_File_ReducerMeger extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "Mutil_File_ReducerMeger");
        job.setJarByClass(Mutil_File_ReducerMeger.class);
        job.setMapperClass(Mutil_File_ReducerMeger_Mapper.class);
        job.setReducerClass(Mutil_File_ReducerMeger_Reducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LLBWritable.class);
        job.setOutputKeyClass(LLBWritable.class);
        job.setOutputValueClass(NullWritable.class);
        Path path = new Path("D:\\IOoperation\\MapReducer\\output\\out1");
        // Delete the output directory if it already exists, otherwise the job fails
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        // Both input files, comma-separated
        FileInputFormat.setInputPaths(job,
                "D:\\IOoperation\\MapReducer\\input\\shop_information.txt,D:\\IOoperation\\MapReducer\\input\\shop.txt");
        FileOutputFormat.setOutputPath(job, path);
        return job.waitForCompletion(true) ? 0 : 1;
    }
    // The bean carried between map and reduce.
    // goodsFlag marks which file a record came from: 1 = shop.txt, 2 = shop_information.txt.
    public static class LLBWritable implements Writable, Serializable {
        private static final long serialVersionUID = 1L;
        private String goodsName;
        private int goodsId;
        private int goodsPrice;
        private int goodsSales;
        private int goodsFlag;

        public void set(String goodsName, int goodsId, int goodsPrice, int goodsSales, int goodsFlag) {
            this.goodsName = goodsName;
            this.goodsId = goodsId;
            this.goodsPrice = goodsPrice;
            this.goodsSales = goodsSales;
            this.goodsFlag = goodsFlag;
        }

        public int getGoodsFlag() {
            return goodsFlag;
        }

        public void setGoodsFlag(int goodsFlag) {
            this.goodsFlag = goodsFlag;
        }

        public String getGoodsName() {
            return goodsName;
        }

        public void setGoodsName(String goodsName) {
            this.goodsName = goodsName;
        }

        public int getGoodsId() {
            return goodsId;
        }

        public void setGoodsId(int goodsId) {
            this.goodsId = goodsId;
        }

        public int getGoodsPrice() {
            return goodsPrice;
        }

        public void setGoodsPrice(int goodsPrice) {
            this.goodsPrice = goodsPrice;
        }

        public int getGoodsSales() {
            return goodsSales;
        }

        public void setGoodsSales(int goodsSales) {
            this.goodsSales = goodsSales;
        }

        @Override
        public void write(DataOutput out) throws IOException {
            out.writeUTF(goodsName);
            out.writeInt(goodsId);
            out.writeInt(goodsPrice);
            out.writeInt(goodsSales);
            out.writeInt(goodsFlag);
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            goodsName = in.readUTF();
            goodsId = in.readInt();
            goodsPrice = in.readInt();
            goodsSales = in.readInt();
            goodsFlag = in.readInt();
        }

        @Override
        public String toString() {
            return "Goods_Information [goodsName=" + goodsName + ", goodsId=" + goodsId + ", goodsPrice=" + goodsPrice
                    + ", goodsSales=" + goodsSales + "]";
        }
    }


    public static class Mutil_File_ReducerMeger_Mapper extends Mapper<LongWritable, Text, Text, LLBWritable> {
        Text k = new Text();
        LLBWritable llb = new LLBWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Use the input file name to tell the two source tables apart
            FileSplit fs = (FileSplit) context.getInputSplit();
            String name = fs.getPath().getName();
            String[] values = StringUtils.split(value.toString(), "\t");
            if (name.contains("information")) {
                // shop_information.txt line: goodsName \t goodsId \t goodsSales
                llb.set(values[0], Integer.parseInt(values[1]), 0, Integer.parseInt(values[2]), 2);
                k.set(values[1]);
            } else {
                // shop.txt line: goodsId \t goodsPrice
                llb.set("", Integer.parseInt(values[0]), Integer.parseInt(values[1]), 0, 1);
                k.set(values[0]);
            }
            // Key by goodsId so that matching records from both files meet in the same reduce call
            context.write(k, llb);
        }
    }


    public static class Mutil_File_ReducerMeger_Reducer extends Reducer<Text, LLBWritable, LLBWritable, NullWritable> {
        @Override
        protected void reduce(Text goodsId, Iterable<LLBWritable> values, Context context)
                throws IOException, InterruptedException {
            // For one goodsId we expect at most one shop record (flag 1)
            // plus the matching shop_information records (flag 2)
            ArrayList<LLBWritable> list = new ArrayList<>();
            LLBWritable llb1 = new LLBWritable();
            for (LLBWritable value : values) {
                try {
                    if (value.getGoodsFlag() == 1) {
                        // shop record: remember its price
                        BeanUtils.copyProperties(llb1, value);
                    } else {
                        // shop_information record: keep a copy, since the framework reuses the value object
                        LLBWritable llb2 = new LLBWritable();
                        BeanUtils.copyProperties(llb2, value);
                        list.add(llb2);
                    }
                } catch (Exception e) {
                    System.out.println(e.getMessage());
                    e.printStackTrace();
                }
            }
            // Fill in the price from the shop table and emit the joined record
            for (LLBWritable llb : list) {
                llb.setGoodsPrice(llb1.getGoodsPrice());
                context.write(llb, NullWritable.get());
            }
        }
    }


    public static void main(String[] args) {
        try {
            System.out.println(ToolRunner.run(new Mutil_File_ReducerMeger(), args));
        } catch (Exception e) {
            System.out.println(e.getMessage());
            e.printStackTrace();
        }
    }
}
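
Since this version is meant to run on the cluster, the D:\ paths above would normally be swapped for HDFS paths (or read from args), the project packaged as a jar, and the job submitted with something like the line below; the jar name is just a placeholder, not from the original post:

hadoop jar mr-join-demo.jar Mutil_File_ReducerMeger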

------------------------------------- shop.txt contents -------------------------------------

4107 4
5236 3
1025 1
1597 7
3649 11
5789 5
2635 8

------------------------------------- shop_information.txt contents -------------------------------------

冰淇淋 4107 2000
蛋糕 5236 3000
阿尔卑斯 1025 5000
笔记本 1597 700
台灯 3649 110
钢笔 5789 600
水印画 2635 120
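
For reference, joining the two sample files above on goodsId should yield one record per product with the price filled in from shop.txt. With the reduce-side version the output lines come from LLBWritable.toString(), roughly as follows (my own trace of the code, not captured output):

Goods_Information [goodsName=冰淇淋, goodsId=4107, goodsPrice=4, goodsSales=2000]
Goods_Information [goodsName=蛋糕, goodsId=5236, goodsPrice=3, goodsSales=3000]
...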
