MapReduce joins come in two forms:
1. map-side join
2. reduce-side join
Approach:
1. Map-side join: one (small) data set is shipped to every task through the DistributedCache and loaded into an in-memory Map collection in setup(). Because the whole data set has to fit in memory, only small data sets qualify; otherwise this approach does not apply, which is why the business scenarios for it are fairly limited.
The setup() code looks like this (the official Hadoop site has sample source):
// in-memory lookup table for the small (customer) data set
private final Map<String, String> customer = new HashMap<String, String>();

@Override
public void setup(Context context) throws IOException, InterruptedException {
    Configuration configuration = context.getConfiguration();
    // the cache files registered with job.addCacheFile() in the driver
    URI[] uri = Job.getInstance(configuration).getCacheFiles();
    Path path = new Path(uri[0]);
    FileSystem fileSystem = FileSystem.get(configuration);
    InputStream inputStream = fileSystem.open(path);
    InputStreamReader inputStreamReader = new InputStreamReader(inputStream);
    BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
    String line = null;
    while ((line = bufferedReader.readLine()) != null) {
        if (line.trim().length() > 0) {
            // key each record by its first column (the customer id)
            customer.put(line.split(",")[0], line);
        }
    }
    bufferedReader.close();
    inputStreamReader.close();
    inputStream.close();
}
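The map() method of this mapper is not shown above; the following is only a minimal sketch of what it might look like, assuming the large (order) data set is the regular map input, the customer id sits in the second column of each order record, customer is the HashMap filled in setup(), and the output types are Text/Text (none of these details are given in the original):
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String[] fields = value.toString().split(",");
    // assumption: column 1 of the order record holds the customer id
    String customerLine = customer.get(fields[1]);
    if (customerLine != null) {
        // inner join: emit only orders whose customer id was found in the cached map
        context.write(new Text(fields[1]), new Text(customerLine + "," + value.toString()));
    }
}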
In the driver's main() you must call job.addCacheFile() and pass it the URI of the small data set.
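A minimal driver sketch for this registration (the class names and HDFS paths below are placeholders, not from the original):
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "map-side join");
job.setJarByClass(MapJoinDriver.class);                        // hypothetical driver class
job.setMapperClass(MapJoinMapper.class);                       // the mapper whose setup() is shown above
job.setNumReduceTasks(0);                                      // map-only job
// register the small (customer) data set so every map task can read it in setup()
job.addCacheFile(new Path("/input/customer.csv").toUri());     // hypothetical path
FileInputFormat.addInputPath(job, new Path("/input/order"));   // the large data set is the normal input
FileOutputFormat.setOutputPath(job, new Path("/output/mapjoin"));
System.exit(job.waitForCompletion(true) ? 0 : 1);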
2. Reduce-side join
Both data sets to be joined are fed in as ordinary map input. In map() each record is tagged with the data set it came from; in reduce() the records that share a key are merged. A custom value type is needed to carry the tag.
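As an illustration only (the column layout is inferred from the mapper code below and the sample values are made up), the two inputs and the joined output could look like:
customer data: 1,alex,13800000000 (3 columns: customer id, name, phone)
order data: 1001,1,99.9,2020-10-01 (4 columns: order id, customer id, price, date)
joined output: 1    alex,13800000000,1001,99.9,2020-10-01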
Custom data type
package com.kfk.hadoop.mr.join;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class DataJoinWritable implements Writable {

    private int id;
    private String data;

    public DataJoinWritable() {
    }

    public DataJoinWritable(int id, String data) {
        this.set(id, data);
    }

    public void set(int id, String data) {
        this.id = id;
        this.data = data;
    }

    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeInt(this.id);
        dataOutput.writeUTF(this.data);
    }

    public void readFields(DataInput dataInput) throws IOException {
        this.id = dataInput.readInt();
        this.data = dataInput.readUTF();
    }

    public int getId() {
        return id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public String getData() {
        return data;
    }

    public void setData(String data) {
        this.data = data;
    }

    @Override
    public String toString() {
        return "DataJoinWritable{" +
                "id=" + id +
                ", data='" + data + '\'' +
                '}';
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        DataJoinWritable that = (DataJoinWritable) o;
        if (id != that.id) return false;
        return data != null ? data.equals(that.data) : that.data == null;
    }

    @Override
    public int hashCode() {
        int result = id;
        result = 31 * result + (data != null ? data.hashCode() : 0);
        return result;
    }
}
Tag class (identifies which data set a record came from)
package com.kfk.hadoop.mr.join;

public class DataCommon {
    public static final int CUSTOMER = 1;
    public static final int ORDER = 2;
}
Mapper code
public static class ReduceJoinMapper extends Mapper<LongWritable, Text, Text, DataJoinWritable> {

    private final Text outKey = new Text();
    private final DataJoinWritable outValue = new DataJoinWritable();

    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] line = value.toString().split(",");
        if (line.length == 3) {
            // customer record (3 columns): key by column 0 (customer id), tag as CUSTOMER
            outKey.set(line[0]);
            outValue.set(DataCommon.CUSTOMER, line[1] + "," + line[2]);
        } else {
            // order record (4 columns): key by column 1 (customer id), tag as ORDER
            outKey.set(line[1]);
            outValue.set(DataCommon.ORDER, line[0] + "," + line[2] + "," + line[3]);
        }
        context.write(outKey, outValue);
    }
}
Reducer code
public static class ReduceJoinReducer extends Reducer<Text, DataJoinWritable, Text, Text> {

    @Override
    public void reduce(Text key, Iterable<DataJoinWritable> values, Context context) throws IOException, InterruptedException {
        // declared per key, otherwise state from the previous key would leak into this one
        String customer = null;
        List<String> order = new ArrayList<String>();
        for (DataJoinWritable dataJoinWritable : values) {
            if (dataJoinWritable.getId() == DataCommon.CUSTOMER) {
                customer = dataJoinWritable.getData();
            } else if (dataJoinWritable.getId() == DataCommon.ORDER) {
                order.add(dataJoinWritable.getData());
            }
        }
        // a key with no customer record produces no output (inner join)
        if (customer == null) {
            return;
        }
        for (String o : order) {
            context.write(key, new Text(customer + "," + o));
        }
    }
}
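The driver for the reduce-side join is not included above; the following is only a sketch under the assumption that the two inputs live in separate HDFS directories (class names and paths are made up). The important point it illustrates is that the map output value class must be set to DataJoinWritable while the final output is plain Text:
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "reduce-side join");
    job.setJarByClass(ReduceJoinDriver.class);   // hypothetical enclosing class

    job.setMapperClass(ReduceJoinMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(DataJoinWritable.class);

    job.setReducerClass(ReduceJoinReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // both data sets go in as ordinary map input; paths are placeholders
    FileInputFormat.addInputPath(job, new Path("/input/customer"));
    FileInputFormat.addInputPath(job, new Path("/input/order"));
    FileOutputFormat.setOutputPath(job, new Path("/output/reducejoin"));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}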