ReduceJoin
作用:在reduce端进行多个数据源的连接。
map: 处理来自不同文件的输入,将不同文件的连接字段作为key,其他字段作为value,输出到reduce.
reduce:此时连接字段(key)相同的数据被分到同一个reduce中,进行字段合并即可.
缺点:在map端和reduce端IO过多,导致效率低。
本例使用reduceJoin实现问题1
问题描述:
1、 任意多个数据源的内连接
输入有两个文件,一个名为factory的输入文件包含描述工厂名和其对应地址ID的表,另一个名为address的输入文件包含描述地址名和其ID的表格。请编写一个程序输出工厂名和其对应地址的名字。
输入:输入有两个文件,第一个描述了工厂名和对应地址的ID,第二个输入文件描述了地址名和其ID。
输出:输出是一个包含工厂名和其对应地名的文件。
【数据样例】 输入:
①factory.txt:
factoryname addressID
Beijing Red Star 1
Shenzhen Thunder 3
Guangzhou Honda 2
Beijing Rising 1
Guangzhou Development Bank 2
Tencent 3
Bank of Beijing 1
Nanchang Univ 5
Shanghai Bank 10
②address.txt:
addressID addressname
1 Beijing
2 Guangzhou
3 Shenzhen
4 Xian
11 Chengdu
输出(以下输出为内连接的结果)
factoryname addressID addressname
Bank of Beijing 1 Beijing
Beijing Rising 1 Beijing
Beijing Red Star 1 Beijing
Guangzhou Development Bank 2 Guangzhou
Guangzhou Honda 2 Guangzhou
Tencent 3 Shenzhen
Shenzhen Thunder 3 Shenzhen
要求:输出文件的第一行必须是“factoryname addressID addressname”
2、选做题,上述数据如果改为左外(右外)或外连接,程序应该怎么修改
3、如果上述两个表格数据量很大,尝试改进程序(可以自己模拟数据测试)
说明: 数据连接实验可以使用基本MapReduce或者使用Hadoop DataJoin工具包来写。
Bean
/**
 * Value object passed from the mappers to the reducers of the reduce-side join.
 *
 * <p>One instance carries either a factory record (factory name + address ID) or an
 * address record (address ID + address name); {@code type} tells the two apart.
 * {@link #toString()} renders the joined output line
 * {@code facName<TAB>addID<TAB>addName}.
 *
 * <p>The no-arg constructor leaves the String fields {@code null}. {@link #write}
 * serializes a {@code null} String as the empty string, so a field that was never
 * set is read back as {@code ""} by {@link #readFields}.
 */
public class MyBean implements Writable {
    private String facName;
    private int addID;
    private String addName;
    private String type;

    /** Creates an empty bean; Writable values need a no-arg constructor for deserialization. */
    public MyBean() {
        super();
    }

    /**
     * Creates a fully populated bean.
     *
     * @param facName factory name
     * @param addID   address ID (the join key)
     * @param addName address name
     * @param type    tag naming the source table of this record
     */
    public MyBean(String facName, int addID, String addName, String type) {
        this.facName = facName;
        this.addID = addID;
        this.addName = addName;
        this.type = type;
    }

    /** Returns the tab-separated output line: factory name, address ID, address name. */
    @Override
    public String toString() {
        return facName + "\t" + addID + "\t" + addName;
    }

    /**
     * Serializes the fields in the order facName, addID, addName, type.
     * {@code DataOutput.writeUTF} rejects {@code null}, so unset Strings are written as "".
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(nullToEmpty(facName));
        out.writeInt(addID);
        out.writeUTF(nullToEmpty(addName));
        out.writeUTF(nullToEmpty(type));
    }

    /** Restores the fields in exactly the order {@link #write} emitted them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.facName = in.readUTF();
        this.addID = in.readInt();
        this.addName = in.readUTF();
        this.type = in.readUTF();
    }

    /** Maps {@code null} to the empty string so the value can be passed to writeUTF. */
    private static String nullToEmpty(String s) {
        return s == null ? "" : s;
    }

    // getters & setters

    public String getFacName() {
        return facName;
    }

    public void setFacName(String facName) {
        this.facName = facName;
    }

    public int getAddID() {
        return addID;
    }

    public void setAddID(int addID) {
        this.addID = addID;
    }

    public String getAddName() {
        return addName;
    }

    public void setAddName(String addName) {
        this.addName = addName;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }
}
MyReduceJoin类
/**
 * Outline of the reduce-side join job: a mapper, a reducer and the driver entry point.
 * The bodies are intentionally empty here; this block only shows the overall layout.
 */
public class MyReduceJoin {

    // mapper: Text key -> MyBean value
    public static class MyReduceJoinMapper
            extends Mapper<LongWritable, Text, Text, MyBean> {
    }

    // reducer: emits Text lines with a NullWritable value
    public static class MyReduceJoinReducer
            extends Reducer<Text, MyBean, Text, NullWritable> {
    }

    // driver
    public static void main(String[] args) {
    }
}
Mapper
/**
 * Mapper of the reduce-side join.
 *
 * <p>Reads lines from {@code factory.txt} ({@code <factory name> <address ID>}) and from
 * any other input file, which is treated as the address table
 * ({@code <address ID> <address name>}). Each data line is emitted with the address ID as
 * key and a {@link MyBean} tagged {@code "factory"} or {@code "address"} as value.
 *
 * <p>Header lines and blank lines are skipped. Lines that cannot be parsed (no separator,
 * non-numeric address ID) are skipped and counted in the job counter
 * {@code MyReduceJoin / MALFORMED_RECORDS} instead of failing the task.
 */
public static class MyReduceJoinMapper extends Mapper<LongWritable, Text, Text, MyBean> {
    // Output key and value, reused for every record to avoid per-line allocation.
    Text k1 = new Text();
    MyBean myBean = new MyBean();

    /**
     * Parses one input line and emits it keyed by address ID.
     *
     * @param key     byte offset of the line (unused)
     * @param value   the raw input line
     * @param context task context used to find the source file and to emit output
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString().trim();
        if (line.isEmpty()) {
            return; // blank line, e.g. a trailing newline at the end of the file
        }

        FileSplit split = (FileSplit) context.getInputSplit();
        String filename = split.getPath().getName();

        if ("factory.txt".equals(filename)) { // e.g. "Beijing Red Star 1"
            if (line.startsWith("factoryname")) {
                return; // header line; matching the full column name keeps real factories
            }
            // The ID is the last token; everything before it is the (multi-word) name.
            int sep = Math.max(line.lastIndexOf(' '), line.lastIndexOf('\t'));
            if (sep < 0) {
                countMalformed(context);
                return;
            }
            emit(context, line.substring(0, sep).trim(), line.substring(sep + 1), "", "factory");
        } else { // e.g. "1 Beijing"
            if (line.startsWith("addressID")) {
                return; // header line
            }
            // Limit 2: the ID is the first token, the rest is the (possibly multi-word) name.
            String[] parts = line.split("\\s+", 2);
            if (parts.length < 2) {
                countMalformed(context);
                return;
            }
            emit(context, "", parts[0], parts[1], "address");
        }
    }

    /**
     * Fills the reusable key/value pair and writes it, or counts the record as malformed
     * when {@code idText} is not a valid integer.
     */
    private void emit(Context context, String facName, String idText, String addName,
            String type) throws IOException, InterruptedException {
        int addID;
        try {
            addID = Integer.parseInt(idText);
        } catch (NumberFormatException e) {
            countMalformed(context);
            return;
        }
        // Key on the parsed number so that "01" and "1" land in the same reduce group.
        k1.set(Integer.toString(addID));
        myBean.setFacName(facName);
        myBean.setAddID(addID);
        myBean.setAddName(addName);
        myBean.setType(type);
        context.write(k1, myBean);
    }

    /** Records one skipped, unparseable input line in the job counters. */
    private void countMalformed(Context context) {
        context.getCounter("MyReduceJoin", "MALFORMED_RECORDS").increment(1);
    }
}
Reducer 实现数据连接
/**
 * Reducer of the reduce-side join: performs the inner join for one address ID.
 *
 * <p>All values of a key share the same address ID. Values tagged {@code "factory"} are
 * collected; any other value is treated as the address record and supplies the address
 * name. One output line per factory is written only when an address record was seen, so
 * factories without a matching address (and addresses without factories) produce nothing.
 */
public static class MyReduceJoinReducer extends Reducer<Text, MyBean, Text, NullWritable> {

    /**
     * Writes the header line. Setup runs once per reduce task, so every output part
     * file produced by this reducer starts with the header.
     */
    @Override
    protected void setup(Context context)
            throws IOException, InterruptedException {
        context.write(new Text("factoryname\taddressID\taddressname"), NullWritable.get());
    }

    /**
     * Joins the factory records of one address ID with that ID's address name.
     *
     * @param key     the address ID shared by all values
     * @param values  factory and address records for this ID
     * @param context task context used to emit the joined lines
     */
    @Override
    protected void reduce(Text key, Iterable<MyBean> values,
            Context context) throws IOException, InterruptedException {
        ArrayList<MyBean> factories = new ArrayList<>(); // records from the factory table
        String addressName = null;                       // stays null if no address record

        for (MyBean value : values) {
            if ("factory".equals(value.getType())) {
                // The framework reuses the value object between iterations, so keep a copy.
                // An explicit copy cannot fail silently the way a reflective one can.
                factories.add(new MyBean(value.getFacName(), value.getAddID(),
                        value.getAddName(), value.getType()));
            } else {
                addressName = value.getAddName();
            }
        }

        if (addressName == null) {
            return; // inner join: no address for this ID, drop its factories
        }
        for (MyBean factory : factories) {
            factory.setAddName(addressName);
            context.write(new Text(factory.toString()), NullWritable.get());
        }
    }
}
Driver
public static void main(String[] args) throws
IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {
Configuration configuration = new Configuration();
Job job = Job.getInstance(configuration);
job.setJarByClass(MyReduceJoin.class);
job.setMapperClass(MyReduceJoinMapper.class);
job.setReducerClass(MyReduceJoinReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(MyBean.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.setInputPaths(job, new Path(args[0]);
FileOutputFormat.setOutputPath(job, new Path(args[1]));
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}