本文在《Hadoop实战》中多表关联例子的基础上做了修改,因为感觉书中的例子限制性较大。
使用了两个数据文件,一个文件是factory.txt,数据如下:
factoryName addressId
Beijing Red Star 1
Shenzhen Thunder 3
Guangzhou Honda 2
Beijing Rising 1
Guangzhou Development Bank 2
Tencent 3
Bank of Beijing 1
另一个是address.txt,数据如下:
addressId addressName
1 Beijing
2 Guangzhou
3 Shenzhen
4 Xian
最终结果:
factoryName addreddName
Bank of Beijing Beijing
Beijing Rising Beijing
Beijing Red Star Beijing
Guangzhou Development Bank Guangzhou
Guangzhou Honda Guangzhou
Tencent Shenzhen
Shenzhen Thunder Shenzhen
使用了两个数据文件,一个文件是factory.txt,数据如下:
factoryName addressId
Beijing Red Star 1
Shenzhen Thunder 3
Guangzhou Honda 2
Beijing Rising 1
Guangzhou Development Bank 2
Tencent 3
Bank of Beijing 1
另一个是address.txt,数据如下:
addressId addressName
1 Beijing
2 Guangzhou
3 Shenzhen
4 Xian
最终结果:
factoryName addreddName
Bank of Beijing Beijing
Beijing Rising Beijing
Beijing Red Star Beijing
Guangzhou Development Bank Guangzhou
Guangzhou Honda Guangzhou
Tencent Shenzhen
Shenzhen Thunder Shenzhen
package com.more.table;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class STjoin {
public static int time = 0;
// Beijing Red Star 1
// 1 Beijing
public static class SMap extends Mapper{
public void map(Object key, Text value, Context context) throws IOException, InterruptedException{
String line = value.toString();
// 输入文件首行不做处理
if(line.contains("factoryName") || line.contains("addressId")){
return;
}
String[] values = line.split(" ");
// 第一个数据
String firstVal = values[0];
// 数组最后一个数据
String lastVal = values[values.length-1];
Pattern pattern = Pattern.compile("^[1-9]d*$");
Matcher isNum = pattern.matcher(lastVal);
// 判断开头/结束是否为数字,对字段进行分割
// 对如果数据的第一个为数字的话则第一个是addressId
// 否则最后一个是addressId
if(isNum.matches()){
//
// 左表 aaa 11 substring包头不包尾
context.write(new Text(lastVal), new Text("1+" + line.substring(0, line.length() - lastVal.length())));
}else{
// 11 aaa
context.write(new Text(values[0]), new Text("2+" + line.substring(firstVal.length())));
}
}
}
// reduce
public static class SReduce extends Reducer{
public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException{
// <1,1+Beijing Red Star><1,1+Bank of Beijing><1,2+Beijing>
// shuffle后的值:<1,<1+Beijing Red Star><1+Bank of Beijing><2+Beijing>>
if(time == 0){
// 输出表头
context.write(new Text("factoryName"), new Text("addreddName"));
time++;
}
// 工厂名称list
List factoryList = new ArrayList();
// 地址list
List addressList = new ArrayList();
Iterator ite = values.iterator();
while(ite.hasNext()){
String record = ite.next().toString();
char type = record.charAt(0);
if(type == '1'){
// 左表
factoryList.add(record.substring(2));
}else{
// 右表
addressList.add(record.substring(2));
}
}
// 求笛卡尔积
if(factoryList.size() > 0 && addressList.size() > 0){
for (String factory : factoryList) {
for (String address : addressList) {
// 输出结果
context.write(new Text(factory), new Text(address));
}
}
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if(otherArgs.length != 2){
System.err.println("Usage: join ");
System.exit(2);
}
@SuppressWarnings("deprecation")
Job job = new Job(conf, "Join job");
job.setNumReduceTasks(1);
job.setJarByClass(STjoin.class);
job.setMapperClass(SMap.class);
job.setReducerClass(SReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}