hadoop多表关联

最新推荐文章于 2021-12-06 14:26:59 发布

最新推荐文章于 2021-12-06 14:26:59 发布 · 250 阅读

文章标签：

#大数据 #数据库

2.多表关联

多表关联和单表关联类似，它也是通过对原始数据进行一定的处理，从其中挖掘出关心的信息。下面进入这个实例。

实例描述

输入是两个文件，一个代表工厂表，包含工厂名列和地址编号列；另一个代表地址表，包含地址名列和地址编号列。要求从输入数据中找出工厂名和地址名的对应关系，输出"工厂名——地址名"表。

样例输入如下所示。

1）factory：

factoryname 　　　　addressID

Beijing Red Star 　　　　1

Shenzhen Thunder 　　　　3

Guangzhou Honda 　　　　2

Beijing Rising 　　　　1

Guangzhou Development Bank 　　　　2

Tencent 　　　　　　　　3

Back of Beijing 　　　　 1

2）address：

addressID addressname

1 　　　　Beijing

2 　　　　Guangzhou

3 　　　　Shenzhen

4 　　　　Xian

样例输出如下所示。

factoryname 　　　　addressname

Back of Beijing 　　　　 Beijing

Beijing Red Star 　　　　Beijing

Beijing Rising 　　　　　 Beijing

Guangzhou Development Bank 　　　　Guangzhou

Guangzhou Honda 　　　　Guangzhou

Shenzhen Thunder 　　　　Shenzhen

Tencent 　　　　　　　　Shenzhen

设计思路

多表关联和单表关联相似，都类似于数据库中的自然连接。相比单表关联，多表关联的左右表和连接列更加清楚，所以可以采用与单表关联相同的处理方式：map先识别出输入行属于哪个表，再对其进行分割，将连接列的值保存在key中，将另一列和左右表标识保存在value中，然后输出。reduce拿到同一个key下的所有value之后，解析value内容，根据左右表标识将左右表内容分开存放，然后求笛卡尔积，最后直接输出。

这个实例的具体分析参考单表关联实例。下面给出代码。

程序代码

程序代码如下所示：

package com.hebut.mr;

import java.io.IOException;
import java.util.*;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

publicclassMTjoin {

publicstaticinttime= 0;

/*

*在map中先区分输入行属于左表还是右表，然后对两列值进行分割，

*保存连接列在key值，剩余列和左右表标志在value中，最后输出

*/

publicstaticclassMapextendsMapper<Object, Text, Text, Text> {

//实现map函数

publicvoidmap(Object key, Text value, Context context)

throwsIOException, InterruptedException {

String line = value.toString();//每行文件

String relationtype =newString();//左右表标识

//输入文件首行，不处理

if(line.contains("factoryname") ==true

|| line.contains("addressed") ==true) {

return;

}

//输入的一行预处理文本

StringTokenizer itr =newStringTokenizer(line);

String mapkey =newString();

String mapvalue =newString();

inti = 0;

while(itr.hasMoreTokens()) {

//先读取一个单词

String token = itr.nextToken();

//判断该地址ID就把存到"values[0]"

if(token.charAt(0) >='0'&& token.charAt(0) <='9') {

mapkey = token;

if(i > 0) {

relationtype ="1";

}else{

relationtype ="2";

}

continue;

}

//存工厂名

mapvalue += token +" ";

i++;

}

//输出左右表

context.write(newText(mapkey),newText(relationtype +"+"+ mapvalue));

}

}

/*

* reduce解析map输出，将value中数据按照左右表分别保存，

　　*然后求出笛卡尔积，并输出。

*/

publicstaticclassReduceextendsReducer<Text, Text, Text, Text> {

//实现reduce函数

publicvoidreduce(Text key, Iterable<Text> values, Context context)

throwsIOException, InterruptedException {

//输出表头

if(0 ==time) {

context.write(newText("factoryname"),newText("addressname"));

time++;

}

intfactorynum = 0;

String[] factory =newString[10];

intaddressnum = 0;

String[]address=newString[10];

Iteratorite = values.iterator();

while(ite.hasNext()) {

String record = ite.next().toString();

intlen = record.length();

inti = 2;

if(0 == len) {

continue;

}

//取得左右表标识

charrelationtype = record.charAt(0);

//左表

if('1'== relationtype) {

factory[factorynum] = record.substring(i);

factorynum++;

}

//右表

if('2'== relationtype) {

address[addressnum] = record.substring(i);

addressnum++;

}

}

//求笛卡尔积

if(0 != factorynum && 0 != addressnum) {

for(intm = 0; m < factorynum; m++) {

for(intn = 0; n < addressnum; n++) {

//输出结果

context.write(newText(factory[m]),

newText(address[n]));

}

}

}

}

}

publicstaticvoidmain(String[] args)throwsException {

Configuration conf =newConfiguration();

//这句话很关键

conf.set("mapred.job.tracker","192.168.1.2:9001");

String[] ioArgs =newString[] {"MTjoin_in","MTjoin_out"};

String[] otherArgs =newGenericOptionsParser(conf, ioArgs).getRemainingArgs();

if(otherArgs.length!= 2) {

System.err.println("Usage: Multiple Table Join <in> <out>");

System.exit(2);

}

Job job =newJob(conf,"Multiple Table Join");

job.setJarByClass(MTjoin.class);

//设置Map和Reduce处理类

job.setMapperClass(Map.class);

job.setReducerClass(Reduce.class);

//设置输出类型

job.setOutputKeyClass(Text.class);

job.setOutputValueClass(Text.class);

//设置输入和输出目录

FileInputFormat.addInputPath(job,newPath(otherArgs[0]));

FileOutputFormat.setOutputPath(job,newPath(otherArgs[1]));

System.exit(job.waitForCompletion(true) ? 0 : 1);

}

}