利用MapReduce实现单表关联比多表关联稍微复杂一些。
例如有如下一个文件,两列数据:
<pre name="code" class="java">followed following
1 2
2 3
5 7
7 6
followed是被关注者,following是关注者。如果将其看做是一个简单的微博关注关系,我们可以从中看出:3关注了2,2关注了1,那么我们可以向3推荐1,因为1是3的偶像的偶像;6关注了7,7关注了5,则5是6的偶像的偶像,我们可以向6推荐5。于是可以形成如下的推荐列表:
followed following
1 3
5 6
意即向6推荐5,向3推荐1.
实现思路如下:首先在map中将数据读入后生成如下两个表:一个左表一个右表:
这个是右表,在following列加“a+”标志:
followed following
1 a+2
2 a+3
5 a+7
7 a+6
这个是左表,在followed列加“b+”标志:
following followed
2 b+1
3 b+2
7 b+5
6 b+7
很显然,有一部分following其自身也是followed。将这两个表的数据合在一起的时候,MapReduce中的shuffle和sort会把key相同的记录归并到同一组,这样就能找出既是following、又是followed的人,在这里是7和2,这两个人就可以被当做“中间人”。
而reduce接收到的数据是这样的(按key分组后,以“中间人”2和7为例):2 → [a+3, b+1];7 → [a+6, b+5]
<span style="background-color: rgb(255, 102, 102);">然后,将类似于这样的数据在reduce里面解析,以a+开头的是原来的following,以b+开头的是原来的followed。如果在这里出现多组a+和b+,则应该对两个集合做笛卡尔积求出结果;在这里由于只有一对,一眼可以看出结果,就可以不用做笛卡尔积直接得出结果了。这样,我们就可以知道应该给3推荐1,给6推荐5.</span>
<span style="background-color: rgb(255, 102, 102);">具体实现代码如下:</span>
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class STjoin {
public static int time = 0;
public static class Map extends Mapper<Object, Text, Text, Text> {
public void map(Object key, Text value, Context context) throws IOException, InterruptedException{
String followedName = new String();
String followingName = new String();
String leftOrRightISbORa = new String();
String line = value.toString();
int i = 0;
while(line.charAt(i) != ' '){
i++;
}
String[] values = {line.substring(0,i),line.substring(i+1)};
followedName = values[0];
followingName = values[1];
leftOrRightISbORa = "a"; //a represents the right table
context.write(new Text(values[0]), new Text(leftOrRightISbORa+"+"+followingName+"+"+followedName)); //right table
leftOrRightISbORa = "b"; //b represents the left table
context.write(new Text(values[1]), new Text(leftOrRightISbORa+"+"+followingName+"+"+followedName)); //left table
}
}
public static class Reduce extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values, Context context) throws InterruptedException,IOException {
if (time == 0) {
context.write(new Text("following"), new Text("followed")); //generate the table head
time++;
}
int followingNum = 0;
ArrayList<String> followings = new ArrayList<>();
//String followings[] = new String[100000];
int followedNum = 0;
ArrayList<String> followeds = new ArrayList<>();
//String followeds[] = new String[100000];
Iterator iterator = values.iterator();
while(iterator.hasNext()){
String record = iterator.next().toString();
int len = record.length();
int i = 2;
if (len == 0) continue;
char leftORright = record.charAt(0);
String followingName = new String();
String followedName = new String();
while (record.charAt(i)!='+') {
followingName = followingName + record.charAt(i);
i++;
}
i = i+1; //jump over the sigh of '+'
while(i<len){
followedName = followedName + record.charAt(i);
i++;
}
if (leftORright == 'a') {
followings.add(followingName);
//followings[followedNum]=followingName;
followingNum++;
}else {
followeds.add(followedName);
// followeds[followedNum]=followedName;
followedNum++;
}
if (followedNum !=0 && followingNum!=0) {
for(int m = 0;m<followingNum;m++)
for(int n = 0; n<followedNum; n++){
context.write(new Text(followings.get(m)), new Text (followeds.get(n)));
}
}
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration configuration = new Configuration();
String[] otherArgs = new GenericOptionsParser(configuration,args).getRemainingArgs();
if (otherArgs.length!=2) {
System.err.println("111111111111111111111");
System.exit(2);
} //判断参数是否是两个
<pre name="code" class="java"> FileSystem fileSystem = FileSystem.get(URI.create(otherArgs[1]),configuration);
Path path = new Path(otherArgs[1]);
if (fileSystem.exists(path)) {
fileSystem.delete(path,true);
} //如果输出地址已经存在,则删除这个地址
Job job = new Job(configuration,"single table join");job.setJarByClass(STjoin.class);job.setMapperClass(Map.class);job.setReducerClass(Reduce.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(Text.class);FileInputFormat.addInputPath(job,
new Path(otherArgs[0]));FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));System.exit((job.waitForCompletion(true) ? 0:1));}}