Note:
1. The content mainly follows <Hadoop in Action>, Chapter 5.2.
2. The code is largely based on: http://qindongliang.iteye.com/blog/2052842
3. This implementation uses the old (mapred) API; the approach is neither particularly efficient nor concise.
Data (the original records come from the movielens-1m dataset; they have been slightly modified here to make the demonstration and self-testing easier):
File: u.user
Structure: user id | age | gender | occupation | zip code
Sample:
1|24|M|technician|85711
2|53|F|other|94043
3|23|M|writer|32067
4|24|M|technician|43537
5|33|F|other|15213
6|42|M|executive|98101
7|57|M|administrator|91344
8|36|M|administrator|05201
9|29|M|student|01002
10|53|M|lawyer|90703
File: u.data
Structure: user id | item id | rating | timestamp
Sample:
1 242 3 881250949
1 302 3 891717742
2 377 1 878887116
4 51 2 880606923
6 346 1 886397596
8 474 4 884182806
10 265 2 881171488
1 465 5 891628467
2 451 3 886324817
6 86 3 883603013
Task:
Join the contents of the two files (an inner join) and output a subset of the fields.
Output structure: user id | age | rating
Output sample:
1	age=24,ratings=3
1	age=24,ratings=3
1	age=24,ratings=5
10	age=53,ratings=2
2	age=53,ratings=1
2	age=53,ratings=3
4	age=24,ratings=2
6	age=42,ratings=1
6	age=42,ratings=3
8	age=36,ratings=4
Approach:
In the map function, each line of each input file is processed: the record is tagged with the name of the file it came from, and a groupKey is extracted from it (the field separator to use is decided from the file name).
In this example the groupKey is the user id, e.g. 1, 2, 3, ...
After the map phase, tagged records like the following are produced (rendered here as JSON):
[{
tag: 'u.user',
value: '1|24|M|technician|85711'
}, {
tag: 'u.data',
value: '1 242 3 881250949'
}, {
tag: 'u.data',
value: '1 465 5 891628467'
}]
Hadoop groups values that share a groupKey together, so all the reduce function needs to do is merge that series of values. Note that the order of the values in the list above is not fixed; there is no guarantee that the u.user record comes first. A standalone sketch of this grouping-and-pairing step follows below.
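The grouping-and-pairing idea can be illustrated without Hadoop at all. The sketch below is plain Java and is not part of the book's or the blog's code (the class name JoinSketch and the hard-coded sample records are made up for illustration): it tags a few lines, groups them by user id, and then pairs every u.user record with every u.data record of the same id, which is exactly the work the reduce side of the join performs.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Standalone illustration of the reduce-side inner join: tag, group by user id, pair.
public class JoinSketch {

    // The user id is the first field in both files; only the separator differs.
    static String groupKey(String tag, String line) {
        String sep = tag.equals("u.user") ? "[|]" : "\t";
        return line.split(sep)[0];
    }

    public static void main(String[] args) {
        // {tag, line} pairs, as the map phase would emit them.
        String[][] tagged = {
            {"u.user", "1|24|M|technician|85711"},
            {"u.data", "1\t242\t3\t881250949"},
            {"u.data", "1\t465\t5\t891628467"},
        };

        // "Shuffle": collect the tagged records per groupKey (here: per user id).
        Map<String, List<String[]>> groups = new LinkedHashMap<>();
        for (String[] rec : tagged) {
            groups.computeIfAbsent(groupKey(rec[0], rec[1]), k -> new ArrayList<>()).add(rec);
        }

        // "Reduce": for every user id, emit one joined line per (u.user, u.data) pair.
        for (Map.Entry<String, List<String[]>> entry : groups.entrySet()) {
            for (String[] user : entry.getValue()) {
                if (!user[0].equals("u.user")) continue;
                String age = user[1].split("[|]")[1];
                for (String[] data : entry.getValue()) {
                    if (!data[0].equals("u.data")) continue;
                    String rating = data[1].split("\t")[2];
                    System.out.println(entry.getKey() + "\tage=" + age + ",ratings=" + rating);
                }
            }
        }
    }
}

Running it prints "1	age=24,ratings=3" and "1	age=24,ratings=5", which matches the output format of the MapReduce job shown further down.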
Before showing the final code, note that the code printed in the English edition of <Hadoop in Action> does not run as-is on my Hadoop version; the following two changes are needed:
1. Add a default (no-argument) constructor to TaggedWritable:
public static class TaggedWritable extends TaggedMapOutput {
    public TaggedWritable() {
    }
}
Otherwise the following error is thrown, because a no-argument constructor is required when the class is instantiated via reflection:
java.lang.NoSuchMethodException: ch5.ReduceSideJoin$TaggedWritable.<init>()
2. Add null handling to readFields(), otherwise it throws a NullPointerException: the framework creates TaggedWritable instances via reflection, so data is still null when readFields() is called, and the class name serialized by write() must be read back and used to instantiate the correct Writable:
public void readFields(DataInput in) throws IOException {
    this.tag.readFields(in);
    // Added: rebuild the data field from the serialized class name,
    // otherwise a NullPointerException may be thrown below.
    String temp = in.readUTF();
    if (this.data == null || !this.data.getClass().getName().equals(temp)) {
        try {
            this.data = (Writable) ReflectionUtils.newInstance(
                    Class.forName(temp), null);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }
    this.data.readFields(in);
}
The final implementation is as follows:
package ch5;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.contrib.utils.join.DataJoinMapperBase;
import org.apache.hadoop.contrib.utils.join.DataJoinReducerBase;
import org.apache.hadoop.contrib.utils.join.TaggedMapOutput;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/***
 *
 * Reduce-side join implemented with the old API, on Hadoop 1.2.
 *
 * @author qindongliang
 *
 */
public class ReduceSideJoin extends Configured implements Tool {

    /**
     * Map implementation.
     */
    public static class MapClass extends DataJoinMapperBase {

        /**
         * Tag each record with the path of the file it was read from.
         */
        protected Text generateInputTag(String inputFile) {
            // The file path is used as the tag.
            return new Text(inputFile);
        }

        /***
         * Extract the group key (the user id) from a tagged record.
         */
        protected Text generateGroupKey(TaggedMapOutput aRecord) {
            Text tag = aRecord.getTag();
            String line = ((Text) aRecord.getData()).toString();
            if (line.trim().length() < 2) return null;
            String sep = "\t";
            if (tag.toString().contains("u.user")) {
                sep = "[|]";
            }
            String[] tokens = line.split(sep);
            String groupKey = tokens[0];
            return new Text(groupKey);
        }

        protected TaggedMapOutput generateTaggedMapOutput(Object value) {
            TaggedWritable retv = new TaggedWritable((Text) value);
            retv.setTag(this.inputTag);
            return retv;
        }
    }
    /**
     * Reduce: performs the cartesian product of the records that share a group key.
     */
    public static class Reduce extends DataJoinReducerBase {

        /***
         * Combine one record from each source into a single joined record.
         */
        protected TaggedMapOutput combine(Object[] tags, Object[] values) {
            // Records that appear in only one of the two files are dropped (inner join).
            if (tags.length < 2) return null;
            // Join the fields from both sides.
            String str = "";
            String userInfo = "";
            List<String> ratingDataList = new ArrayList<String>();
            for (int i = 0; i < tags.length; i++) {
                Text curTag = (Text) tags[i];
                String line = ((TaggedWritable) values[i]).getData().toString();
                if (curTag.toString().contains("u.user")) {
                    // u.user is '|'-separated; only the age column is needed.
                    String[] tokens = line.split("[|]");
                    userInfo = "age=" + tokens[1];
                } else {
                    // u.data is tab-separated; only the rating column is needed.
                    String[] tokens = line.split("\t");
                    ratingDataList.add(tokens[2]);
                }
            }
            str = userInfo + ",ratings=" + StringUtils.join(ratingDataList, "|");
            TaggedWritable retv = new TaggedWritable(new Text(str));
            retv.setTag((Text) tags[0]);
            return retv;
        }
    }
    /**
     * Custom map output type: a Writable tagged with its source.
     */
    public static class TaggedWritable extends TaggedMapOutput {

        private Writable data;

        /**
         * The no-argument constructor is required for reflection.
         */
        public TaggedWritable() {
        }

        public TaggedWritable(Writable data) {
            this.tag = new Text("");
            this.data = data;
        }

        public Writable getData() {
            return data;
        }

        public void write(DataOutput out) throws IOException {
            this.tag.write(out);
            // Important: record the concrete class of data so that readFields()
            // can reconstruct it on the reduce side.
            out.writeUTF(this.data.getClass().getName());
            this.data.write(out);
        }

        public void readFields(DataInput in) throws IOException {
            this.tag.readFields(in);
            // Rebuild the data field from the serialized class name,
            // otherwise a NullPointerException may be thrown below.
            String temp = in.readUTF();
            if (this.data == null || !this.data.getClass().getName().equals(temp)) {
                try {
                    this.data = (Writable) ReflectionUtils.newInstance(
                            Class.forName(temp), null);
                } catch (ClassNotFoundException e) {
                    e.printStackTrace();
                }
            }
            this.data.readFields(in);
        }
    }
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        JobConf job = new JobConf(conf, ReduceSideJoin.class);
        job.setJarByClass(ReduceSideJoin.class);
        String path = "/home/hadoop/DataSet/movielens-output";
        FileSystem fs = FileSystem.get(conf);
        Path out = new Path(path);
        if (fs.exists(out)) {
            fs.delete(out, true);
            System.out.println("Output path already exists, deleted!");
        }
        Path in = new Path("/home/hadoop/DataSet/movielens");
        FileInputFormat.setInputPaths(job, in);
        FileOutputFormat.setOutputPath(job, out);
        job.setJobName("ReduceSideJoin");
        job.setMapperClass(MapClass.class);
        job.setReducerClass(Reduce.class);
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(TaggedWritable.class);
        JobClient.runJob(job);
        return 0;
    }
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(),
                new ReduceSideJoin(),
                args);
        System.exit(res);
    }
}
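As a quick way to convince yourself that the two fixes above (the no-argument constructor and the null handling in readFields()) behave as intended, something like the following local round-trip can be run. It is only a sketch and not part of the original post: the class name TaggedWritableRoundTrip is made up, and it assumes the Hadoop 1.2 core and contrib data_join jars plus the ReduceSideJoin class above are on the classpath.

package ch5;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

import org.apache.hadoop.io.Text;

// Hypothetical helper: serializes a TaggedWritable and deserializes it into a fresh
// instance, mimicking what the framework does between the map and reduce phases.
public class TaggedWritableRoundTrip {
    public static void main(String[] args) throws Exception {
        ReduceSideJoin.TaggedWritable original =
                new ReduceSideJoin.TaggedWritable(new Text("1|24|M|technician|85711"));
        original.setTag(new Text("u.user"));

        // Serialize: write() records the tag, the class name of data, and data itself.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize into an instance created via the no-argument constructor;
        // data is null at this point, so readFields() must rebuild it from the class name.
        ReduceSideJoin.TaggedWritable copy = new ReduceSideJoin.TaggedWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(copy.getTag() + "\t" + copy.getData());
    }
}

If your version of TaggedMapOutput does not initialize the tag field in its constructor, set it yourself (e.g. this.tag = new Text()) inside the no-argument constructor before trying this.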