MapReduce programming model review
Join two kinds of data
Aggregate the two kinds of data by user uid.
Approach:
Map side:
No matter which file a worker happens to be reading, the map method can tell the two kinds apart through the context (the file name of the input split).
For order records, split the fields in map(), wrap them in a JoinBean, and tag it with its table name ("order").
For user records, split the fields in map(), wrap them in a JoinBean, and tag it with its table name ("user").
Then emit the uid as the key and the JoinBean as the value.
Reduce side:
Iterate over the group of JoinBeans that share the same uid and check the tag:
if the tag field is "order", add the bean to an ArrayList;
if the tag field is "user", copy it into a separate JoinBean object (the userBean).
Then traverse the ArrayList, fill each order JoinBean with the user data from the userBean, and output the joined JoinBean. A small worked example follows this outline.
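For illustration only, here is a hypothetical pair of input files and the joined output they would produce, assuming the field layout the mapper below expects (order lines: orderId,uid; user lines: uid,name,age,friend); the record values are made up:

    order.txt                 user.txt
    order001,u001             u001,zhangsan,28,football
    order002,u001             u002,lisi,30,movies
    order003,u002

    joined output (JoinBean.toString() order: orderId,uid,age,name,friend)
    order001,u001,28,zhangsan,football
    order002,u001,28,zhangsan,football
    order003,u002,30,lisi,movies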
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class JoinBean implements Writable {

    private String orderId;
    private String userId;
    private String userName;
    private int userAge;
    private String userFriend;
    private String tableName;

    public void set(String orderId, String userId, String userName, int userAge, String userFriend, String tableName) {
        this.orderId = orderId;
        this.userId = userId;
        this.userName = userName;
        this.userAge = userAge;
        this.userFriend = userFriend;
        this.tableName = tableName;
    }

    public String getTableName() {
        return tableName;
    }

    public void setTableName(String tableName) {
        this.tableName = tableName;
    }

    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public String getUserId() {
        return userId;
    }

    public void setUserId(String userId) {
        this.userId = userId;
    }

    public String getUserName() {
        return userName;
    }

    public void setUserName(String userName) {
        this.userName = userName;
    }

    public int getUserAge() {
        return userAge;
    }

    public void setUserAge(int userAge) {
        this.userAge = userAge;
    }

    public String getUserFriend() {
        return userFriend;
    }

    public void setUserFriend(String userFriend) {
        this.userFriend = userFriend;
    }

    @Override
    public String toString() {
        return this.orderId + "," + this.userId + "," + this.userAge + "," + this.userName + "," + this.userFriend;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.orderId);
        out.writeUTF(this.userId);
        out.writeUTF(this.userName);
        out.writeInt(this.userAge);
        out.writeUTF(this.userFriend);
        out.writeUTF(this.tableName);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.orderId = in.readUTF();
        this.userId = in.readUTF();
        this.userName = in.readUTF();
        this.userAge = in.readInt();
        this.userFriend = in.readUTF();
        this.tableName = in.readUTF();
    }
}
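Because JoinBean travels between the map and reduce phases as a Hadoop Writable, readFields() must read the fields in exactly the same order that write() wrote them. A minimal standalone sketch of that round trip (not part of the job; the class name JoinBeanRoundTrip is just for illustration):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class JoinBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        JoinBean original = new JoinBean();
        original.set("order001", "u001", "NULL", -1, "NULL", "order");

        // serialize the bean the same way the framework would
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buf));

        // deserialize into a fresh bean; field order mirrors write()
        JoinBean copy = new JoinBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));

        System.out.println(copy); // prints: order001,u001,-1,NULL,NULL
    }
}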
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * This example uses the most basic (and least efficient) implementation: the reducer
 * buffers all order records of a uid in memory before joining them with the user record.
 *
 * A more efficient variant combines a custom Partitioner, compareTo() on a composite key,
 * and a GroupingComparator; a sketch of that combination follows this class.
 *
 * @author ThinkPad
 */
public class ReduceSideJoin {

    public static class ReduceSideJoinMapper extends Mapper<LongWritable, Text, Text, JoinBean> {

        String fileName = null;
        JoinBean bean = new JoinBean();
        Text k = new Text();

        /**
         * A map task calls setup() once before it starts processing data; only after
         * that does it call map() repeatedly, once per input line.
         */
        @Override
        protected void setup(Mapper<LongWritable, Text, Text, JoinBean>.Context context)
                throws IOException, InterruptedException {
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            fileName = inputSplit.getPath().getName();
        }

        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, JoinBean>.Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split(",");
            if (fileName.startsWith("order")) {
                // order line: orderId,uid
                bean.set(fields[0], fields[1], "NULL", -1, "NULL", "order");
            } else {
                // user line: uid,name,age,friend
                bean.set("NULL", fields[0], fields[1], Integer.parseInt(fields[2]), fields[3], "user");
            }
            k.set(bean.getUserId());
            context.write(k, bean);
        }
    }
    public static class ReduceSideJoinReducer extends Reducer<Text, JoinBean, JoinBean, NullWritable> {

        @Override
        protected void reduce(Text key, Iterable<JoinBean> beans, Context context)
                throws IOException, InterruptedException {
            ArrayList<JoinBean> orderList = new ArrayList<>();
            JoinBean userBean = null;
            try {
                // separate the two kinds of records; the framework reuses the value object
                // while iterating, so each bean must be copied before it is kept
                for (JoinBean bean : beans) {
                    if ("order".equals(bean.getTableName())) {
                        JoinBean newBean = new JoinBean();
                        BeanUtils.copyProperties(newBean, bean);
                        orderList.add(newBean);
                    } else {
                        userBean = new JoinBean();
                        BeanUtils.copyProperties(userBean, bean);
                    }
                }
                if (userBean == null) {
                    // no user record for this uid; nothing to join against
                    return;
                }
                // fill in the user data and emit the joined records
                for (JoinBean bean : orderList) {
                    bean.setUserName(userBean.getUserName());
                    bean.setUserAge(userBean.getUserAge());
                    bean.setUserFriend(userBean.getUserFriend());
                    context.write(bean, NullWritable.get());
                }
            } catch (IllegalAccessException | InvocationTargetException e) {
                e.printStackTrace();
            }
        }
    }
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(ReduceSideJoin.class);
        job.setMapperClass(ReduceSideJoinMapper.class);
        job.setReducerClass(ReduceSideJoinReducer.class);

        job.setNumReduceTasks(2);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(JoinBean.class);
        job.setOutputKeyClass(JoinBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path("F:\\mrdata\\join\\input"));
        FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\join\\out1"));

        job.waitForCompletion(true);
    }
}
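The class comment above mentions the more efficient Partitioner + compareTo + GroupingComparator combination. What follows is only a minimal sketch of that idea, not a complete job: it assumes a hypothetical composite map output key, UidTagKey (uid plus table tag), and the mapper and reducer would have to be adapted to it. With this setup, records are partitioned and grouped by uid alone, while the sort order guarantees that the user record reaches the reducer before its orders, so the reducer no longer needs to buffer orders in an ArrayList.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical composite key: uid + table tag ("user" or "order").
class UidTagKey implements WritableComparable<UidTagKey> {

    private String uid;
    private String tag;

    public void set(String uid, String tag) {
        this.uid = uid;
        this.tag = tag;
    }

    public String getUid() {
        return uid;
    }

    @Override
    public int compareTo(UidTagKey o) {
        int c = this.uid.compareTo(o.uid);
        if (c != 0) {
            return c;
        }
        // reverse the natural String order so "user" sorts before "order" within one uid
        return o.tag.compareTo(this.tag);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(uid);
        out.writeUTF(tag);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        uid = in.readUTF();
        tag = in.readUTF();
    }
}

// Send every record of one uid to the same reducer, regardless of its tag.
class UidPartitioner extends Partitioner<UidTagKey, JoinBean> {
    @Override
    public int getPartition(UidTagKey key, JoinBean value, int numPartitions) {
        return (key.getUid().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

// Group by uid only, so the user record and all its orders share one reduce() call.
class UidGroupingComparator extends WritableComparator {

    protected UidGroupingComparator() {
        super(UidTagKey.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        return ((UidTagKey) a).getUid().compareTo(((UidTagKey) b).getUid());
    }
}

In the driver this would be wired up with job.setPartitionerClass(UidPartitioner.class) and job.setGroupingComparatorClass(UidGroupingComparator.class).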