MapReduce programming model review
Join two kinds of data
Aggregate the two kinds of data by user uid.
Approach:
Map side:
No matter which file a worker happens to be reading, the map method can tell the two kinds apart through the context (the file name of the input split).
For order records, split the fields in map(), wrap them in a JoinBean, and tag it with its table name ("order").
For user records, split the fields in map(), wrap them in a JoinBean, and tag it with its table name ("user").
Then emit the uid as the key and the JoinBean as the value.
Reduce side:
Iterate over the group of JoinBeans that share the same uid and check the tag:
if the tag field is "order", add the bean to an ArrayList;
if the tag field is "user", copy it into a separate JoinBean object (the userBean).
Then traverse the ArrayList, fill each order JoinBean with the user data from the userBean, and output the joined JoinBean. A small worked example follows this outline.
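For illustration only, here is a hypothetical pair of input files and the joined output they would produce, assuming the field layout the mapper below expects (order lines: orderId,uid; user lines: uid,name,age,friend); the record values are made up:

    order.txt                 user.txt
    order001,u001             u001,zhangsan,28,football
    order002,u001             u002,lisi,30,movies
    order003,u002

    joined output (JoinBean.toString() order: orderId,uid,age,name,friend)
    order001,u001,28,zhangsan,football
    order002,u001,28,zhangsan,football
    order003,u002,30,lisi,movies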
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class JoinBean implements Writable {

    private String orderId;
    private String userId;
    private String userName;
    private int userAge;
    private String userFriend;
    private String tableName;

    public void set(String orderId, String userId, String userName, int userAge, String userFriend, String tableName) {
        this.orderId = orderId;
        this.userId = userId;
        this.userName = userName;
        this.userAge = userAge;
        this.userFriend = userFriend;
        this.tableName = tableName;
    }

    public String getTableName() {
        return tableName;
    }

    public void setTableName(String tableName) {
        this.tableName = tableName;
    }

    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public String getUserId() {
        return userId;
    }

    public void setUserId(String userId) {
        this.userId = userId;
    }

    public String getUserName() {
        return userName;
    }

    public void setUserName(String userName) {
        this.userName = userName;
    }

    public int getUserAge() {
        return userAge;
    }

    public void setUserAge(int userAge) {
        this.userAge = userAge;
    }

    public String getUserFriend() {
        return userFriend;
    }

    public void setUserFriend(String userFriend) {
        this.userFriend = userFriend;
    }

    @Override
    public String toString() {
        return this.orderId + "," + this.userId + "," + this.userAge + "," + this.userName + "," + this.userFriend;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.orderId);
        out.writeUTF(this.userId);
        out.writeUTF(this.userName);
        out.writeInt(this.userAge);
        out.writeUTF(this.userFriend);
        out.writeUTF(this.tableName);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.orderId = in.readUTF();
        this.userId = in.readUTF();
        this.userName = in.readUTF();
        this.userAge = in.readInt();
        this.userFriend = in.readUTF();
        this.tableName = in.readUTF();
    }
}
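Because JoinBean travels between the map and reduce phases as a Hadoop Writable, readFields() must read the fields in exactly the same order that write() wrote them. A minimal standalone sketch of that round trip (not part of the job; the class name JoinBeanRoundTrip is just for illustration):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class JoinBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        JoinBean original = new JoinBean();
        original.set("order001", "u001", "NULL", -1, "NULL", "order");

        // serialize the bean the same way the framework would
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buf));

        // deserialize into a fresh bean; field order mirrors write()
        JoinBean copy = new JoinBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));

        System.out.println(copy); // prints: order001,u001,-1,NULL,NULL
    }
}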
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * This example uses the most basic (and least efficient) implementation: the reducer
 * buffers all order records of a uid in memory before joining them with the user record.
 *
 * A more efficient variant combines a custom Partitioner, compareTo() on a composite key,
 * and a GroupingComparator; a sketch of that combination follows this class.
 *
 * @author ThinkPad
 */
public class ReduceSideJoin {

    public static class ReduceSideJoinMapper extends Mapper<LongWritable, Text, Text, JoinBean> {

        String fileName = null;
        JoinBean bean = new JoinBean();
        Text k = new Text();

        /**
         * A map task calls setup() once before it starts processing data; only after
         * that does it call map() repeatedly, once per input line.
         */
        @Override
        protected void setup(Mapper<LongWritable, Text, Text, JoinBean>.Context context)
                throws IOException, InterruptedException {
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            fileName = inputSplit.getPath().getName();
        }

        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, JoinBean>.Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split(",");
            if (fileName.startsWith("order")) {
                // order line: orderId,uid
                bean.set(fields[0], fields[1], "NULL", -1, "NULL", "order");
            } else {
                // user line: uid,name,age,friend
                bean.set("NULL", fields[0], fields[1], Integer.parseInt(fields[2]), fields[3], "user");
            }
            k.set(bean.getUserId());
            context.write(k, bean);
        }
    }
    public static class ReduceSideJoinReducer extends Reducer<Text, JoinBean, JoinBean, NullWritable> {

        @Override
        protected void reduce(Text key, Iterable<JoinBean> beans, Context context)
                throws IOException, InterruptedException {
            ArrayList<JoinBean> orderList = new ArrayList<>();
            JoinBean userBean = null;
            try {
                // separate the two kinds of records; the framework reuses the value object
                // while iterating, so each bean must be copied before it is kept
                for (JoinBean bean : beans) {
                    if ("order".equals(bean.getTableName())) {
                        JoinBean newBean = new JoinBean();
                        BeanUtils.copyProperties(newBean, bean);
                        orderList.add(newBean);
                    } else {
                        userBean = new JoinBean();
                        BeanUtils.copyProperties(userBean, bean);
                    }
                }
                if (userBean == null) {
                    // no user record for this uid; nothing to join against
                    return;
                }
                // fill in the user data and emit the joined records
                for (JoinBean bean : orderList) {
                    bean.setUserName(userBean.getUserName());
                    bean.setUserAge(userBean.getUserAge());
                    bean.setUserFriend(userBean.getUserFriend());
                    context.write(bean, NullWritable.get());
                }
            } catch (IllegalAccessException | InvocationTargetException e) {
                e.printStackTrace();
            }
        }
    }
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(ReduceSideJoin.class);
        job.setMapperClass(ReduceSideJoinMapper.class);
        job.setReducerClass(ReduceSideJoinReducer.class);

        job.setNumReduceTasks(2);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(JoinBean.class);
        job.setOutputKeyClass(JoinBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path("F:\\mrdata\\join\\input"));
        FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\join\\out1"));

        job.waitForCompletion(true);
    }
}
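The class comment above mentions the more efficient Partitioner + compareTo + GroupingComparator combination. What follows is only a minimal sketch of that idea, not a complete job: it assumes a hypothetical composite map output key, UidTagKey (uid plus table tag), and the mapper and reducer would have to be adapted to it. With this setup, records are partitioned and grouped by uid alone, while the sort order guarantees that the user record reaches the reducer before its orders, so the reducer no longer needs to buffer orders in an ArrayList.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical composite key: uid + table tag ("user" or "order").
class UidTagKey implements WritableComparable<UidTagKey> {

    private String uid;
    private String tag;

    public void set(String uid, String tag) {
        this.uid = uid;
        this.tag = tag;
    }

    public String getUid() {
        return uid;
    }

    @Override
    public int compareTo(UidTagKey o) {
        int c = this.uid.compareTo(o.uid);
        if (c != 0) {
            return c;
        }
        // reverse the natural String order so "user" sorts before "order" within one uid
        return o.tag.compareTo(this.tag);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(uid);
        out.writeUTF(tag);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        uid = in.readUTF();
        tag = in.readUTF();
    }
}

// Send every record of one uid to the same reducer, regardless of its tag.
class UidPartitioner extends Partitioner<UidTagKey, JoinBean> {
    @Override
    public int getPartition(UidTagKey key, JoinBean value, int numPartitions) {
        return (key.getUid().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

// Group by uid only, so the user record and all its orders share one reduce() call.
class UidGroupingComparator extends WritableComparator {

    protected UidGroupingComparator() {
        super(UidTagKey.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        return ((UidTagKey) a).getUid().compareTo(((UidTagKey) b).getUid());
    }
}

In the driver this would be wired up with job.setPartitionerClass(UidPartitioner.class) and job.setGroupingComparatorClass(UidGroupingComparator.class).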