/*
 * 我的hadoop初学程序 —— SingleTableJoin —— 单表连接
 *
 * 本文链接：https://blog.youkuaiyun.com/xin15200793067/article/details/12914039
 */
package bin;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;


public class SingleTableJoin {
	//"单表关联"这个实例要求从给出的数据中寻找所关心的数据，它是对原始数据所包含信息的挖掘。
	//给出child-parent（孩子——父母）表，要求输出grandchild-grandparent（孙子——爷奶）表。
	//如何实现表的自连接；其次就是连接列的设置；最后是结果的整理。
	//编写map/reduce程序时，首先应该考虑它的工作机制，map的结果获得<key,value>,shuffle是把各个节点key相同的shuffle到一起
	//将读入的数据分割成child跟parent，将parent设置成key,	child设置成value进行输出
	//代码的关键点和入手点是parent-----父母层，它连接着grandchild层和grandparent层
	public static int time=0;
	
	public static class SingleTableMap extends Mapper<Object, Text, Text, Text>{
		public void map(Object key,Text value,Context context) {
			String childName=new String();//孩子名称
			String parentName=new String();//父母名称
			String relationType=new String();//区分左表还是右表
			
			//对输入文本进行处理
			StringTokenizer tokenizer=new StringTokenizer(value.toString());
			String[] values=new String[2];
			int i=0;
			while (tokenizer.hasMoreElements()) {
				values[i] = tokenizer.nextToken();//每一行只有两个值
				i++;
			}
			if (values[0].compareTo("child")!=0) {//不要第一行
				childName=values[0];
				parentName=values[1];
				
				relationType="1";//左表：以parent作为key map
				try {
					context.write(new Text(values[1]), new Text(relationType+"+"+childName+"+"+parentName));
				} catch (IOException e) {
					// TODO Auto-generated catch block
					e.printStackTrace();
				} catch (InterruptedException e) {
					// TODO Auto-generated catch block
					e.printStackTrace();
				}
				
				relationType="2";//右表：以child作为key map
				try {
					context.write(new Text(values[0]), new Text(relationType+"+"+childName+"+"+parentName));
				} catch (IOException e) {
					// TODO Auto-generated catch block
					e.printStackTrace();
				} catch (InterruptedException e) {
					// TODO Auto-generated catch block
					e.printStackTrace();
				}
				
			}
		}
	}
	
	public static class SingleJoinReduce extends Reducer<Text, Text, Text, Text>{
		public void reduce(Text key,Iterable<Text> values,Context context) {
			if (0==time) {
				try {
					context.write(new Text("grandchild"), new Text("grandparent"));//第一次执行时加上头部grandchild grandparent。
				} catch (IOException e) {
					// TODO Auto-generated catch block
					e.printStackTrace();
				} catch (InterruptedException e) {
					// TODO Auto-generated catch block
					e.printStackTrace();
				}
				time++;
			}
			int grandchildNum=0;
			String[] grandchild=new String[10];//用来存放所有的grandchild
			int grandparentNum=0;
			String[] grandparent=new String[10];//用来存放所有的grandparent
			
			Iterator<Text> iterator=values.iterator();
			while (iterator.hasNext()) {//对map的每一个key的values做分析处理
				String record=iterator.next().toString();
				int len=record.length();
				
				if (len==0) {
					continue;//这一条不太懂，难道能够有一个value，使它的长度为0,然后iterator.hasNext()竟然还不是0
				}
				char relationtype=record.charAt(0);//从string字符串中获取是左表还是右表的标示字段，这里只用一位的1/2来标示是为了这里读取时比较方便
				String childname=new String();
				String parentname=new String();
				int i=2;//初始值 ：前面已经摘掉了一个左表右表标示字段，字段之后还有一个“+”，所以while循环设置是从下标为2开始的，下标2处为第一个有效ｃｈｉｌｄ字符
				while (record.charAt(i)!='+') {
					childname+=record.charAt(i);
					i++;
				}
				i=i+1;//仍然去除掉字符‘+’
				while (i<len) {
					parentname+=record.charAt(i);
					i++;
				}
				if (relationtype=='1') {//只要reduce的输入有<key,values>它的values，而且是按照字典序排好了的
					grandchild[grandchildNum]=childname;
					grandchildNum++;
				}
				if (relationtype=='2') {
					grandparent[grandparentNum]=parentname;
					grandparentNum++;
				}
			}
			//因为每一次循环针对于shuffle之后的同一个key，也就是同一个父母层，对于同一个parent来说，它的child跟它的parent只要是做一个全连接就可以实现列出所有的grandchild和grandparent的关系。
			
			if (grandchildNum!=0&&grandparentNum!=0) {//必须是同时有孩子和父母的parent级节点才是我们需要的
				for (int m=0;m<grandchildNum;m++)
					for(int n=0;n<grandparentNum;n++){
						try {
							context.write(new Text(grandchild[m]), new Text(grandparent[n]));
						} catch (IOException e) {
							// TODO Auto-generated catch block
							e.printStackTrace();
						} catch (InterruptedException e) {
							// TODO Auto-generated catch block
							e.printStackTrace();
						}
					}
//				至此，对单表连接的原理做一个说明：有两排字符串：A,B要找出A1->B1同时B1=A5->B5,这种A1和B5的关系
//				把A作为key,把B作为key,这就有了两倍的(A,B)规模，此时做shuffle可以达到全连接效果。即，此时传递给reducer的<key,values>,其中的key在现实意义中应该是父母，它对应的values应该是它的child和parent，即grandchild和grandparent.
				
			}
		}
	}
	
	
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration configuration =new Configuration();
		String[] otherArgs = new GenericOptionsParser(configuration,args).getRemainingArgs();
		//GenericOptionsParser可以让Map/Reduce程序具有Hadoop常用的属性，对作业进行了部署。
		//若每次传入的数据量不是两个，则显示报错信息。
		 if (otherArgs.length !=2) {
			System.err.println("Usage: SingleTable <in> <out>");
			System.exit(2);
		}
		 Job job =new Job(configuration, "tracert SingleTable");//新建一个job,给它起个名字，以便跟踪察看任务的执行情况--都有
		 job.setJarByClass(SingleTableJoin.class);//主类---都有
		 //当在hadoop集群上运行作业时，需要把代码打包成一个jar文件，hadoop会在集群分发这个文件，通过job的setJarByClass方法把代码所在的类设置好，
		 //hadoop会根据这个类找到所在的jar文件，这步之后，才可能会有hadoop去分发jar包.
		 
		 //设置需要使用的map,combiner,reducer类
		 job.setMapperClass(SingleTableMap.class);
		 job.setReducerClass(SingleJoinReduce.class);
		 
		 //设置map reduce 的输出健和输出值类型
		 job.setOutputKeyClass(Text.class);
		 job.setOutputValueClass(Text.class);
		 
		 //设置文件输入类型和文件输出类型
		 FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		 FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
		 
		 System.exit(job.waitForCompletion(true)? 0 : 1);
	}

}