/** @author LU ZHENGYU
 *  Updated on Sept. 9, 2019
 */
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.io.IntWritable.Comparator;
import org.apache.hadoop.io.WritableComparable;
import java.io.IOException;
import java.util.ArrayList;
public class MoviesCount {
    /********* First MapReduce job: for each identical (MovieID, Rating) pair, collect the UserIDs
     * Mapper: emits MovieID and Rating as the key and the UserID as the value
     * Reducer: enumerates every pair of UserIDs from the Iterable and emits each pair with value 1
     */
public static class First_Mapper extends Mapper<Object, Text, Text, IntWritable> {
@Override
public void map(Object key, Text value, Context context) throws NumberFormatException, IOException, InterruptedException {
            // Skip the CSV header line (the record whose byte-offset key is 0)
if (key.toString().equals("0")) {
return;
}
            // Each line arrives in value; split it on commas: userId,movieId,rating, e.g. 1,1,4
            String[] fields = value.toString().split(",");
            // Build the key from movieId and rating
            String MRkey = fields[1] + ", " + fields[2];
            context.write(new Text(MRkey), new IntWritable(Integer.parseInt(fields[0])));
}
}
public static class First_Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
@Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // Copy the UIDs from the Iterable into a list so that all pairs can be enumerated
ArrayList<Integer> arrayList = new ArrayList<Integer>();
for (IntWritable value : values) {
arrayList.add(value.get());
}
            for (int i = 0; i < arrayList.size(); i++) {
                for (int j = i + 1; j < arrayList.size(); j++) {
                    int a = arrayList.get(i);
                    int b = arrayList.get(j);
                    // Always put the larger UID first so that a given pair always maps to the same key
                    String UIDkey = Math.max(a, b) + "," + Math.min(a, b);
                    context.write(new Text(UIDkey), one);
                }
            }
}
}
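    /* Illustration only (hypothetical rows, not from the source data): for ratings.csv lines
     * "1,31,2.5" and "7,31,2.5", First_Mapper emits ("31, 2.5", 1) and ("31, 2.5", 7);
     * First_Reducer then sees key "31, 2.5" with values [1, 7] and writes the pair
     * "7,1" (larger UID first) with a count of 1.
     */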
    /********* Second MapReduce job: accumulate the counts for each identical (UID_A, UID_B) pair
     * Mapper: passes UID_A,UID_B <1> straight through to the reducer
     * Reducer: sums the counts for each UID pair
     */
public static class Second_Mapper extends Mapper<Object, Text, Text, IntWritable> {
@Override
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // Note: the previous job's output is tab-separated, not space-separated
String[] array = value.toString().split("\t");
context.write(new Text(array[0]), new IntWritable(Integer.parseInt(array[1])));
}
}
public static class Second_Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int Count = 0;
for (IntWritable value : values) {
Count += value.get();
}
context.write(key, new IntWritable(Count));
}
}
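    /* Illustration only (hypothetical UIDs): if job1 emitted the line "7,1<TAB>1" three times
     * (users 7 and 1 gave the same rating to three different movies), Second_Mapper re-emits
     * ("7,1", 1) for each line and Second_Reducer sums them, writing "7,1<TAB>3".
     */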
    /********* Third MapReduce job: sort the results
     * Mapper: swaps key and value, emitting <sum> as the key and UID_A,UID_B as the value
     * Reducer: relies on the sorted keys, swaps them back, and writes the result to the output file
     */
    public static class Third_Mapper extends Mapper<Object, Text, IntWritable, Text> {
        @Override
        public void map(Object key, Text value, Context context) throws NumberFormatException, IOException, InterruptedException {
            String[] split = value.toString().split("\t");
            context.write(new IntWritable(Integer.parseInt(split[1])), new Text(split[0]));
}
}
    public static class Third_Reducer extends Reducer<IntWritable, Text, Text, IntWritable> {
        @Override
        public void reduce(IntWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            for (Text text : values) {
                context.write(text, key);
}
}
}
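    /* Illustration only: for the job2 line "7,1<TAB>3", Third_Mapper emits (3, "7,1"); the shuffle
     * sorts these IntWritable keys (in descending order once DecreasingComparator below is set as
     * the sort comparator), and Third_Reducer swaps key and value back, writing "7,1<TAB>3" so the
     * final file is ordered by decreasing co-rating count.
     */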
    /**
     * Comparator for sorting the output in descending order;
     * refer to https://blog.youkuaiyun.com/Gamer_gyt/article/details/48025805
     */
private static class DecreasingComparator extends Comparator {
@SuppressWarnings("rawtypes")
public int compare(WritableComparable a, WritableComparable b) {
return -super.compare(a, b);
}
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
return -super.compare(b1, s1, l1, b2, s2, l2);
}
}
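    /* Note: IntWritable.Comparator sorts ascending by default, so negating its result gives a
     * descending sort. Both overloads are negated because the byte-level compare(byte[],...) is
     * the one the shuffle actually uses when sorting serialized keys.
     */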
public static void main(String[] args) throws IOException {
JobConf conf = new JobConf(MoviesCount.class);
//Refer to http://www.tuicool.com/articles/N7buuy
        // Configure the first job
Job job1 = new Job(conf, "join1");
job1.setJarByClass(MoviesCount.class);
        // Set the corresponding Mapper and Reducer
job1.setMapperClass(First_Mapper.class);
job1.setReducerClass(First_Reducer.class);
        // Key class of the map output
        job1.setMapOutputKeyClass(Text.class);
        // Value class of the map output
        job1.setMapOutputValueClass(IntWritable.class);
        // Key class of the reduce output
        job1.setOutputKeyClass(Text.class);
        // Value class of the reduce output
        job1.setOutputValueClass(IntWritable.class);
        // Wrap job1 in a ControlledJob
ControlledJob ctrl_job1 = new ControlledJob(conf);
ctrl_job1.setJob(job1);
        // Input and output paths for job1
FileInputFormat.addInputPath(job1, new Path(args[0]));
FileOutputFormat.setOutputPath(job1, new Path(args[1]));
        // Configure the second job
Job job2 = new Job(conf, "Join2");
job2.setJarByClass(MoviesCount.class);
job2.setMapperClass(Second_Mapper.class);
job2.setReducerClass(Second_Reducer.class);
        // Key class of the map output
        job2.setMapOutputKeyClass(Text.class);
        // Value class of the map output
        job2.setMapOutputValueClass(IntWritable.class);
        // Key class of the reduce output
        job2.setOutputKeyClass(Text.class);
        // Value class of the reduce output
        job2.setOutputValueClass(IntWritable.class);
        // Wrap job2 in a ControlledJob
ControlledJob ctrl_job2 = new ControlledJob(conf);
ctrl_job2.setJob(job2);
        // job2 starts only after job1 has completed
ctrl_job2.addDependingJob(ctrl_job1);
        // The input path is the previous job's output path, so args[1] must match the output path set above
FileInputFormat.addInputPath(job2, new Path(args[1]));
        // The output path takes a fresh argument: the output directory must not already exist,
        // so new Path(args[2]) is used because args[2] has not been used above; any unused path works
FileOutputFormat.setOutputPath(job2, new Path(args[2]));
        // Configure the third job
Job job3 = new Job(conf, "Join3");
job3.setJarByClass(MoviesCount.class);
job3.setMapperClass(Third_Mapper.class);
job3.setReducerClass(Third_Reducer.class);
        // Key class of the map output
        job3.setMapOutputKeyClass(IntWritable.class);
        // Value class of the map output
        job3.setMapOutputValueClass(Text.class);
        // Key class of the reduce output
        job3.setOutputKeyClass(Text.class);
        // Value class of the reduce output
        job3.setOutputValueClass(IntWritable.class);
job3.setSortComparatorClass(DecreasingComparator.class);
        // Wrap job3 in a ControlledJob
ControlledJob ctrl_job3 = new ControlledJob(conf);
ctrl_job3.setJob(job3);
        // job3 starts only after job2 has completed
ctrl_job3.addDependingJob(ctrl_job2);
FileInputFormat.addInputPath(job3, new Path(args[2]));
FileOutputFormat.setOutputPath(job3, new Path(args[3]));
        // The master control container that manages the three sub-jobs above
        // Note: import the new-API class org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl
        // The constructor argument is a group name; all jobs belong to one group
JobControl jobCtrl = new JobControl("ctrl");
        // Add the controlled jobs to the master JobControl
jobCtrl.addJob(ctrl_job1);
jobCtrl.addJob(ctrl_job2);
jobCtrl.addJob(ctrl_job3);
        // Start the JobControl in its own thread; this step must not be omitted
new Thread(jobCtrl).start();
        while (true) {
            if (jobCtrl.allFinished()) {
                // When all jobs have finished, print the list of successful jobs
                System.out.println(jobCtrl.getSuccessfulJobList());
                jobCtrl.stop();
                break;
            }
            // Poll with a short sleep to avoid busy-waiting while the jobs run
            try {
                Thread.sleep(500);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }
}
}
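/* Usage sketch (jar name and paths are placeholders, not from the source): the driver expects four
 * arguments, e.g.
 *   hadoop jar MoviesCount.jar MoviesCount /data/ratings.csv /out/pairs /out/counts /out/sorted
 * args[0] is the ratings CSV; args[1] to args[3] are output directories that must not already exist.
 */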
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>8</source>
<target>8</target>
</configuration>
</plugin>
</plugins>
</build>
<properties>
<hadoopVersion>2.9.2</hadoopVersion>
</properties>
<dependencies>
<!-- Hadoop start -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoopVersion}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoopVersion}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoopVersion}</version>
</dependency>
<!-- Hadoop -->
<dependency>
<groupId>jdk.tools</groupId>
<artifactId>jdk.tools</artifactId>
<version>1.7</version>
<scope>system</scope>
<systemPath>D:/java7/lib/tools.jar</systemPath>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
</dependencies>
# Configure logging for testing: optionally with log file
log4j.rootLogger=WARN, stdout
# log4j.rootLogger=WARN, stdout, logfile
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/spring.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n