Problem statement:
Find the 10 movies with the most ratings, and output each movie's rating count as (movie name, rating count).
ratings.dat
user ID, movie ID, rating, rating timestamp
1::1193::5::978300760
movies.dat
movie ID, movie name, movie genres
2::Jumanji (1995)::Adventure|Children's|Fantasy
Data download: https://pan.baidu.com/s/1qj7RWDYiVnDKBJFFFcYhlw (password: katx)
Approach:
From the sample data, the movie ID is the join key:
select a.*,b.* from a join b on a.id = b.id;
movies.dat (the movie metadata) is small, so this is a classic big-table/small-table join. We choose a map-side join (mapJoin) and, following that approach, ship the small file to every mapper via the distributed cache; a sketch of the idea follows below.
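In essence, a map-side join is just an in-memory hash lookup: no shuffle and no reduce phase are needed for the join itself. A minimal standalone sketch of the idea, using the sample records above (the class name is made up, and the rating record is adapted to movie 2 so the lookup hits; the full Hadoop version follows):

import java.util.HashMap;
import java.util.Map;

public class MapJoinSketch {
    public static void main(String[] args) {
        // Small table, held in memory: movie ID -> movie name
        Map<String, String> movieMap = new HashMap<>();
        movieMap.put("2", "Jumanji (1995)"); // from the movies.dat sample

        // One big-table record (ratings.dat format; movie ID set to 2 for a hit)
        String[] rating = "1::2::5::978300760".split("::");
        String movieId = rating[1];

        // The join itself is just a hash lookup
        if (movieMap.containsKey(movieId)) {
            System.out.println(movieId + "::" + movieMap.get(movieId)); // 2::Jumanji (1995)
        }
    }
}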
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * (1) Find the 10 movies with the most ratings, and give each movie's rating count (movie name, rating count).
 * ratings.dat
 * user ID, movie ID, rating, rating timestamp
 * 1::1193::5::978300760
 *
 * movies.dat
 * movie ID, movie name, movie genres
 * 2::Jumanji (1995)::Adventure|Children's|Fantasy
 * Approach:
 * 1. Join ratings.dat and movies.dat (a map-side join via the distributed cache);
 * 2. Split and count: the mapper emits the movie as the key and 1 as the value;
 * 3. The reducer sums the counts per movie;
 * 4. A second MapReduce job sorts the result and emits the top 10.
 */
public class Top10_Rating_Movie {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Job job = Job.getInstance(conf);
job.setJarByClass(Top10_Rating_Movie.class);
//job.addArchiveToClassPath(new Path("")); // add an archive to the classpath
//job.addCacheArchive(new URI(""));        // cache an archive into the working directory
//job.addFileToClassPath(new Path(""));    // add a plain file to the classpath
// Here we cache a plain file into the task working directory.
// For local testing we use a local-filesystem path;
// to run the jar on a cluster, use an HDFS path instead.
job.addCacheFile(new URI("file:/G:/files/mr/day3/q2/input/movies.dat"));
job.setMapperClass(StepOneMapper.class);
job.setReducerClass(StepOneReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
// The input folder contains several files, so point at the exact file name.
Path inputPath = new Path("G:/files/mr/day3/q2/input/ratings.dat");
Path outputPath = new Path("G:/files/mr/day3/q2/output1_1");
// Delete the output directory if it already exists.
if(fs.exists(outputPath)){
fs.delete(outputPath, true);
}
// Set the job's input and output paths.
FileInputFormat.setInputPaths(job, inputPath);
FileOutputFormat.setOutputPath(job, outputPath);
//System.exit(job.waitForCompletion(true) ? 0:1); // not used: JobControl submits both jobs below
Job job2 = Job.getInstance(conf);
job2.setJarByClass(Top10_Rating_Movie.class);
job2.setMapperClass(StepTwoMapper.class);
job2.setReducerClass(StepTwoReducer.class);
job2.setOutputKeyClass(Movie.class);
job2.setOutputValueClass(NullWritable.class);
// A global top 10 needs a total order over all records, so force a single reducer.
job2.setNumReduceTasks(1);
// The second job reads the whole output directory of the first job.
Path inputPath2 = new Path("G:/files/mr/day3/q2/output1_1");
Path outputPath2 = new Path("G:/files/mr/day3/q2/output1_2");
// Delete the output directory if it already exists.
if(fs.exists(outputPath2)){
fs.delete(outputPath2, true);
}
// Set the job's input and output paths.
FileInputFormat.setInputPaths(job2, inputPath2);
FileOutputFormat.setOutputPath(job2, outputPath2);
ControlledJob stepOne = new ControlledJob(job.getConfiguration());
ControlledJob stepTwo = new ControlledJob(job2.getConfiguration());
stepTwo.addDependingJob(stepOne);
JobControl jc = new JobControl("MV");
jc.addJob(stepOne);
jc.addJob(stepTwo);
// JobControl implements Runnable, so run it on its own thread.
Thread t = new Thread(jc);
t.start();
// Poll until both jobs have finished.
while(!jc.allFinished()){
Thread.sleep(1000);
}
// Stop the JobControl thread and exit non-zero if any job failed.
jc.stop();
System.exit(jc.getFailedJobList().isEmpty() ? 0 : 1);
}
/**
 * Step 1: map-side join of ratings.dat against the cached movies.dat,
 * emitting (movieId::movieName, 1) for every rating record.
 */
public static class StepOneMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
// ratings.dat: user ID, movie ID, rating, rating timestamp
// 1::1193::5::978300760
// movies.dat: movie ID, movie name, movie genres
// 2::Jumanji (1995)::Adventure|Children's|Fantasy
/**
 * As analyzed above, setup() splits the cached small file,
 * keeping only the movie ID and movie name in a HashMap;
 * map() then performs the join against that map.
 */
Map<String,String> movieMap = new HashMap<>();
@Override
protected void setup(Context context)throws IOException, InterruptedException {
/*
// getLocalCacheFiles() is marked deprecated but still works; note it
// pairs with addCacheFile() (getLocalCacheArchives() is for addCacheArchive()).
// Get the local paths of the cached files.
Path[] cacheFilePath = context.getLocalCacheFiles();
// Only one file was cached, so skip any extra checks here.
String path = cacheFilePath[0].toUri().toString();
// Read the file at that path with plain Java file IO.
BufferedReader bf1 = new BufferedReader(new FileReader(new File(path)));
// The block above is how you would do it on a cluster.
*/
// For a local test the cached file is already in the working directory,
// so it can be opened directly by name.
BufferedReader bf = new BufferedReader(new FileReader(new File("movies.dat")));
String line = null;
while((line=bf.readLine()) != null){
String[] movies = line.split("::");
// The movie ID is kept as a plain string.
String movieId = movies[0];
String movieName = movies[1];
movieMap.put(movieId, movieName);
}
// Close the stream.
bf.close();
}
Text K = new Text();
IntWritable V = new IntWritable(1);
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// ratings.dat: user ID, movie ID, rating, rating timestamp
// 1::1193::5::978300760
// ratings.dat is read line by line (the input-split mechanics are out of scope here).
String[] ratings = value.toString().split("::");
String movieId = ratings[1];
// The joined records are not shown here; to inspect them,
// emit the concatenated line from map() and have the reducer pass it through.
// Emit only if the rated movie ID exists in the movie metadata.
if(movieMap.containsKey(movieId)){
String outKey = movieId+"::"+movieMap.get(movieId);
K.set(outKey);
context.write(K, V);
}
}
}
public static class StepOneReducer extends Reducer<Text, IntWritable, Text, NullWritable>{
Text K = new Text();
@Override
protected void reduce(Text key, Iterable<IntWritable> values,Context context)
throws IOException, InterruptedException {
// The map output arrives here grouped as <k, {v1, v2, ..., vn}>,
// so summing the values per key is exactly a word count.
int sum = 0;
for (IntWritable intWritable : values) {
// Every value is 1, so a plain increment is enough.
sum++;
}
K.set(key.toString()+"::"+sum);
context.write(K,NullWritable.get());
}
}
public static class StepTwoMapper extends Mapper<LongWritable, Text, Movie, NullWritable>{
@Override
protected void map(LongWritable key, Text value,Context context)
throws IOException, InterruptedException {
// Read the first job's output, build a Movie object, and emit it;
// the framework then sorts the keys by Movie.compareTo().
String[] line = value.toString().split("::");
Movie mv = new Movie(line[0],line[1],Integer.parseInt(line[2]));
context.write(mv, NullWritable.get());
}
}
public static class StepTwoReducer extends Reducer<Movie, NullWritable, Movie, NullWritable>{
// A counter caps the output at the global top 10.
int count = 0;
@Override
protected void reduce(Movie key, Iterable<NullWritable> values,Context context)
throws IOException, InterruptedException {
// Keys arrive sorted by Movie.compareTo() (rating count, descending),
// so the first 10 records written are the answer.
for (NullWritable nt : values) {
if(count < 10){
context.write(key,NullWritable.get());
}
count++;
}
}
}
}
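As an aside, a common single-job alternative (not what the code above does, just a variant worth knowing) is to keep a bounded TreeMap of the top 10 inside the first reducer and emit it from cleanup(), which removes the need for the second job. A minimal sketch, assuming the same Text/IntWritable types as StepOneReducer and a single reducer so the result is global:

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.util.TreeMap;

public class TopNReducer extends Reducer<Text, IntWritable, Text, NullWritable> {
    // count -> "movieId::movieName::count", kept sorted by count.
    // Caveat: two movies with identical counts collide on the key here;
    // a real version would use a TreeMap<Integer, List<String>> or a composite key.
    private final TreeMap<Integer, String> topN = new TreeMap<>();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) {
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        topN.put(sum, key.toString() + "::" + sum);
        if (topN.size() > 10) {
            topN.remove(topN.firstKey()); // evict the current smallest count
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Emit from largest to smallest count.
        for (String line : topN.descendingMap().values()) {
            context.write(new Text(line), NullWritable.get());
        }
    }
}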
The second Mapper and Reducer use a custom bean (Movie) as the output key so that the framework's shuffle sort does the ordering for us.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * WritableComparable extends both Writable and Comparable;
 * it provides serialization/deserialization plus a custom sort order.
 */
public class Movie implements WritableComparable<Movie>{
private String movieId;   // movie ID
private String movieName; // movie name
private int ratingTime;   // total number of ratings for the movie
public String getMovieId() {
return movieId;
}
public void setMovieId(String movieId) {
this.movieId = movieId;
}
public String getMovieName() {
return movieName;
}
public void setMovieName(String movieName) {
this.movieName = movieName;
}
public int getRatingTime() {
return ratingTime;
}
public void setRatingTime(int ratingTime) {
this.ratingTime = ratingTime;
}
public Movie() {
}
public Movie(String movieId, String movieName, int ratingTime) {
this.movieId = movieId;
this.movieName = movieName;
this.ratingTime = ratingTime;
}
@Override
public String toString() {
return movieId + ", " + movieName + ", " + ratingTime;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(movieId);
out.writeUTF(movieName);
out.writeInt(ratingTime);
}
// NOTE: readFields() must read the fields in exactly the order that write() wrote them.
@Override
public void readFields(DataInput in) throws IOException {
movieId = in.readUTF();
movieName = in.readUTF();
ratingTime = in.readInt();
}
@Override
public int compareTo(Movie o) {
// Sort by total rating count, descending. Integer.compare avoids the
// overflow risk of plain subtraction, and the movieId tie-break keeps
// two movies with the same count from being merged into one reduce group.
int cmp = Integer.compare(o.ratingTime, this.ratingTime);
return cmp != 0 ? cmp : this.movieId.compareTo(o.movieId);
}
}
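A quick local check of the sort order (a hypothetical test snippet with made-up counts, not part of the job): Collections.sort uses the same compareTo, so counts sort descending and ties fall back to movieId instead of collapsing into one group:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class MovieSortDemo {
    public static void main(String[] args) {
        List<Movie> list = new ArrayList<>();
        // Hypothetical counts, just to exercise the comparator.
        list.add(new Movie("1", "Toy Story (1995)", 300));
        list.add(new Movie("2", "Jumanji (1995)", 500));
        list.add(new Movie("3", "Grumpier Old Men (1995)", 500)); // tie with movie 2
        Collections.sort(list); // uses Movie.compareTo
        for (Movie m : list) {
            System.out.println(m);
        }
        // Output: counts descending; the two 500s stay distinct, ordered by movieId.
    }
}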
I'm not great with words; if you spot any problems, please point them out so we can learn together.