Problem statement:
Find the 10 movies with the most ratings, and output each movie's rating count as (movie name, rating count).
ratings.dat
user ID, movie ID, rating, rating timestamp
1::1193::5::978300760
movies.dat
movie ID, movie name, movie genres
2::Jumanji (1995)::Adventure|Children's|Fantasy
Data download: https://pan.baidu.com/s/1qj7RWDYiVnDKBJFFFcYhlw (password: katx)
Approach:
From the sample data, the movie ID is the join key:
select a.*,b.* from a join b on a.id = b.id;
movies.dat (the movie metadata) is small, so this is a classic big-table/small-table join. We choose a map-side join (mapJoin) and, following that approach, ship the small file to every mapper via the distributed cache; a sketch of the idea follows below.
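In essence, a map-side join is just an in-memory hash lookup: no shuffle and no reduce phase are needed for the join itself. A minimal standalone sketch of the idea, using the sample records above (the class name is made up, and the rating record is adapted to movie 2 so the lookup hits; the full Hadoop version follows):

import java.util.HashMap;
import java.util.Map;

public class MapJoinSketch {
    public static void main(String[] args) {
        // Small table, held in memory: movie ID -> movie name
        Map<String, String> movieMap = new HashMap<>();
        movieMap.put("2", "Jumanji (1995)"); // from the movies.dat sample

        // One big-table record (ratings.dat format; movie ID set to 2 for a hit)
        String[] rating = "1::2::5::978300760".split("::");
        String movieId = rating[1];

        // The join itself is just a hash lookup
        if (movieMap.containsKey(movieId)) {
            System.out.println(movieId + "::" + movieMap.get(movieId)); // 2::Jumanji (1995)
        }
    }
}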
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * (1) Find the 10 movies with the most ratings, and give each movie's rating count (movie name, rating count).
 * ratings.dat
 * user ID, movie ID, rating, rating timestamp
 * 1::1193::5::978300760
 *
 * movies.dat
 * movie ID, movie name, movie genres
 * 2::Jumanji (1995)::Adventure|Children's|Fantasy
 * Approach:
 * 1. Join ratings.dat and movies.dat (a map-side join via the distributed cache);
 * 2. Split and count: the mapper emits the movie as the key and 1 as the value;
 * 3. The reducer sums the counts per movie;
 * 4. A second MapReduce job sorts the result and emits the top 10.
 */
public class Top10_Rating_Movie {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Job job = Job.getInstance(conf);
job.setJarByClass(Top10_Rating_Movie.class);
//job.addArchiveToClassPath(new Path("")); // add an archive to the classpath
//job.addCacheArchive(new URI(""));        // cache an archive into the working directory
//job.addFileToClassPath(new Path(""));    // add a plain file to the classpath
// Here we cache a plain file into the task working directory.
// For local testing we use a local-filesystem path;
// to run the jar on a cluster, use an HDFS path instead.
job.addCacheFile(new URI("file:/G:/files/mr/day3/q2/input/movies.dat"));
job.setMapperClass(StepOneMapper.class);
job.setReducerClass(StepOneReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
// The input folder contains several files, so point at the exact file name.
Path inputPath = new Path("G:/files/mr/day3/q2/input/ratings.dat");
Path outputPath = new Path("G:/files/mr/day3/q2/output1_1");
// Delete the output directory if it already exists.
if(fs.exists(outputPath)){
fs.delete(outputPath, true);
}
// Set the job's input and output paths.
FileInputFormat.setInputPaths(job, inputPath);
FileOutputFormat.setOutputPath(job, outputPath);
//System.exit(job.waitForCompletion(true) ? 0:1); // not used: JobControl submits both jobs below
Job job2 = Job.getInstance(conf);
job2.setJarByClass(Top10_Rating_Movie.class);
job2.setMapperClass(StepTwoMapper.class);
job2.setReducerClass(StepTwoReducer.class);
job2.setOutputKeyClass(Movie.class);
job2.setOutputValueClass(NullWritable.class);
// A global top 10 needs a total order over all records, so force a single reducer.
job2.setNumReduceTasks(1);
// The second job reads the whole output directory of the first job.
Path inputPath2 = new Path("G:/files/mr/day3/q2/output1_1");
Path outputPath2 = new Path("G:/files/mr/day3/q2/output1_2");
// Delete the output directory if it already exists.
if(fs.exists(outputPath2)){
fs.delete(outputPath2, true);
}
// Set the job's input and output paths.
FileInputFormat.setInputPaths(job2, inputPath2);
FileOutputFormat.setOutputPath(job2, outputPath2);
ControlledJob stepOne = new ControlledJob(job.getConfiguration());
ControlledJob stepTwo = new ControlledJob(job2.getConfiguration());
stepTwo.addDependingJob(stepOne);
JobControl jc = new JobControl("MV");
jc.addJob(stepOne);
jc.addJob(stepTwo);
// JobControl implements Runnable, so run it on its own thread.
Thread t = new Thread(jc);
t.start();
// Poll until both jobs have finished.
while(!jc.allFinished()){
Thread.sleep(1000);
}
// Stop the JobControl thread and exit non-zero if any job failed.
jc.stop();
System.exit(jc.getFailedJobList().isEmpty() ? 0 : 1);
}
/**
 * Step 1: map-side join of ratings.dat against the cached movies.dat,
 * emitting (movieId::movieName, 1) for every rating record.
 */
public static class StepOneMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
// ratings.dat: user ID, movie ID, rating, rating timestamp
// 1::1193::5::978300760
// movies.dat: movie ID, movie name, movie genres
// 2::Jumanji (1995)::Adventure|Children's|Fantasy
/**
 * As analyzed above, setup() splits the cached small file,
 * keeping only the movie ID and movie name in a HashMap;
 * map() then performs the join against that map.
 */
Map<String,String> movieMap = new HashMap<>();
@Override
protected void setup(Context context)throws IOException, InterruptedException {
/*
// getLocalCacheFiles() is marked deprecated but still works; note it
// pairs with addCacheFile() (getLocalCacheArchives() is for addCacheArchive()).
// Get the local paths of the cached files.
Path[] cacheFilePath = context.getLocalCacheFiles();
// Only one file was cached, so skip any extra checks here.
String path = cacheFilePath[0].toUri().toString();
// Read the file at that path with plain Java file IO.
BufferedReader bf1 = new BufferedReader(new FileReader(new File(path)));
// The block above is how you would do it on a cluster.
*/
// For a local test the cached file is already in the working directory,
// so it can be opened directly by name.
BufferedReader bf = new BufferedReader(new FileReader(new File("movies.dat")));
String line = null;
while((line=bf.readLine()) != null){
String[] movies = line.split("::");
// The movie ID is kept as a plain string.
String movieId = movies[0];
String movieName = movies[1];
movieMap.put(movieId, movieName);
}
// Close the stream.
bf.close();
}
Text K = new Text();
IntWritable V = new IntWritable(1);
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// ratings.dat: user ID, movie ID, rating, rating timestamp
// 1::1193::5::978300760
// ratings.dat is read line by line (the input-split mechanics are out of scope here).
String[] ratings = value.toString().split("::");
String movieId = ratings[1];
// The joined records are not shown here; to inspect them,
// emit the concatenated line from map() and have the reducer pass it through.
// Emit only if the rated movie ID exists in the movie metadata.
if(movieMap.containsKey(movieId)){
String outKey = movieId+"::"+movieMap.get(movieId);
K.set(outKey);
context.write(K, V);
}
}
}
public static class StepOneReducer extends Reducer<Text, IntWritable, Text, NullWritable>{
Text K = new Text();
@Override
protected void reduce(Text key, Iterable<IntWritable> values,Context context)
throws IOException, InterruptedException {
// The map output arrives here grouped as <k, {v1, v2, ..., vn}>,
// so summing the values per key is exactly a word count.
int sum = 0;
for (IntWritable intWritable : values) {
// Every value is 1, so a plain increment is enough.
sum++;
}
K.set(key.toString()+"::"+sum);
context.write(K,NullWritable.get());
}
}
public static class StepTwoMapper extends Mapper<LongWritable, Text, Movie, NullWritable>{
@Override
protected void map(LongWritable key, Text value,Context context)
throws IOException, InterruptedException {
// Read the first job's output, build a Movie object, and emit it;
// the framework then sorts the keys by Movie.compareTo().
String[] line = value.toString().split("::");
Movie mv = new Movie(line[0],line[1],Integer.parseInt(line[2]));
context.write(mv, NullWritable.get());
}
}
public static class StepTwoReducer extends Reducer<Movie, NullWritable, Movie, NullWritable>{
// A counter caps the output at the global top 10.
int count = 0;
@Override
protected void reduce(Movie key, Iterable<NullWritable> values,Context context)
throws IOException, InterruptedException {
// Keys arrive sorted by Movie.compareTo() (rating count, descending),
// so the first 10 records written are the answer.
for (NullWritable nt : values) {
if(count < 10){
context.write(key,NullWritable.get());
}
count++;
}
}
}
}
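As an aside, a common single-job alternative (not what the code above does, just a variant worth knowing) is to keep a bounded TreeMap of the top 10 inside the first reducer and emit it from cleanup(), which removes the need for the second job. A minimal sketch, assuming the same Text/IntWritable types as StepOneReducer and a single reducer so the result is global:

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.util.TreeMap;

public class TopNReducer extends Reducer<Text, IntWritable, Text, NullWritable> {
    // count -> "movieId::movieName::count", kept sorted by count.
    // Caveat: two movies with identical counts collide on the key here;
    // a real version would use a TreeMap<Integer, List<String>> or a composite key.
    private final TreeMap<Integer, String> topN = new TreeMap<>();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) {
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        topN.put(sum, key.toString() + "::" + sum);
        if (topN.size() > 10) {
            topN.remove(topN.firstKey()); // evict the current smallest count
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Emit from largest to smallest count.
        for (String line : topN.descendingMap().values()) {
            context.write(new Text(line), NullWritable.get());
        }
    }
}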
The second Mapper and Reducer use a custom bean (Movie) as the output key so that the framework's shuffle sort does the ordering for us.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * WritableComparable extends both Writable and Comparable;
 * it provides serialization/deserialization plus a custom sort order.
 */
public class Movie implements WritableComparable<Movie>{
private String movieId;   // movie ID
private String movieName; // movie name
private int ratingTime;   // total number of ratings for the movie
public String getMovieId() {
return movieId;
}
public void setMovieId(String movieId) {
this.movieId = movieId;
}
public String getMovieName() {
return movieName;
}
public void setMovieName(String movieName) {
this.movieName = movieName;
}
public int getRatingTime() {
return ratingTime;
}
public void setRatingTime(int ratingTime) {
this.ratingTime = ratingTime;
}
public Movie() {
}
public Movie(String movieId, String movieName, int ratingTime) {
this.movieId = movieId;
this.movieName = movieName;
this.ratingTime = ratingTime;
}
@Override
public String toString() {
return movieId + ", " + movieName + ", " + ratingTime;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(movieId);
out.writeUTF(movieName);
out.writeInt(ratingTime);
}
// NOTE: readFields() must read the fields in exactly the order that write() wrote them.
@Override
public void readFields(DataInput in) throws IOException {
movieId = in.readUTF();
movieName = in.readUTF();
ratingTime = in.readInt();
}
@Override
public int compareTo(Movie o) {
// Sort by total rating count, descending. Integer.compare avoids the
// overflow risk of plain subtraction, and the movieId tie-break keeps
// two movies with the same count from being merged into one reduce group.
int cmp = Integer.compare(o.ratingTime, this.ratingTime);
return cmp != 0 ? cmp : this.movieId.compareTo(o.movieId);
}
}
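A quick local check of the sort order (a hypothetical test snippet with made-up counts, not part of the job): Collections.sort uses the same compareTo, so counts sort descending and ties fall back to movieId instead of collapsing into one group:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class MovieSortDemo {
    public static void main(String[] args) {
        List<Movie> list = new ArrayList<>();
        // Hypothetical counts, just to exercise the comparator.
        list.add(new Movie("1", "Toy Story (1995)", 300));
        list.add(new Movie("2", "Jumanji (1995)", 500));
        list.add(new Movie("3", "Grumpier Old Men (1995)", 500)); // tie with movie 2
        Collections.sort(list); // uses Movie.compareTo
        for (Movie m : list) {
            System.out.println(m);
        }
        // Output: counts descending; the two 500s stay distinct, ordered by movieId.
    }
}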
I'm not great with words; if you spot any problems, please point them out so we can learn together.