### Counting Word Occurrences with Hadoop

```java
package com.zhiyou.bd23;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {
	// Mapper class: emits (word, 1) for every whitespace-separated token.
	public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
		private Text outputKey = new Text();
		private IntWritable outputValue = new IntWritable(1);
		private String[] infos;
		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
				throws IOException, InterruptedException {
			infos = value.toString().split("\\s+");
			for(String word:infos){
				outputKey.set(word);
				context.write(outputKey, outputValue);
			}
		}
		
	}
	// Reducer class: sums the counts for each word.
	public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
		private IntWritable outputValue = new IntWritable();
		private int sum;
		@Override
		protected void reduce(Text key, Iterable<IntWritable> values,
				Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
			sum = 0;
			for(IntWritable value:values){
				sum += value.get();
			}
			outputValue.set(sum);
			context.write(key, outputValue);
		}
	}
	// main method: configure and launch the job.
	public static void main(String[] args) throws Exception {
		Configuration configuration = new Configuration();
		Job job = Job.getInstance(configuration);
		job.setJarByClass(WordCount.class);
		job.setJobName("hadoopwordcount");
		job.setMapperClass(WordCountMapper.class);
		job.setReducerClass(WordCountReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		Path inputPath = new Path(args[0]);
		Path outputPath = new Path(args[1]);
		outputPath.getFileSystem(configuration).delete(outputPath, true);
		FileInputFormat.addInputPath(job, inputPath);
		FileOutputFormat.setOutputPath(job, outputPath);
		System.exit(job.waitForCompletion(true)?0:1);
	}
}
```

### Counting Movie Rating Occurrences with Hadoop

#### Approach

For large-scale data analysis on Hadoop, and in particular for counting how many times each movie has been rated, the MapReduce programming model is the usual choice. With a custom Mapper and Reducer, the task can be carried out efficiently across a distributed cluster.

#### Data preparation

Assume there is a data file named `ratings.txt` in which each line records one rating event, in the following format:

```
UserID::MovieID::Rating::Timestamp
```

Each record therefore carries four fields: the user ID, the movie ID, the rating, and a timestamp.

#### MapReduce design

- **Mapper stage**: read each input record, extract the movie ID, and emit it as a key-value pair for downstream processing;
- **Reducer stage**: receive the mapper output, aggregate all values that share the same key, i.e. count the number of ratings for each movie ID, and write the result to the specified output path.

#### Example code

Below is a Python implementation for Hadoop Streaming, adapted from the WordCount example above, that counts how many times each movie has been rated:

```python
#!/usr/bin/env python
import sys

# Mapper: parse each rating record and emit "movie_id<TAB>1" so the
# reducer can sum the ones to get a rating count per movie.
def mapper():
    for line in sys.stdin:
        try:
            user_id, movie_id, rating, timestamp = line.strip().split('::')
            print(f"{movie_id}\t1")
        except ValueError:
            # Skip malformed lines.
            continue

if __name__ == "__main__":
    mapper()
```

To go with the mapper script, a reducer is needed to sum the counts:

```python
#!/usr/bin/env python
import sys

current_movie = None
current_count = 0

# Hadoop Streaming delivers the mapper output sorted by key, so all lines
# for one movie arrive together; emit the running total whenever the key changes.
for line in sys.stdin:
    try:
        movie, count = line.split('\t', 1)
        if current_movie is not None and current_movie != movie:
            print(f'{current_movie}\t{current_count}')
            current_count = 0
        current_movie = movie
        current_count += int(count)
    except ValueError:
        continue

# Flush the last key.
if current_movie is not None:
    print(f'{current_movie}\t{current_count}')
```

Together these two scripts form a complete MapReduce pipeline that meets the stated requirement: using the Hadoop framework to count how many ratings each movie received.
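Before submitting the job to a cluster, it can help to sanity-check the two scripts locally by replaying the streaming pipeline (mapper, sort, reducer) on a few sample lines. The sketch below is only a local test harness, not part of the original post; it assumes the scripts are saved as `mapper.py` and `reducer.py` in the current directory and that a `python` interpreter is on the PATH.

```python
#!/usr/bin/env python
# Minimal local check of the streaming pipeline: mapper -> sort -> reducer.
# Assumes the two scripts above are saved as mapper.py and reducer.py (hypothetical names).
import subprocess

# A few sample rating lines in UserID::MovieID::Rating::Timestamp format.
sample = (
    "1::1193::5::978300760\n"
    "1::661::3::978302109\n"
    "2::1193::4::978298413\n"
)

# Run the mapper on the sample input.
mapped = subprocess.run(
    ["python", "mapper.py"], input=sample, capture_output=True, text=True
).stdout

# Simulate the shuffle/sort phase that Hadoop Streaming performs between the stages.
shuffled = "\n".join(sorted(mapped.splitlines())) + "\n"

# Feed the sorted pairs to the reducer and print the per-movie counts.
reduced = subprocess.run(
    ["python", "reducer.py"], input=shuffled, capture_output=True, text=True
).stdout
print(reduced)  # expected: movie 1193 counted twice, movie 661 once
```

On the cluster, the same two scripts would be submitted through the Hadoop Streaming jar, with `ratings.txt` as the input path and a not-yet-existing directory as the output path.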