WordCount

最新推荐文章于 2025-08-22 22:42:14 发布

原创最新推荐文章于 2025-08-22 22:42:14 发布 · 134 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#hadoop

hadoop 专栏收录该内容

3 篇文章

订阅专栏

本文详细解析了Hadoop环境下WordCount程序的具体实现过程，包括MapReduce的工作原理、输入输出格式配置及具体代码示例。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

package com.my.wordcount;

import java.io.IOException;
import java.net.URI;
import java.net.URL;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FsUrlStreamHandlerFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Word-count job written against the "new" Hadoop MapReduce API
 * ({@code org.apache.hadoop.mapreduce}).
 *
 * <p>Example input file {@code hellow}:
 * <pre>
 * hellow hadoop
 * hellow world
 * </pre>
 *
 * <p>Keep {@code @Override} on both {@code map} and {@code reduce}. If the
 * signature of either method does not match the framework's, the method becomes
 * an overload instead of an override and the inherited default implementation
 * runs in its place; {@code @Override} turns that mistake into a compile error.
 *
 * <p>How the job processes the example:
 * <ol>
 *   <li><b>Map phase</b>
 *     <ol>
 *       <li>{@link TextInputFormat} reads the input and turns every line into a
 *           pair {@code <k1, v1>}, where the key is the byte offset of the line
 *           and the value is the line's text: {@code <0, "hellow hadoop">} and
 *           {@code <14, "hellow world">}. {@code map} is called once per pair.</li>
 *       <li>{@code map} splits the line into words and emits {@code <word, 1>}
 *           for each: {@code <hellow,1>, <hadoop,1>, <hellow,1>, <world,1>}.</li>
 *       <li>The pairs emitted in step 1.2 are partitioned; this job does not
 *           change the number of reducers, so with the framework default there
 *           is a single partition.</li>
 *       <li>Inside each partition the pairs are sorted by key and then grouped,
 *           i.e. all values of the same key are collected together.
 *           Sorted: {@code <hadoop,1>, <hellow,1>, <hellow,1>, <world,1>};
 *           grouped: {@code <hadoop,{1}>, <hellow,{1,1}>, <world,{1}>}.</li>
 *       <li>Optionally a combiner can pre-aggregate the grouped data on the map
 *           side. This job does not configure one.</li>
 *     </ol>
 *   </li>
 *   <li><b>Reduce phase</b>
 *     <ol>
 *       <li>The output of all map tasks is copied over the network to the reduce
 *           task responsible for each partition.</li>
 *       <li>The copied data is merged and sorted, and {@code reduce} is called
 *           once per key with all of its values. Summing the values yields the
 *           number of occurrences:
 *           {@code <hadoop,1>, <hellow,2>, <world,1>}.</li>
 *       <li>The reduce output is written to the output directory (on HDFS for
 *           the default paths).</li>
 *     </ol>
 *   </li>
 * </ol>
 */
public class WordCount extends Configured implements Tool {

    /** Input path used when the caller does not supply an input/output pair. */
    public final static String HDFS_INPUT = "hdfs://pc1:9000/hellow";

    /** Output directory used when the caller does not supply an input/output pair. */
    public final static String HDFS_OUTPUT = "hdfs://pc1:9000/hellowOut";

    /**
     * Command-line entry point. Runs the job through {@link ToolRunner} and
     * terminates the JVM with the job's status so that shells and schedulers
     * can detect a failed run.
     *
     * @param args optional arguments forwarded to {@link ToolRunner}; see
     *             {@link #run(String[])} for how the positional ones are used
     * @throws Exception if the job cannot be set up or submitted
     */
    public static void main(String[] args) throws Exception {
        int status = ToolRunner.run(new WordCount(), args);
        System.exit(status);
    }

    /**
     * Configures the word-count job, removes a pre-existing output directory and
     * blocks until the job has finished.
     *
     * @param args exactly two elements ({@code <input> <output>}) override the
     *             default paths; with any other number of elements
     *             {@link #HDFS_INPUT} and {@link #HDFS_OUTPUT} are used
     * @return {@code 0} if the job completed successfully, {@code 1} otherwise
     * @throws Exception if the job cannot be created, the output directory cannot
     *                   be inspected or deleted, or the job submission fails
     */
    @Override
    public int run(String[] args) throws Exception {
        final String input;
        final String output;
        if (args != null && args.length == 2) {
            input = args[0];
            output = args[1];
        } else {
            if (args != null && args.length != 0) {
                System.err.println("Expected <input> <output>; ignoring " + args.length
                        + " argument(s) and using the default paths.");
            }
            input = HDFS_INPUT;
            output = HDFS_OUTPUT;
        }

        // getConf() is only populated when the tool is started through ToolRunner
        // or after setConf(); fall back to a default configuration otherwise.
        Configuration conf = getConf();
        if (conf == null) {
            conf = new Configuration();
        }

        // No URL stream handler factory is registered here: the job only uses the
        // FileSystem API, and the JVM allows such a factory to be set only once.

        // A failure to create the job is propagated to the caller; continuing
        // without a job object is not possible.
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCount.class);
        job.setJobName(WordCount.class.getSimpleName());

        // The job refuses to start when the output directory already exists, so
        // remove it first. The file system is resolved from the output path itself
        // so that the check runs against the file system the job will write to.
        // The FileSystem instance is not closed because it may be a cached
        // instance shared with the job.
        Path outputPath = new Path(output);
        FileSystem fileSystem = outputPath.getFileSystem(conf);
        if (fileSystem.exists(outputPath)) {
            System.out.println("如果" + output + "存在则删除！");
            fileSystem.delete(outputPath, true);
        }

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean succeeded = job.waitForCompletion(true);
        return succeeded ? 0 : 1;
    }

    /**
     * Splits every input line into whitespace-separated words and emits
     * {@code <word, 1>} for each of them.
     */
    static class Map extends Mapper<LongWritable, Text, Text, LongWritable> {

        /** Constant count emitted for every word; never modified. */
        private static final LongWritable ONE = new LongWritable(1L);

        /** Reused output key, refilled for every word to avoid per-word allocation. */
        private final Text word = new Text();

        /**
         * Emits one {@code <word, 1>} pair per word of the line.
         *
         * @param k1      byte offset of the line within the input (unused)
         * @param v1      text of the line
         * @param context sink for the emitted pairs
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void map(LongWritable k1, Text v1, Context context)
                throws IOException, InterruptedException {
            // StringTokenizer skips runs of whitespace, so repeated or leading
            // blanks never produce an empty "word".
            StringTokenizer tokens = new StringTokenizer(v1.toString());
            while (tokens.hasMoreTokens()) {
                word.set(tokens.nextToken());
                context.write(word, ONE);
            }
        }
    }

    /**
     * Sums the counts of each word and emits {@code <word, total>}.
     */
    static class Reduce extends Reducer<Text, LongWritable, Text, LongWritable> {

        /** Reused output value, refilled for every key. */
        private final LongWritable total = new LongWritable();

        /**
         * Adds up all counts received for one word.
         *
         * @param k2      the word
         * @param v2      all counts emitted for the word
         * @param context sink for the emitted pair
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void reduce(Text k2, Iterable<LongWritable> v2, Context context)
                throws IOException, InterruptedException {
            long count = 0L;
            for (LongWritable item : v2) {
                count += item.get();
            }
            total.set(count);
            context.write(k2, total);
        }
    }

}

以上代码复制可用