1. Virtual machines:
VMware Workstation (install directory: E:\Program Files\vm)
hadoopMaster 192.168.119.129 (if this IP changes, update /etc/hosts accordingly; see the example below)
hadoopSlave 192.168.119.130
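A minimal /etc/hosts sketch for both machines, assuming the addresses listed above (adjust if VMware assigns different IPs):
    192.168.119.129 hadoopMaster
    192.168.119.130 hadoopSlave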
2. WinSCP
Configure WinSCP so it can connect to the master. If the master IP cannot be pinged, disable the host's local network connection and open the virtual machine's network settings.
Fix for VM network connection failures:
In Administrative Tools -> Services, start the VMware DHCP Service and VMware NAT Service services.
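The same two services can also be started from an elevated command prompt (a sketch, assuming the standard VMware service display names):
    net start "VMware DHCP Service"
    net start "VMware NAT Service"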
3. Run workflow
-> Start the hadoopMaster and hadoopSlave virtual machines.
-> Start Hadoop with ./start-all.sh (make sure both machines come up; before starting, check whether Hadoop is already running with ps -ef | grep hadoop, and kill any leftover processes; see the sketch after this list).
-> Prepare the input file, using hadoop/bin/aaaa.txt as a reference, then upload it: ./hadoop fs -put aaaa.txt input puts aaaa.txt into Hadoop's input directory.
-> In the hadoop/bin directory, run ./hadoop jar word.jar wordcount input output
-> View the result: ./hadoop fs -cat output/part-r-00000
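The full sequence as a shell sketch, run from hadoop/bin (the kill line is an assumption; substitute the actual PIDs reported by ps):
    ps -ef | grep hadoop                          # check for leftover Hadoop processes
    kill <pid>                                    # only if a process was found above (hypothetical PID)
    ./start-all.sh                                # start Hadoop on both machines
    ./hadoop fs -put aaaa.txt input               # upload the sample file to HDFS
    ./hadoop jar word.jar wordcount input output  # run the job
    ./hadoop fs -cat output/part-r-00000          # show the job result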
4. Program development
-> Get the hadoop-examples-*.jar and WordCount.java that ship with Hadoop; the source code is under src/examples.
-> Modify the Mapper and Reducer in WordCount.java.
-> Compile the modified WordCount.java and copy the resulting .class files into hadoop-examples-*.jar (see the sketch after this list).
-> Then upload the jar to the Hadoop cluster.
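A compile-and-repack sketch, assuming the Hadoop core jar sits next to WordCount.java (the wildcards are expanded by the shell to the installed version; additional dependency jars may be needed depending on the Hadoop release):
    mkdir classes
    javac -classpath hadoop-core-*.jar -d classes WordCount.java
    jar uf hadoop-examples-*.jar -C classes org/apache/hadoop/examples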
5. Program source code
package org.apache.hadoop.examples;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/**
 * Example
 *
 * Turns the file
 * npf||40
 * zhz||20
 * zhz||20
 * zhz||20
 * zhz||20
 * zhz||20
 *
 * into
 * npf 40
 * zhz 100
 */
public class WordCount {

    public static class TokenizerMapper extends
            Mapper<Object, Text, Text, IntWritable> {

        /**
         * Object key       the map input key; can be thought of as the line's offset
         * Text value       one full line of the input file
         * Context context  the job context
         */
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // the whole line
            String str = value.toString();
            // split on '|', e.g. npf||12 becomes {npf, 12}
            StringTokenizer itr = new StringTokenizer(str, "||");
            // iterate over the tokens
            while (itr.hasMoreTokens()) {
                // first token, e.g. npf
                String mykey = itr.nextToken();
                // second token, e.g. 12
                int myvalue = Integer.parseInt(itr.nextToken());
                // emit the map output key/value pair
                context.write(new Text(mykey), new IntWritable(myvalue));
            }
        }
    }
    public static class IntSumReducer extends
            Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        /**
         * Text key                      the reduce key, as emitted by the map
         * Iterable<IntWritable> values  all values for the same key
         * Context context               the job context
         */
        public void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            // running total
            int sum = 0;
            // accumulate all values for this key
            for (IntWritable val : values) {
                sum += val.get();
            }
            // assign the total to the reduce output value
            result.set(sum);
            // emit the reduce key/value pair
            context.write(key, result);
        }
    }
    public static void main(String[] args) throws Exception {
        // load the Hadoop configuration
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        // create the job
        Job job = new Job(conf, "word count");
        job.setJarByClass(WordCount.class);
        // Mapper class
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        // Reducer class
        job.setReducerClass(IntSumReducer.class);
        // type of the reduce output key
        job.setOutputKeyClass(Text.class);
        // type of the reduce output value
        job.setOutputValueClass(IntWritable.class);
        // input HDFS path
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        // output HDFS path
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        // submit the job and wait for completion
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
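With the sample input above, viewing the result with ./hadoop fs -cat output/part-r-00000 should print something like the following (key and value are separated by a tab in the default output format):
    npf    40
    zhz    100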