0 The map function is called once for every input line; the reduce function is called once for every group.
1 Code: pay attention to how the main method is written and to the differences from the hadoop1 style; the job is run with ant.
For setting up the cluster, see: hadoop2 — building an HA cluster with automatic failover and a YARN cluster.
Note the package imports used for the Mapper and Reducer classes and for the Context parameter of map() and reduce().
When the job is packaged and run on the cluster, job.setJarByClass is required.
package mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCountApp {
    public static class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        /**
         * The map function is called once for every input line.
         * @param key the byte offset of the line within the source file
         * @param value the text content of the line
         */
        @Override
        protected void map(LongWritable key, Text value, org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws java.io.IOException, InterruptedException {
            // Split the line on tabs and emit <word, 1> for every word.
            final String[] splited = value.toString().split("\t");
            for (String word : splited) {
                context.write(new Text(word), new LongWritable(1));
            }
        }
    }
    // Map output: <hello,1> <you,1> <hello,1> <me,1>
    // Sorted by key: <hello,1> <hello,1> <me,1> <you,1>
    // Grouped: <hello,{1,1}> <me,{1}> <you,{1}> (values with the same key are put together).
    // reduce() is called once per group, so for the 3 groups above it is called 3 times.
    public static class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        /**
         * The reduce function is called once for every group.
         * @param word the word (the key of the group)
         * @param times the values associated with that key
         */
        @Override
        protected void reduce(Text word, java.lang.Iterable<LongWritable> times, org.apache.hadoop.mapreduce.Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws java.io.IOException, InterruptedException {
            // Sum all the 1s emitted for this word and write <word, sum>.
            long sum = 0L;
            for (LongWritable longWritable : times) {
                sum += longWritable.get();
            }
            context.write(word, new LongWritable(sum));
        }
    }
    public static void main(String[] args) throws Exception {
        // Set up the Job object.
        final Configuration conf = new Configuration();
        final Job job = new Job(conf);
        job.setJobName(WordCountApp.class.getSimpleName());
        job.setJarByClass(WordCountApp.class);
        // Register the custom mapper and reducer on the Job.
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // Set the output key/value types of the map and reduce phases.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // Set the input data source and the output destination of the Job.
        FileInputFormat.addInputPaths(job, "/test"); // Note: addInputPaths (plural) accepts a comma-separated list of paths.
        FileOutputFormat.setOutputPath(job, new Path("/out1"));
        // Run the Job.
        job.waitForCompletion(true);
    }
}
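The driver above uses the new Job(conf) constructor, which still works in Hadoop 2 but is marked deprecated in favor of the Job.getInstance factory method. Below is a minimal sketch of the same driver written with that factory method (same classes and paths as above), not a required change:

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Hadoop 2 factory method instead of the deprecated Job(Configuration) constructor.
    Job job = Job.getInstance(conf, WordCountApp.class.getSimpleName());
    job.setJarByClass(WordCountApp.class);
    job.setMapperClass(WordCountMapper.class);
    job.setReducerClass(WordCountReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    FileInputFormat.addInputPaths(job, "/test");
    FileOutputFormat.setOutputPath(job, new Path("/out1"));
    // Exit with a non-zero status if the job fails.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}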
2 build.xml:
<?xml version="1.0" encoding="UTF-8"?>
<!-- This file sits at the same level as the src and lib folders. -->
<project name="hadoop2-test-project" basedir="." default="sshexec">
  <!-- Property settings -->
  <property environment="env" />
  <property file="build.properties" />
  <property name="src.dir" value="${basedir}/src" />
  <property name="java.lib.dir" value="${env.JAVA_HOME}/lib" />
  <property name="classes.dir" value="${basedir}/classes" />
  <property name="dist.dir" value="${basedir}/dist" />
  <property name="project.lib.dir" value="${basedir}/lib" />
  <property name="localpath.dir" value="${basedir}" />
  <property name="remote.home" value="~"/>
  <!-- Editable: hostname or IP of the Hadoop cluster -->
  <property name="remote.hostname" value="h2master"/>
  <!-- Editable: Linux username for logging in to the Hadoop cluster host -->
  <property name="remote.username" value="root"/>
  <!-- Editable: Linux password for logging in to the Hadoop cluster host -->
  <property name="remote.password" value="123456"/>
  <!-- Editable: the main class to run; at run time it is assembled into "hadoop jar xxx.jar MainClass" -->
  <property name="main.class" value="mapreduce.WordCountApp"/>
  <!-- Editable: the Hadoop installation path on the Linux host -->
  <property name="hadoop.path" value="/usr/local/hadoop2.5"/>
  <!-- Basic compile classpath -->
  <path id="compile.classpath">
    <fileset dir="${java.lib.dir}">
      <include name="tools.jar" />
    </fileset>
    <fileset dir="${project.lib.dir}">
      <include name="*.jar" />
    </fileset>
  </path>
  <!-- Run classpath -->
  <path id="run.classpath">
    <path refid="compile.classpath" />
    <pathelement location="${classes.dir}" />
  </path>
  <!-- Clean: delete temporary directories -->
  <target name="clean" description="Clean: delete temporary directories">
    <!--delete dir="${build.dir}" /-->
    <delete dir="${dist.dir}" />
    <delete dir="${classes.dir}" />
    <echo level="info">Clean finished</echo>
  </target>
  <!-- Init: create directories, copy files -->
  <target name="init" depends="clean" description="Init: create directories, copy files">
    <mkdir dir="${classes.dir}" />
    <mkdir dir="${dist.dir}" />
  </target>
  <!-- Compile source files -->
  <target name="compile" depends="init" description="Compile source files">
    <javac srcdir="${src.dir}" destdir="${classes.dir}" source="1.6" target="1.6" includeAntRuntime="false">
      <classpath refid="compile.classpath" />
      <compilerarg line="-encoding UTF-8 "/>
    </javac>
  </target>
  <!-- Package the class files into a jar -->
  <target name="jar" depends="compile" description="Package the class files into a jar">
    <jar jarfile="${dist.dir}/jar.jar">
      <fileset dir="${classes.dir}" includes="**/*.*" />
    </jar>
  </target>
  <!-- Upload to the server.
       Note: copy jsch-0.1.51 from the lib directory to $ANT_HOME/lib. If Ant is run from inside Eclipse,
       add jsch-0.1.51 under Window->Preferences->Ant->Runtime->Classpath instead.
  -->
<target name="ssh" depends="jar">
<scp file="${dist.dir}/jar.jar" todir="${remote.username}@${remote.hostname}:${remote.home}" password="${remote.password}" trust="true"/>
</target>
<target name="sshexec" depends="ssh">
<sshexec host="${remote.hostname}" username="${remote.username}" password="${remote.password}" trust="true" command="${hadoop.path}/bin/hadoop jar ${remote.home}/jar.jar ${main.class}"/>
</target>
</project>
3 Explanation of the execution output:
15/01/12 17:15:48 INFO mapreduce.Job: Counters: 49 (49 counters in total)
[sshexec] File System Counters (counter group name; the lines below list the counters in this group)
[sshexec] FILE: Number of bytes read=65 (the FILE counters ---> local Linux disk I/O performed while the job runs)
[sshexec] FILE: Number of bytes written=197929
[sshexec] FILE: Number of read operations=0
[sshexec] FILE: Number of large read operations=0
[sshexec] FILE: Number of write operations=0
[sshexec] HDFS: Number of bytes read=104 (the HDFS counters ---> details of HDFS usage while the job runs)
[sshexec] HDFS: Number of bytes written=19
[sshexec] HDFS: Number of read operations=6
[sshexec] HDFS: Number of large read operations=0
[sshexec] HDFS: Number of write operations=2
[sshexec] Job Counters (counter group name)
[sshexec] Launched map tasks=1 (number of map tasks launched; roughly, number of map tasks = number of HDFS blocks holding the input files)
[sshexec] Launched reduce tasks=1
[sshexec] Data-local map tasks=1 (if the data is stored on the same machine that runs the task, then Data-local map tasks = map tasks; data-local tasks save network cost because the data no longer has to be read from another node)
[sshexec] Total time spent by all maps in occupied slots (ms)=104236
[sshexec] Total time spent by all reduces in occupied slots (ms)=18430
[sshexec] Total time spent by all map tasks (ms)=104236
[sshexec] Total time spent by all reduce tasks (ms)=18430
[sshexec] Total vcore-seconds taken by all map tasks=104236
[sshexec] Total vcore-seconds taken by all reduce tasks=18430
[sshexec] Total megabyte-seconds taken by all map tasks=106737664
[sshexec] Total megabyte-seconds taken by all reduce tasks=18872320
[sshexec] Map-Reduce Framework (counter group name)
[sshexec] Map input records=2 (the input file has two lines, so the map function is called twice)
[sshexec] Map output records=4 (map output: <hello,1> <you,1> <hello,1> <me,1>)
[sshexec] Map output bytes=51
[sshexec] Map output materialized bytes=65
[sshexec] Input split bytes=85
[sshexec] Combine input records=0
[sshexec] Combine output records=0
[sshexec] Reduce input groups=3 (number of reduce input groups: <hello,{1,1}> <you,{1}> <me,{1}>)
[sshexec] Reduce shuffle bytes=65
[sshexec] Reduce input records=4 (number of reduce input records, i.e. the 4 values spread across those groups)
[sshexec] Reduce output records=3 (result written after reduce: hello 2, you 1, me 1)
[sshexec] Spilled Records=8
[sshexec] Shuffled Maps =1
[sshexec] Failed Shuffles=0
[sshexec] Merged Map outputs=1
[sshexec] GC time elapsed (ms)=1768
[sshexec] CPU time spent (ms)=2990
[sshexec] Physical memory (bytes) snapshot=212107264
[sshexec] Virtual memory (bytes) snapshot=721317888
[sshexec] Total committed heap usage (bytes)=125792256
[sshexec] Shuffle Errors (counter group name)
[sshexec] BAD_ID=0
[sshexec] CONNECTION=0
[sshexec] IO_ERROR=0
[sshexec] WRONG_LENGTH=0
[sshexec] WRONG_MAP=0
[sshexec] WRONG_REDUCE=0
[sshexec] File Input Format Counters (counter group name)
[sshexec] Bytes Read=19
[sshexec] File Output Format Counters (counter group name)
[sshexec] Bytes Written=19
BUILD SUCCESSFUL
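The counters printed above can also be read programmatically in the driver once the job finishes. A minimal sketch, assuming it replaces the plain job.waitForCompletion(true) call at the end of main() in WordCountApp:

// Sketch: read a built-in counter after the job completes.
// TaskCounter.MAP_INPUT_RECORDS corresponds to the "Map input records" line in the log above.
if (job.waitForCompletion(true)) {
    org.apache.hadoop.mapreduce.Counters counters = job.getCounters();
    long mapInputRecords = counters
            .findCounter(org.apache.hadoop.mapreduce.TaskCounter.MAP_INPUT_RECORDS)
            .getValue();
    System.out.println("Map input records = " + mapInputRecords);
}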
Custom counters, a combiner, and a custom partitioner work the same way as in hadoop1; see the hadoop1 notes for details. A short sketch with the Hadoop 2 API used above follows.
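Minimal sketch only; the counter group name "WordCountApp" and counter name "EmptyWords" are made-up examples, not part of the original code:

// Sketch: register the reducer as a combiner in main(); this is safe for word count
// because summing partial counts is associative.
job.setCombinerClass(WordCountReducer.class);
// A custom partitioner would be registered the same way via job.setPartitionerClass(...).

// Sketch: a custom counter incremented inside WordCountMapper.map();
// the group and counter names here are arbitrary examples.
if (word.isEmpty()) {
    context.getCounter("WordCountApp", "EmptyWords").increment(1L);
}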