0 The map function is called once for every input line; the reduce function is called once for every group.
1 Code: pay attention to how the main method is written and to the differences from the hadoop1 style; the job is run with ant.
For setting up the cluster, see: hadoop2 — building an HA cluster with automatic failover and a YARN cluster.
Note the package imports used for the Mapper and Reducer classes and for the Context parameter of map() and reduce().
When the job is packaged and run on the cluster, job.setJarByClass is required.
package mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCountApp {
    public static class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        /**
         * The map function is called once for every input line.
         * @param key the byte offset of the line within the source file
         * @param value the text content of the line
         */
        @Override
        protected void map(LongWritable key, Text value, org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws java.io.IOException, InterruptedException {
            // Split the line on tabs and emit <word, 1> for every word.
            final String[] splited = value.toString().split("\t");
            for (String word : splited) {
                context.write(new Text(word), new LongWritable(1));
            }
        }
    }
    // Map output: <hello,1> <you,1> <hello,1> <me,1>
    // Sorted by key: <hello,1> <hello,1> <me,1> <you,1>
    // Grouped: <hello,{1,1}> <me,{1}> <you,{1}> (values with the same key are put together).
    // reduce() is called once per group, so for the 3 groups above it is called 3 times.
    public static class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        /**
         * The reduce function is called once for every group.
         * @param word the word (the key of the group)
         * @param times the values associated with that key
         */
        @Override
        protected void reduce(Text word, java.lang.Iterable<LongWritable> times, org.apache.hadoop.mapreduce.Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws java.io.IOException, InterruptedException {
            // Sum all the 1s emitted for this word and write <word, sum>.
            long sum = 0L;
            for (LongWritable longWritable : times) {
                sum += longWritable.get();
            }
            context.write(word, new LongWritable(sum));
        }
    }
    public static void main(String[] args) throws Exception {
        // Set up the Job object.
        final Configuration conf = new Configuration();
        final Job job = new Job(conf);
        job.setJobName(WordCountApp.class.getSimpleName());
        job.setJarByClass(WordCountApp.class);
        // Register the custom mapper and reducer on the Job.
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // Set the output key/value types of the map and reduce phases.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // Set the input data source and the output destination of the Job.
        FileInputFormat.addInputPaths(job, "/test"); // Note: addInputPaths (plural) accepts a comma-separated list of paths.
        FileOutputFormat.setOutputPath(job, new Path("/out1"));
        // Run the Job.
        job.waitForCompletion(true);
    }
}
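The driver above uses the new Job(conf) constructor, which still works in Hadoop 2 but is marked deprecated in favor of the Job.getInstance factory method. Below is a minimal sketch of the same driver written with that factory method (same classes and paths as above), not a required change:

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Hadoop 2 factory method instead of the deprecated Job(Configuration) constructor.
    Job job = Job.getInstance(conf, WordCountApp.class.getSimpleName());
    job.setJarByClass(WordCountApp.class);
    job.setMapperClass(WordCountMapper.class);
    job.setReducerClass(WordCountReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    FileInputFormat.addInputPaths(job, "/test");
    FileOutputFormat.setOutputPath(job, new Path("/out1"));
    // Exit with a non-zero status if the job fails.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}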
2 build.xml:
<?xml version="1.0" encoding="UTF-8"?>
<!-- This file sits at the same level as the src and lib folders. -->
<project name="hadoop2-test-project" basedir="." default="sshexec">
  <!-- Property settings -->
  <property environment="env" />
  <property file="build.properties" />
  <property name="src.dir" value="${basedir}/src" />
  <property name="java.lib.dir" value="${env.JAVA_HOME}/lib" />
  <property name="classes.dir" value="${basedir}/classes" />
  <property name="dist.dir" value="${basedir}/dist" />
  <property name="project.lib.dir" value="${basedir}/lib" />
  <property name="localpath.dir" value="${basedir}" />
  <property name="remote.home" value="~"/>
  <!-- Editable: hostname or IP of the Hadoop cluster -->
  <property name="remote.hostname" value="h2master"/>
  <!-- Editable: Linux username for logging in to the Hadoop cluster host -->
  <property name="remote.username" value="root"/>
  <!-- Editable: Linux password for logging in to the Hadoop cluster host -->
  <property name="remote.password" value="123456"/>
  <!-- Editable: the main class to run; at run time it is assembled into "hadoop jar xxx.jar MainClass" -->
  <property name="main.class" value="mapreduce.WordCountApp"/>
  <!-- Editable: the Hadoop installation path on the Linux host -->
  <property name="hadoop.path" value="/usr/local/hadoop2.5"/>
  <!-- Basic compile classpath -->
  <path id="compile.classpath">
    <fileset dir="${java.lib.dir}">
      <include name="tools.jar" />
    </fileset>
    <fileset dir="${project.lib.dir}">
      <include name="*.jar" />
    </fileset>
  </path>
  <!-- Run classpath -->
  <path id="run.classpath">
    <path refid="compile.classpath" />
    <pathelement location="${classes.dir}" />
  </path>
  <!-- Clean: delete temporary directories -->
  <target name="clean" description="Clean: delete temporary directories">
    <!--delete dir="${build.dir}" /-->
    <delete dir="${dist.dir}" />
    <delete dir="${classes.dir}" />
    <echo level="info">Clean finished</echo>
  </target>
  <!-- Init: create directories, copy files -->
  <target name="init" depends="clean" description="Init: create directories, copy files">
    <mkdir dir="${classes.dir}" />
    <mkdir dir="${dist.dir}" />
  </target>
  <!-- Compile source files -->
  <target name="compile" depends="init" description="Compile source files">
    <javac srcdir="${src.dir}" destdir="${classes.dir}" source="1.6" target="1.6" includeAntRuntime="false">
      <classpath refid="compile.classpath" />
      <compilerarg line="-encoding UTF-8 "/>
    </javac>
  </target>
  <!-- Package the class files into a jar -->
  <target name="jar" depends="compile" description="Package the class files into a jar">
    <jar jarfile="${dist.dir}/jar.jar">
      <fileset dir="${classes.dir}" includes="**/*.*" />
    </jar>
  </target>
  <!-- Upload to the server.
       Note: copy jsch-0.1.51 from the lib directory to $ANT_HOME/lib. If Ant is run from inside Eclipse,
       add jsch-0.1.51 under Window->Preferences->Ant->Runtime->Classpath instead.
  -->
<target name="ssh" depends="jar">
<scp file="${dist.dir}/jar.jar" todir="${remote.username}@${remote.hostname}:${remote.home}" password="${remote.password}" trust="true"/>
</target>
<target name="sshexec" depends="ssh">
<sshexec host="${remote.hostname}" username="${remote.username}" password="${remote.password}" trust="true" command="${hadoop.path}/bin/hadoop jar ${remote.home}/jar.jar ${main.class}"/>
</target>
</project>
3 Explanation of the execution output:
15/01/12 17:15:48 INFO mapreduce.Job: Counters: 49 (49 counters in total)
[sshexec] File System Counters (counter group name; the lines below list the counters in this group)
[sshexec] FILE: Number of bytes read=65 (the FILE counters ---> local Linux disk I/O performed while the job runs)
[sshexec] FILE: Number of bytes written=197929
[sshexec] FILE: Number of read operations=0
[sshexec] FILE: Number of large read operations=0
[sshexec] FILE: Number of write operations=0
[sshexec] HDFS: Number of bytes read=104 (the HDFS counters ---> details of HDFS usage while the job runs)
[sshexec] HDFS: Number of bytes written=19
[sshexec] HDFS: Number of read operations=6
[sshexec] HDFS: Number of large read operations=0
[sshexec] HDFS: Number of write operations=2
[sshexec] Job Counters (counter group name)
[sshexec] Launched map tasks=1 (number of map tasks launched; roughly, number of map tasks = number of HDFS blocks holding the input files)
[sshexec] Launched reduce tasks=1
[sshexec] Data-local map tasks=1 (if the data is stored on the same machine that runs the task, then Data-local map tasks = map tasks; data-local tasks save network cost because the data no longer has to be read from another node)
[sshexec] Total time spent by all maps in occupied slots (ms)=104236
[sshexec] Total time spent by all reduces in occupied slots (ms)=18430
[sshexec] Total time spent by all map tasks (ms)=104236
[sshexec] Total time spent by all reduce tasks (ms)=18430
[sshexec] Total vcore-seconds taken by all map tasks=104236
[sshexec] Total vcore-seconds taken by all reduce tasks=18430
[sshexec] Total megabyte-seconds taken by all map tasks=106737664
[sshexec] Total megabyte-seconds taken by all reduce tasks=18872320
[sshexec] Map-Reduce Framework (counter group name)
[sshexec] Map input records=2 (the input file has two lines, so the map function is called twice)
[sshexec] Map output records=4 (map output: <hello,1> <you,1> <hello,1> <me,1>)
[sshexec] Map output bytes=51
[sshexec] Map output materialized bytes=65
[sshexec] Input split bytes=85
[sshexec] Combine input records=0
[sshexec] Combine output records=0
[sshexec] Reduce input groups=3 (number of reduce input groups: <hello,{1,1}> <you,{1}> <me,{1}>)
[sshexec] Reduce shuffle bytes=65
[sshexec] Reduce input records=4 (number of reduce input records, i.e. the 4 values spread across those groups)
[sshexec] Reduce output records=3 (result written after reduce: hello 2, you 1, me 1)
[sshexec] Spilled Records=8
[sshexec] Shuffled Maps =1
[sshexec] Failed Shuffles=0
[sshexec] Merged Map outputs=1
[sshexec] GC time elapsed (ms)=1768
[sshexec] CPU time spent (ms)=2990
[sshexec] Physical memory (bytes) snapshot=212107264
[sshexec] Virtual memory (bytes) snapshot=721317888
[sshexec] Total committed heap usage (bytes)=125792256
[sshexec] Shuffle Errors (counter group name)
[sshexec] BAD_ID=0
[sshexec] CONNECTION=0
[sshexec] IO_ERROR=0
[sshexec] WRONG_LENGTH=0
[sshexec] WRONG_MAP=0
[sshexec] WRONG_REDUCE=0
[sshexec] File Input Format Counters (counter group name)
[sshexec] Bytes Read=19
[sshexec] File Output Format Counters (counter group name)
[sshexec] Bytes Written=19
BUILD SUCCESSFUL
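The counters printed above can also be read programmatically in the driver once the job finishes. A minimal sketch, assuming it replaces the plain job.waitForCompletion(true) call at the end of main() in WordCountApp:

// Sketch: read a built-in counter after the job completes.
// TaskCounter.MAP_INPUT_RECORDS corresponds to the "Map input records" line in the log above.
if (job.waitForCompletion(true)) {
    org.apache.hadoop.mapreduce.Counters counters = job.getCounters();
    long mapInputRecords = counters
            .findCounter(org.apache.hadoop.mapreduce.TaskCounter.MAP_INPUT_RECORDS)
            .getValue();
    System.out.println("Map input records = " + mapInputRecords);
}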
Custom counters, a combiner, and a custom partitioner work the same way as in hadoop1; see the hadoop1 notes for details. A short sketch with the Hadoop 2 API used above follows.
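Minimal sketch only; the counter group name "WordCountApp" and counter name "EmptyWords" are made-up examples, not part of the original code:

// Sketch: register the reducer as a combiner in main(); this is safe for word count
// because summing partial counts is associative.
job.setCombinerClass(WordCountReducer.class);
// A custom partitioner would be registered the same way via job.setPartitionerClass(...).

// Sketch: a custom counter incremented inside WordCountMapper.map();
// the group and counter names here are arbitrary examples.
if (word.isEmpty()) {
    context.getCounter("WordCountApp", "EmptyWords").increment(1L);
}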