http://chqz1987.blog.163.com/blog/static/514383112013261505866/
Suppose we run the following command:
cd /home/hadoop/ && hadoop jar ./test/wordcount/wordcount.jar org.codetree.hadoop.v1.WordCount /test/chqz/input /test/chqz/output
What exactly does this command do under the hood?
1. First, in the ${HADOOP_HOME}/bin/hadoop script we can see code like the following:
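Abridged and lightly paraphrased from a Hadoop 1.x bin/hadoop (the exact lines vary between versions), the relevant pieces look roughly like this:

```bash
# command dispatch: "hadoop jar" selects the RunJar class
elif [ "$COMMAND" = "jar" ] ; then
  CLASS=org.apache.hadoop.util.RunJar
  ...

# near the end of the script
if [ "$starting_secure_dn" = "true" ]; then
  # only taken when launching a secure DataNode under jsvc
  ...
else
  # run it
  ...
fi
```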
Because $starting_secure_dn is false here (see the hadoop script itself), the script ultimately executes this line:
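That line is, approximately (again, version-dependent):

```bash
# run it
exec "$JAVA" $JAVA_HEAP_MAX $HADOOP_OPTS -classpath "$CLASSPATH" $CLASS "$@"
```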
From this shell script it is clear that `hadoop jar` actually runs the class org.apache.hadoop.util.RunJar: the line under `# run it` launches a JVM with the classpath the script has assembled, and invokes that class's main method with the remaining command-line arguments.
2. Next, let's look at what happens inside org.apache.hadoop.util.RunJar:
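The sketch below is a condensed Java paraphrase of what RunJar.main() does with our command line, not the verbatim Hadoop source (the class name RunJarSketch is made up, and error handling, jar extraction, and temp-dir cleanup are omitted; details vary by Hadoop version):

```java
import java.io.File;
import java.lang.reflect.Method;
import java.net.URL;
import java.net.URLClassLoader;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.jar.JarFile;
import java.util.jar.Manifest;

public class RunJarSketch {
    public static void main(String[] args) throws Throwable {
        // args[0] is the jar given on the command line: ./test/wordcount/wordcount.jar
        File jarPath = new File(args[0]);

        // If the jar's manifest declares a Main-Class, use it; otherwise the next
        // argument (org.codetree.hadoop.v1.WordCount here) names the main class.
        String mainClassName;
        int firstArg;
        try (JarFile jar = new JarFile(jarPath)) {
            Manifest manifest = jar.getManifest();
            mainClassName = (manifest == null) ? null
                    : manifest.getMainAttributes().getValue("Main-Class");
        }
        if (mainClassName == null) {
            mainClassName = args[1];
            firstArg = 2;
        } else {
            firstArg = 1;
        }

        // RunJar unpacks the jar into a temp directory under hadoop.tmp.dir and
        // registers a shutdown hook to delete it; here we only create the directory.
        File workDir = Files.createTempDirectory("hadoop-unjar").toFile();

        // Build a class loader covering the jar itself plus the unpacked classes/
        // directory, so the job's own classes and dependencies are visible.
        List<URL> classPath = new ArrayList<>();
        classPath.add(new File(workDir, "classes/").toURI().toURL());
        classPath.add(jarPath.toURI().toURL());
        ClassLoader loader = new URLClassLoader(classPath.toArray(new URL[0]));
        Thread.currentThread().setContextClassLoader(loader);

        // Finally, invoke the main class's main() by reflection with the remaining
        // arguments: {"/test/chqz/input", "/test/chqz/output"} in our example.
        Class<?> mainClass = Class.forName(mainClassName, true, loader);
        Method main = mainClass.getMethod("main", String[].class);
        String[] jobArgs = Arrays.copyOfRange(args, firstArg, args.length);
        main.invoke(null, (Object) jobArgs);
    }
}
```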
Putting it together, the segments of the command hadoop jar ./test/wordcount/wordcount.jar org.codetree.hadoop.v1.WordCount /test/chqz/input /test/chqz/output mean:
(1) hadoop: the shell script under ${HADOOP_HOME}/bin.
(2) jar: the command argument expected by the hadoop script.
(3) ./test/wordcount/wordcount.jar: the full path, on the local file system, of the jar to run; passed to the RunJar class.
(4) org.codetree.hadoop.v1.WordCount: the class containing the main method; passed to the RunJar class.
(5) /test/chqz/input: passed to the WordCount class as a path on the DFS file system, indicating where the input data comes from.
(6) /test/chqz/output: passed to the WordCount class as a path on the DFS file system, indicating where the output data goes.
3. Source code of the org.codetree.hadoop.v1.WordCount class:
WordCount.java:

```java
package org.codetree.hadoop.v1;

import java.io.IOException;
import java.util.*;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

public class WordCount {

    // Mapper: emits (word, 1) for every token in the input line
    public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                output.collect(word, one);
            }
        }
    }

    // Reducer (also used as the combiner): sums the counts for each word
    public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(WordCount.class);
        conf.setJobName("wordcount");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(Map.class);
        conf.setCombinerClass(Reduce.class);
        conf.setReducerClass(Reduce.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // args[0] and args[1] are the DFS input and output paths passed through by RunJar
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}
```
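To make the data flow concrete, consider a hypothetical run (assuming the default single-reduce setup shown above): if a file under /test/chqz/input contained the single line `Hello World Bye World`, the Map step would emit (Hello, 1), (World, 1), (Bye, 1), (World, 1), and after the combine/reduce step the part file written under /test/chqz/output would read:

```
Bye     1
Hello   1
World   2
```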
This article has walked through how the WordCount program works and is executed in a Hadoop environment: starting from a concrete command, it explained how to run the WordCount job and analyzed the mechanism behind it, including how the text data is processed with MapReduce.