Hadoop version: hadoop-1.2.1
Eclipse version: eclipse-standard-kepler-SR2-win32-x86_64
WordCount.java comes from hadoop-1.2.1\src\examples\org\apache\hadoop\examples\WordCount.java

/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  public static class IntSumReducer
       extends Reducer<Text,IntWritable,Text,IntWritable> {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: wordcount <in> <out>");
      System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
Create a new Java project in Eclipse named WordCount, and a new class in it, also named WordCount.
Then overwrite the generated WordCount.java with the code above and change the package declaration at the top to wordcount. The modified source is as follows:

package wordcount;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  public static class IntSumReducer
       extends Reducer<Text,IntWritable,Text,IntWritable> {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: wordcount <in> <out>");
      System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
Notice that the source imports quite a few Hadoop-specific classes, none of which ship with the JDK:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

These dependencies must be added to the Eclipse build path, otherwise the compiler has no way of resolving the classes; it has to be told explicitly where they live. Start by adding hadoop-core-1.2.1.jar from the root of the Hadoop installation via Build Path -> Add External JARs, which resolves the imports above.
If you compile and run at this point, however, you will hit the following error:
Exception in thread "main" java.lang.Error: Unresolved compilation problems:
    The import org.apache.commons cannot be resolved
    The import org.apache.commons cannot be resolved
    The import org.codehaus cannot be resolved
    The import org.codehaus cannot be resolved
    Log cannot be resolved to a type
    LogFactory cannot be resolved
    Log cannot be resolved to a type
    (... "Log cannot be resolved to a type" repeated 18 more times ...)
    JsonFactory cannot be resolved to a type
    JsonFactory cannot be resolved to a type
    JsonGenerator cannot be resolved to a type

    at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:60)
    at wordcount.WordCount.main(WordCount.java:52)
The cause is still-missing dependency JARs: hadoop-core itself depends on libraries that are not yet on the build path. The unresolved Log and LogFactory come from commons-logging, and JsonFactory and JsonGenerator from the Jackson JARs (org.codehaus.jackson), all of which ship with Hadoop. Use Add External JARs once more to add all the JAR files under hadoop-1.2.1\lib.
Compile and run again, and this time it succeeds.
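For reference, the command-line equivalent of what the Eclipse build path does is simply putting those JARs on the compiler's classpath. A minimal sketch, assuming HADOOP_HOME points at the unpacked hadoop-1.2.1 directory and WordCount.java sits in the current directory (the paths here are illustrative):

# Compile against hadoop-core plus everything under lib/, writing .class files to classes/
mkdir -p classes
javac -classpath "$HADOOP_HOME/hadoop-core-1.2.1.jar:$HADOOP_HOME/lib/*" -d classes WordCount.java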
Finally, package it into a JAR file:
File -> Export
The JAR name is not required to match the class name; WordCount.jar could just as well be CountWord.jar, it makes no difference. Then click Finish.
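The Export wizard is not the only way. If you would rather package from the command line, a sketch like the one below should produce an equivalent JAR, assuming the compiled .class files are under bin (Eclipse's default output folder):

# Bundle everything under bin/ into WordCount.jar, preserving package directories
jar -cvf WordCount.jar -C bin .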
After that the job can be run on Hadoop (a detailed walkthrough of running WordCount can be found in Hadoop集群(第6期)_WordCount运行详解):

hadoop jar WordCount.jar wordcount.WordCount input output
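Before launching the job, the input directory must exist in HDFS, and the results end up in output/part-r-00000. A quick end-to-end check might look like this; sample.txt and its contents are made up purely for illustration:

# Stage a small input file in HDFS, run the job, and inspect the counts
echo "hello hadoop hello world" > sample.txt
hadoop fs -mkdir input
hadoop fs -put sample.txt input/
hadoop jar WordCount.jar wordcount.WordCount input output
hadoop fs -cat output/part-r-00000
# Expected result (tab-separated):
#   hadoop   1
#   hello    2
#   world    1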
Note that the main class must be given to hadoop jar by its fully qualified name. Because the source above declares

package wordcount;

the class files sit under a wordcount/ directory inside the JAR, so the main class is wordcount.WordCount rather than a bare WordCount; a bare name would only work for a class in the default package, i.e. one with no package declaration at all. Had the original declaration package org.apache.hadoop.examples; been kept, the command would instead be

hadoop jar WordCount.jar org.apache.hadoop.examples.WordCount input output
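You can verify the package hierarchy inside the JAR with jar tf. For the project above the listing should resemble the sketch below; the exact entries depend on how the JAR was exported:

# List the JAR contents to see the wordcount/ package directory
jar tf WordCount.jar
# META-INF/MANIFEST.MF
# wordcount/WordCount.class
# wordcount/WordCount$TokenizerMapper.class
# wordcount/WordCount$IntSumReducer.class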
The packaging step is also described in [hadoop]命令行编译并运行hadoop例子WordCount.
Reposted from: 林羽飞扬