Writing Your Own WordCount Program
package rock.lee.wordcount;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MyWordCount {

    /**
     * @author Rock Lee
     *
     * @Description The four type parameters are:
     *              LongWritable - input key type (byte offset of the line),
     *              Text         - input value type (the line itself),
     *              Text         - output key type (the word),
     *              IntWritable  - output value type (the count).
     */
    static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private static final IntWritable ONE = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // Read one line of input
            String lineValue = value.toString();
            // Split the line on whitespace (StringTokenizer's default
            // delimiters are " \t\n\r\f")
            StringTokenizer stzer = new StringTokenizer(lineValue);
            Text text = new Text();
            while (stzer.hasMoreTokens()) {
                // Each token becomes an output key
                String val = stzer.nextToken();
                text.set(val);
                // Emit key --> value, i.e. (word, 1)
                context.write(text, ONE);
            }
        }
    }
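To trace the map phase on the sample data used below: the line hello word makes map emit the pairs (hello, 1) and (word, 1), and the line bye word adds (bye, 1) and (word, 1). Across both sample input files this produces the 8 map output records reported in the job counters later.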
    /**
     * @author Rock Lee
     *
     * @Description Sums the counts emitted for each word.
     */
    static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }
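Between the two phases the framework sorts and groups the map output by key, so reduce is called once per distinct word with all of its 1s. For the sample input, the key word arrives with the values (1, 1, 1) and the reducer writes word 3; four distinct words mean four reduce calls, which matches the Reduce input groups=4 and Reduce output records=4 counters in the run below.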
    public static void main(String[] args) throws Exception {
        // Load the cluster configuration
        Configuration configuration = new Configuration();
        // Create the job and give it a name
        Job job = new Job(configuration, "WC");
        // The class used to locate the jar that must be shipped to the cluster
        job.setJarByClass(MyWordCount.class);
        // Wire up the Mapper and Reducer classes
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReduce.class);
        // Input and output paths come from the command line
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Key/value types of the final output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Submit the job, wait for completion, and print progress on the client
        boolean success = job.waitForCompletion(true);
        System.exit(success ? 0 : 1);
    }
}
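A side note on the driver: new Job(configuration, "WC") is the old Hadoop 1.x constructor and is deprecated on later releases. A minimal sketch of the same wiring on the Hadoop 2.x+ API, with an optional combiner added (an assumption for illustration, not part of the original program), could look like this:

    // Sketch only: an alternative driver body reusing the classes above.
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        // Job.getInstance(...) replaces the deprecated new Job(...) constructor
        Job job = Job.getInstance(configuration, "WC");
        job.setJarByClass(MyWordCount.class);
        job.setMapperClass(MyMapper.class);
        // Optional: reuse the reducer as a combiner to pre-aggregate counts on
        // the map side; safe here because integer addition is associative and
        // commutative, so partial sums do not change the final result
        job.setCombinerClass(MyReduce.class);
        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }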
Package the program as wc.jar, upload it to the Linux machine, and create two sample files, one and two:
[root@centos data]# more one
hello word
bye word
[root@centos data]# more two
hello word
bye hadoop
Upload the files to HDFS, under the directory /opt/wc/input.
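The exact upload commands were not shown in the original, so take this as an assumed sketch using the Hadoop 1.x fs shell:

[root@centos data]# hadoop fs -mkdir /opt/wc/input
[root@centos data]# hadoop fs -put one two /opt/wc/input

A recursive listing confirms the upload: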
[root@centos data]# hadoop fs -lsr /
-rw-r--r--   1 root supergroup         21 2015-06-11 04:08 /opt/wc/input/one
-rw-r--r--   1 root supergroup         23 2015-06-11 04:08 /opt/wc/input/two
Run wc.jar. Since no main class is passed on the command line, the jar's manifest must name MyWordCount as its Main-Class:
[root@centos data]# hadoop jar wc.jar /opt/wc/input/ /opt/wc/output
Warning: $HADOOP_HOME is deprecated.
15/06/11 04:29:10 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
15/06/11 04:29:10 INFO input.FileInputFormat: Total input paths to process : 2
15/06/11 04:29:10 INFO util.NativeCodeLoader: Loaded the native-hadoop library
15/06/11 04:29:10 WARN snappy.LoadSnappy: Snappy native library not loaded
15/06/11 04:29:10 INFO mapred.JobClient: Running job: job_201506110402_0006
15/06/11 04:29:11 INFO mapred.JobClient:  map 0% reduce 0%
15/06/11 04:29:32 INFO mapred.JobClient:  map 50% reduce 0%
15/06/11 04:29:42 INFO mapred.JobClient:  map 100% reduce 0%
15/06/11 04:30:05 INFO mapred.JobClient:  map 100% reduce 100%
15/06/11 04:30:05 INFO mapred.JobClient: Job complete: job_201506110402_0006
15/06/11 04:30:05 INFO mapred.JobClient: Counters: 29
15/06/11 04:30:05 INFO mapred.JobClient:   Job Counters
15/06/11 04:30:05 INFO mapred.JobClient:     Launched reduce tasks=1
15/06/11 04:30:05 INFO mapred.JobClient:     SLOTS_MILLIS_MAPS=40074
15/06/11 04:30:05 INFO mapred.JobClient:     Total time spent by all reduces waiting after reserving slots (ms)=0
15/06/11 04:30:05 INFO mapred.JobClient:     Total time spent by all maps waiting after reserving slots (ms)=0
15/06/11 04:30:05 INFO mapred.JobClient:     Launched map tasks=2
15/06/11 04:30:05 INFO mapred.JobClient:     Data-local map tasks=2
15/06/11 04:30:05 INFO mapred.JobClient:     SLOTS_MILLIS_REDUCES=21707
15/06/11 04:30:05 INFO mapred.JobClient:   File Output Format Counters
15/06/11 04:30:05 INFO mapred.JobClient:     Bytes Written=30
15/06/11 04:30:05 INFO mapred.JobClient:   FileSystemCounters
15/06/11 04:30:05 INFO mapred.JobClient:     FILE_BYTES_READ=96
15/06/11 04:30:05 INFO mapred.JobClient:     HDFS_BYTES_READ=260
15/06/11 04:30:05 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=160215
15/06/11 04:30:05 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=30
15/06/11 04:30:05 INFO mapred.JobClient:   File Input Format Counters
15/06/11 04:30:05 INFO mapred.JobClient:     Bytes Read=44
15/06/11 04:30:05 INFO mapred.JobClient:   Map-Reduce Framework
15/06/11 04:30:05 INFO mapred.JobClient:     Map output materialized bytes=102
15/06/11 04:30:05 INFO mapred.JobClient:     Map input records=4
15/06/11 04:30:05 INFO mapred.JobClient:     Reduce shuffle bytes=102
15/06/11 04:30:05 INFO mapred.JobClient:     Spilled Records=16
15/06/11 04:30:05 INFO mapred.JobClient:     Map output bytes=74
15/06/11 04:30:05 INFO mapred.JobClient:     CPU time spent (ms)=820
15/06/11 04:30:05 INFO mapred.JobClient:     Total committed heap usage (bytes)=413466624
15/06/11 04:30:05 INFO mapred.JobClient:     Combine input records=0
15/06/11 04:30:05 INFO mapred.JobClient:     SPLIT_RAW_BYTES=216
15/06/11 04:30:05 INFO mapred.JobClient:     Reduce input records=8
15/06/11 04:30:05 INFO mapred.JobClient:     Reduce input groups=4
15/06/11 04:30:05 INFO mapred.JobClient:     Combine output records=0
15/06/11 04:30:05 INFO mapred.JobClient:     Physical memory (bytes) snapshot=313032704
15/06/11 04:30:05 INFO mapred.JobClient:     Reduce output records=4
15/06/11 04:30:05 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=1127878656
15/06/11 04:30:05 INFO mapred.JobClient:     Map output records=8
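The counters line up with the sample data: Map input records=4 (two lines per file), Map output records=8 (one pair per token), Reduce input groups=4 (four distinct words), and Reduce output records=4. Combine input records and Combine output records are both 0 because no combiner was configured; the optional setCombinerClass sketch above would raise those numbers and shrink the shuffle.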
The /opt/wc/output directory must not already exist; if it does, the job aborts with an exception:
Exception in thread "main" org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory /opt/wc/output already exists
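To rerun the job, first delete the old output directory. On the Hadoop 1.x shell used throughout this post that would be (newer releases spell it hadoop fs -rm -r):

[root@centos data]# hadoop fs -rmr /opt/wc/output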
View the result:
[root@centos data]# hadoop fs -text /opt/wc/output/part-r-00000
Warning: $HADOOP_HOME is deprecated.
bye     2
hadoop  1
hello   2
word    3
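part-r-00000 is the output of the single reduce task; a job configured with N reducers would produce one such file per reducer, part-r-00000 through part-r-0000(N-1).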
The results can also be browsed through the web UI on port 50075. This page cannot be reached directly by IP address, only through the Linux hostname, so on Windows you must map the hostname to its IP address in the hosts file.
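For example, add a line like the following to C:\Windows\System32\drivers\etc\hosts (the IP address below is an assumption; the hostname centos matches the shell prompt above):

192.168.1.100   centos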

WordCount Program Flow Recap
This post walked through implementing WordCount in Java, packaging it as wc.jar, and running it on a Linux machine. The program takes two text files as input and counts words with the Hadoop MapReduce framework: the files are uploaded to HDFS, the job is launched with the hadoop jar command, and a result file is produced. The key runtime information was shown along the way, including the total input paths, running time, and job completion status.