hadoop wordcount入门

最新推荐文章于 2024-10-04 18:47:42 发布

转载最新推荐文章于 2024-10-04 18:47:42 发布 · 77 阅读

0 ·

CC 4.0 BY-SA版权

原文链接：http://www.cnblogs.com/xingxing1024/p/7465301.html

本文介绍如何在Ubuntu 14.04上配置伪分布式Hadoop 1.0.4，并实现WordCount入门程序。该程序通过MapReduce框架对文本进行分词计数。

配置 ubuntu14.04 伪分布式 hadoop1.0.4

wordcount入门程序，摘自hadoop基础教程

import java.io.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {
　　　　　//map操作
	public static class WordCountMapper extends Mapper<Object, Text, Text, IntWritable> {
		
		private final static IntWritable one = new IntWritable(1);
		private Text word = new Text();
		
		public void map(Object key, Text value, Context context) throws IOException, InterruptedException{
			String[] words = value.toString().split(" ");
			for(String str:words) {
				word.set(str);
				context.write(word, one);
			}
		}
	}
	
        //reduce操作
	public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
		public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException{
			int total = 0;
			for(IntWritable val : values) {
				total++;
			}
			context.write(key, new IntWritable(total));
		}
	}
	public static void main(String[] args) {
		try{
			//创建Configuration对象，用于设置其他选项
			Configuration conf = new Configuration();
			//创建作业对象
			Job job = new Job(conf, "WordCount");
			//设置作业jarfile中主类名字
			job.setJarByClass(WordCount.class);
			//设置mapper类
			job.setMapperClass(WordCountMapper.class);
			//设置reduce类
			job.setReducerClass(WordCountReducer.class);
			//设置输出的类型
			job.setOutputKeyClass(Text.class);
			job.setOutputValueClass(IntWritable.class);
			//设置输入和输出文件路径
			FileInputFormat.addInputPath(job, new Path(args[0]));
			FileOutputFormat.setOutputPath(job, new Path(args[1]));
			//等待程序退出
			System.exit(job.waitForCompletion(true)?0:1);
		}catch(Exception e) {
			//system.out.println("出错");
		}
	}
}

转载于:https://www.cnblogs.com/xingxing1024/p/7465301.html