I recently started an internship at a big data company. I had never touched big data before, so after joining I began learning from the basics, starting with word count.
1. Counting word occurrences:
The idea is simple. Words are separated by spaces, so the mapper reads the words in split on spaces and writes them to the context as pairs like {am,1}, {hello,1}, {word,1}.... The reducer takes the mapper's output and, for each key, counts how many times that word occurred. Between the map and reduce phases, MapReduce partitions and sorts the intermediate keys by default, which groups identical words together; this is the principle behind word count. The full code is on plenty of blogs, so it isn't repeated here (the code for the variation below is nearly identical), but a minimal mapper sketch follows.
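A minimal sketch of that plain word-count mapper (the class name PlainWordCountMapper is just an illustrative placeholder; the reducer and driver are the same as the ones shown further down for the variation):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

// Hypothetical class name, for illustration only; the key is the whole word.
public class PlainWordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split the line on runs of whitespace and emit {word, 1} for each word.
        for (String word : value.toString().split("\\s+")) {
            if (!word.isEmpty()) {
                context.write(new Text(word), new IntWritable(1));
            }
        }
    }
}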
Variation 1: counting occurrences of each word's first letter
The principle is the same as word count. The difference is that before, the key was the whole word, while now the key is the word's first letter. Words are still read in split on spaces, the first letter is extracted and written to the context, and the mapper's output becomes the reducer's input.
Code:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split on runs of whitespace. Note that String.replace treats its
        // arguments literally (no regex), so the original replace("\\s+", " ")
        // never matched anything; splitting on the regex "\\s+" handles tabs
        // and repeated spaces directly.
        String[] words = value.toString().split("\\s+");
        // Emit {firstLetter, 1} for every non-empty word.
        for (String word : words) {
            if (word.isEmpty()) {
                continue; // skip empty tokens, e.g. from a leading space
            }
            context.write(new Text(word.substring(0, 1)), new IntWritable(1));
        }
    }
}
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text word, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum the values rather than just counting iterations; with all-ones
        // input the result is the same, but summing stays correct if this
        // class is ever reused as a combiner.
        int count = 0;
        for (IntWritable value : values) {
            count += value.get();
        }
        context.write(word, new IntWritable(count));
    }
}
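Because the reducer sums its values, it can also double as a combiner to cut down the data shuffled between map and reduce. This is an optional addition, not part of the original job setup; one extra line in the driver below enables it:

job.setCombinerClass(WordCountReducer.class);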
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class WordCountJob {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Create a job instance.
        Job job = Job.getInstance(conf, "wordcount");
        job.setJarByClass(WordCountJob.class);

        // Mapper and its intermediate output types.
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Reducer and the job's final output types.
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Plain-text input and output; paths come from the command line.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextInputFormat.setInputPaths(job, new Path(args[0]));
        TextOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
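As a quick sanity check (the jar name and the /input and /output paths are placeholders), running the job over a file containing the two lines "hello world" and "am hello" should yield the first-letter counts below; TextOutputFormat separates key and value with a tab by default:

hadoop jar wordcount.jar WordCountJob /input /output

a	1
h	2
w	1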