The previous post in this series, Hadoop入门系列2 (installation, configuration, programming, deployment, and running), walked through developing a WordCount MapReduce application. This post continues the series with a MapReduce application that computes each student's average score.
package seu.mapreduce.computescore;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * The Mapper reads a plain-text file in which each line holds a student's name and one score.
 * A student who takes several courses therefore appears on several lines.
 * In Mapper<Key1, Value1, Key2, Value2>, Key1 and Value1 are the types of the key/value passed
 * into the map function, while Key2 and Value2 are the types written out through the Context object.
 * @author wjm
 */
public class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
    // map() receives one <key, value> pair and emits intermediate <key, value> pairs.
    // All output in MapReduce goes through the Context object.
    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();  // convert the input Text record to a String
        System.out.println(line);        // print the line read in, to ease debugging
        // split the input into lines (TextInputFormat already delivers one line per call,
        // so this tokenizer normally yields a single token)
        StringTokenizer tokenizer = new StringTokenizer(line, "\n");
        // process each line
        while (tokenizer.hasMoreTokens()) {
            // split the line on whitespace; nextToken() returns the next field
            StringTokenizer tokenizerLine = new StringTokenizer(tokenizer.nextToken());
            String strName = tokenizerLine.nextToken();   // student name
            String strScore = tokenizerLine.nextToken();  // student score
            Text name = new Text(strName);                // name of student
            int scoreInt = Integer.parseInt(strScore);    // score of student
            context.write(name, new IntWritable(scoreInt)); // emit <name, score>
        }
    }
}
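For reference, a small hypothetical input file (the names and scores below are made up, not taken from the original post) could look like the following; each line yields one <name, score> pair from the mapper:

Alice 82
Alice 90
Alice 76
Bob 88
Bob 94

With this input the reducer would receive <Alice, [82, 90, 76]> and <Bob, [88, 94]>, and the truncated integer averages written out would be 82 and 91.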
package seu.mapreduce.computescore;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Hadoop groups all values that share the same intermediate key and passes them to the reduce function,
 * so reduce receives input of the form <key, list of values>.
 * In Reducer<Key1, Value1, Key2, Value2>, Key1 and Value1 must match the types the Mapper wrote
 * to its Context, and Key2 and Value2 are the types this class writes to its own Context.
 * @author wjm
 */
public class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        int count = 0;
        Iterator<IntWritable> iterator = values.iterator();
        while (iterator.hasNext()) {
            sum += iterator.next().get(); // accumulate the total score
            count++;                      // count the number of courses
        }
        int average = sum / count;        // integer division, so the average is truncated
        context.write(key, new IntWritable(average));
    }
}
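Because both sum and count are ints, the average above is truncated toward zero. If a fractional average is preferred, one option (an addition of mine, not part of the original post) is to emit a DoubleWritable instead; a minimal sketch of such a reducer, assuming the mapper still emits IntWritable scores:

package seu.mapreduce.computescore;
import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class AverageReduce extends Reducer<Text, IntWritable, Text, DoubleWritable> {
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        int count = 0;
        for (IntWritable v : values) {
            sum += v.get();   // accumulate the total score
            count++;          // count the number of courses
        }
        // floating-point division keeps the fractional part of the average
        context.write(key, new DoubleWritable((double) sum / count));
    }
}

If this variant were used, the driver would also have to call job.setOutputValueClass(DoubleWritable.class) instead of IntWritable.class.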
package seu.mapreduce.computescore;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Score_Process extends Configured implements Tool {
    /**
     * Steps performed in run():
     * Job -- setJarByClass -- setJobName -- setOutputKey(Value)Class -- setMapper(Reducer)Class
     * -- setInput(Output)FormatClass -- setInput(Output)Paths
     */
    @Override
    public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        job.setJarByClass(Score_Process.class);
        job.setJobName("Score_Process");
        // the key/value types set here must match what reduce() writes via context.write(key, value)
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setMapperClass(Map.class);
        // Reduce is deliberately not registered as a combiner: averaging is not associative,
        // so running the reducer as a combiner could distort the final averages
        job.setReducerClass(Reduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new Score_Process(), args);
        System.exit(ret);
    }
}
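To run the job, one possible workflow (the jar name and HDFS paths below are assumptions for illustration, not from the original post) is to package the three classes into a jar and submit it with the hadoop command; note that the output directory must not already exist, or FileOutputFormat will fail the job:

hadoop jar score_process.jar seu.mapreduce.computescore.Score_Process /user/wjm/score_input /user/wjm/score_output
hadoop fs -cat /user/wjm/score_output/part-r-00000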