1.
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class MaxTemperatureMapper extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, IntWritable> {

    private static final int MISSING = 9999;

    @Override
    public void map(LongWritable key, Text value,
            OutputCollector<Text, IntWritable> output, Reporter reporter)
            throws IOException {
        String line = value.toString();
        // The year occupies columns 15-18 of the NCDC record
        String year = line.substring(15, 19);
        int airTemperature;
        if (line.charAt(87) == '+') {
            // parseInt doesn't accept a leading plus sign
            airTemperature = Integer.parseInt(line.substring(88, 92));
        } else {
            airTemperature = Integer.parseInt(line.substring(87, 92));
        }
        String quality = line.substring(92, 93);
        // Emit (year, temperature) only for valid, non-missing readings
        if (airTemperature != MISSING && quality.matches("[01459]")) {
            output.collect(new Text(year), new IntWritable(airTemperature));
        }
    }
}
The Mapper interface is a generic type with four formal type parameters that specify the input key, input value, output key, and output value types of the map function. Rather than using Java's built-in types, Hadoop provides its own set of basic types that are optimized for network serialization; they are all found in the org.apache.hadoop.io package. LongWritable corresponds to Java's Long, Text to Java's String, and IntWritable to Java's Integer.
The map() method is also passed an OutputCollector instance, to which it writes its output.
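As a minimal, self-contained sketch of how these Writable types wrap plain Java values (not part of the original example; the class name WritableTypesDemo is made up for illustration):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

public class WritableTypesDemo {
    public static void main(String[] args) {
        // Text wraps a String, IntWritable an int, LongWritable a long
        Text year = new Text("1950");
        IntWritable temperature = new IntWritable(22);
        LongWritable offset = new LongWritable(106L);

        // The wrapped Java values are recovered with toString() / get()
        System.out.println(year.toString());   // 1950
        System.out.println(temperature.get()); // 22
        System.out.println(offset.get());      // 106
    }
}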
import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

public class MaxTemperatureReducer extends MapReduceBase
        implements Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    public void reduce(Text key, Iterator<IntWritable> values,
            OutputCollector<Text, IntWritable> output, Reporter reporter)
            throws IOException {
        // Find the maximum temperature among all values for this year
        int maxValue = Integer.MIN_VALUE;
        while (values.hasNext()) {
            maxValue = Math.max(maxValue, values.next().get());
        }
        output.collect(key, new IntWritable(maxValue));
    }
}
The reduce function likewise has four formal type parameters that specify its input and output types. The input types of the reduce function must match the output types of the map function: Text and IntWritable in this case.
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class MaxTemperature {

    public static void main(String[] args) throws IOException {
        if (args.length != 2) {
            System.err.println("Usage: MaxTemperature <input path> <output path>");
            System.exit(-1);
        }

        // Passing a class lets Hadoop locate the JAR that contains it
        JobConf conf = new JobConf(MaxTemperature.class);
        conf.setJobName("Max temperature");

        FileInputFormat.addInputPath(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        conf.setMapperClass(MaxTemperatureMapper.class);
        conf.setReducerClass(MaxTemperatureReducer.class);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        JobClient.runJob(conf);
    }
}
To run this job on a Hadoop cluster, the code must be packaged into a JAR file (which Hadoop distributes around the cluster). Rather than explicitly specifying the name of the JAR file, we pass a class in the JobConf constructor, and Hadoop locates the relevant JAR file by looking for the one containing that class.
Having constructed the JobConf object, we specify the input and output paths. The input path is defined by calling the static addInputPath() method on FileInputFormat.
The output path is specified by calling the static setOutputPath() method on FileOutputFormat; it names the directory that the output files from the reduce function are written to. This directory must not exist before the job is run.
Next, the map and reduce types are specified via setMapperClass() and setReducerClass().
setOutputKeyClass() and setOutputValueClass() control the output types for the map and reduce functions, which are the same in this example.
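If the map output types ever differed from the job's final (reduce) output types, they would have to be declared separately on the JobConf. A brief sketch under that assumption (not needed for this example, where both pairs are Text and IntWritable):

// Needed only when the map output types differ from the job's (reduce) output types;
// here they happen to be the same, so these calls would be redundant.
conf.setMapOutputKeyClass(Text.class);
conf.setMapOutputValueClass(IntWritable.class);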
2.
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class NewMaxTemperature {

    static class NewMaxTemperatureMapper
            extends Mapper<LongWritable, Text, Text, IntWritable> {

        private static final int MISSING = 9999;

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String year = line.substring(15, 19);
            int airTemperature;
            if (line.charAt(87) == '+') {
                airTemperature = Integer.parseInt(line.substring(88, 92));
            } else {
                airTemperature = Integer.parseInt(line.substring(87, 92));
            }
            String quality = line.substring(92, 93);
            if (airTemperature != MISSING && quality.matches("[01459]")) {
                context.write(new Text(year), new IntWritable(airTemperature));
            }
        }
    }

    static class NewMaxTemperatureReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {

        // The method must be named reduce (not reducer) so it overrides Reducer.reduce
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int maxValue = Integer.MIN_VALUE;
            for (IntWritable value : values) {
                maxValue = Math.max(maxValue, value.get());
            }
            context.write(key, new IntWritable(maxValue));
        }
    }

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: NewMaxTemperature <input path> <output path>");
            System.exit(-1);
        }

        Job job = new Job();
        job.setJarByClass(NewMaxTemperature.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(NewMaxTemperatureMapper.class);
        job.setReducerClass(NewMaxTemperatureReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
3.
Two kinds of nodes control the job execution process: one jobtracker and a number of tasktrackers. The jobtracker coordinates all the jobs running on the system by scheduling tasks to run on tasktrackers. Tasktrackers run tasks and send progress reports back to the jobtracker, which keeps a record of the overall progress of each job. If a task fails, the jobtracker can reschedule it on a different tasktracker node.
Hadoop divides the input to a MapReduce job into fixed-size pieces called input splits, or simply splits. Hadoop creates one map task per split, and that task runs the user-defined map function over every record in the split.
Having many splits means the time taken to process each split is small compared with the time to process the whole input, which allows the work to be balanced better when the splits are processed in parallel.
Hadoop gets the best performance by running the map task on the node where the input data (the data in HDFS) resides; this is known as the data locality optimization.
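For FileInputFormat-based jobs the split size normally works out to the HDFS block size, but it can be constrained per job. A hedged sketch using the new-API FileInputFormat helpers (the 16 MB and 64 MB figures are illustrative values, not from the original text):

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

// ... inside a driver, after the Job has been created:
// cap each split at 64 MB (illustrative value)
FileInputFormat.setMaxInputSplitSize(job, 64L * 1024 * 1024);
// force each split to be at least 16 MB (illustrative value)
FileInputFormat.setMinInputSplitSize(job, 16L * 1024 * 1024);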