Environment: Ubuntu, hadoop-2.6.0, JDK 1.6
About the demo: it is adapted from the book Hadoop: The Definitive Guide.
1. The input is a txt file of temperature records, one reading per month per year (the data is fabricated and covers 1990 - 1991), as shown below:
In the figure, a + sign marks a temperature above zero and a - sign marks one below zero.
The goal of the demo is to have Hadoop compute the highest temperature of each year; the expected result looks like this:
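The exact record layout is only visible in the figure, but judging from the substring offsets used in NewMaxTemperMapper below, each record presumably carries the year in columns 1-4 and a signed four-character temperature starting at column 6. The two lines below are fabricated examples of that assumed layout, not the original data:

019900+045   (year 1990, temperature 45)
019901-012   (year 1990, temperature -12)

The job's output then follows the default TextOutputFormat layout: one line per year, with the year and its maximum temperature separated by a tab.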
Now we can build the Hadoop application. The core pieces are three classes: a Mapper, a Reducer, and a combiner (the combiner is not a distinct type; it is implemented as another Reducer subclass and registered as the job's combine function). The Mapper extracts the year and the temperature from each input line and emits them as a (year, temperature) pair, which becomes the combiner's input. The combiner computes the maximum within its own block of map output; this matters because in a distributed computation one year's records may be spread across different blocks, so pre-aggregating locally keeps the shuffle cheap. The (year, local max) pairs from the combine function then go to the Reducer, which writes the final result to HDFS. (The Mapper's input arrives as a stream: the input file is read line by line, and each line is handed to one call of the map function.)
The structure is as follows:
NewMaxTemperMapper >> a Mapper subclass;
NewMaxTemperReducer >> a Reducer subclass;
NewMaxTempperCombiner >> a Reducer subclass, used as the combine function;
NewDomain >> the driver class that configures and submits the job (a NewDomainWithCombiner variant is sketched further below).
NewMaxTemperMapper.java
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class NewMaxTemperMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private static final int MISSING = 9999;

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        System.out.println("start to work in map method ...");
        String line = value.toString();
        // the year sits in columns 1-4 of each record
        String year = line.substring(1, 5);
        int airTempper;
        if (line.charAt(6) == '+') {
            // positive reading: skip the '+' sign before parsing
            airTempper = Integer.parseInt(line.substring(7, 10));
        } else {
            // negative reading: keep the '-' so parseInt sees it
            airTempper = Integer.parseInt(line.substring(6, 10));
        }
        if (airTempper != MISSING) {
            // emit (year, temperature); 9999 marks a missing reading
            context.write(new Text(year), new IntWritable(airTempper));
        }
    }
}
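To sanity-check the substring offsets without starting a cluster, the parsing logic can be exercised on its own. This is a standalone sketch; the class and the sample record are fabricated to match the assumed layout:

ParseCheck.java (hypothetical, not part of the demo)
public class ParseCheck {
    public static void main(String[] args) {
        // fabricated record: year in columns 1-4, sign (or first digit) in column 6
        String line = "019900-045";
        String year = line.substring(1, 5);                // "1990"
        int temp = (line.charAt(6) == '+')
                ? Integer.parseInt(line.substring(7, 10))  // positive: skip the '+'
                : Integer.parseInt(line.substring(6, 10)); // negative: keep the '-'
        System.out.println(year + " -> " + temp);          // prints: 1990 -> -45
    }
}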
NewMaxTemperReducer.java
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class NewMaxTemperReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        System.out.println("start to work in reducer method ...");
        // all temperatures of one year arrive together; keep the largest
        int maxValue = Integer.MIN_VALUE;
        for (IntWritable intWrit : values) {
            maxValue = Math.max(maxValue, intWrit.get());
        }
        context.write(key, new IntWritable(maxValue));
    }
}
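If MRUnit is on the classpath (an extra test dependency, not part of the original demo), the reducer can also be verified without a cluster. This sketch feeds one year with two readings and expects the larger one back:

NewMaxTemperReducerTest.java (hypothetical)
import java.util.Arrays;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Test;

public class NewMaxTemperReducerTest {

    @Test
    public void returnsMaximumIntegerInValues() throws Exception {
        new ReduceDriver<Text, IntWritable, Text, IntWritable>()
                .withReducer(new NewMaxTemperReducer())
                .withInput(new Text("1990"),
                        Arrays.asList(new IntWritable(10), new IntWritable(45)))
                .withOutput(new Text("1990"), new IntWritable(45))
                .runTest();
    }
}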
NewMaxTempperCombiner.java
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class NewMaxTempperCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        System.out.println("NewMaxTempperCombiner - reduce() - do combiner...");
        // pre-aggregate on the map side: only each block's local maximum
        // travels across the network to the reducer
        int maxValue = Integer.MIN_VALUE;
        for (IntWritable intWrit : values) {
            maxValue = Math.max(maxValue, intWrit.get());
        }
        context.write(key, new IntWritable(maxValue));
    }
}
NewDomain.java
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class NewDomain {

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        if (args == null || args.length != 2) {
            System.err.println("Usage: NewDomain <input path> <output path>");
            System.exit(-1);
        }
        // Job.getInstance() replaces the Job constructor, which is deprecated in Hadoop 2.x
        Job job = Job.getInstance();
        job.setJarByClass(NewDomain.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(NewMaxTemperMapper.class);
        job.setReducerClass(NewMaxTemperReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
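Note that NewDomain never registers the combiner, so NewMaxTempperCombiner is not actually used by this driver. The NewDomainWithCombiner class mentioned in the structure list is not shown in the original; presumably it differs from NewDomain only by one setCombinerClass call, as in this sketch:

NewDomainWithCombiner.java (a sketch, assuming it mirrors NewDomain)
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class NewDomainWithCombiner {

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        if (args == null || args.length != 2) {
            System.err.println("Usage: NewDomainWithCombiner <input path> <output path>");
            System.exit(-1);
        }
        Job job = Job.getInstance();
        job.setJarByClass(NewDomainWithCombiner.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(NewMaxTemperMapper.class);
        // run the combiner on each mapper's local output; taking a max is safe
        // to pre-aggregate because max is commutative and associative
        job.setCombinerClass(NewMaxTempperCombiner.class);
        job.setReducerClass(NewMaxTemperReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}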
OK. Export the project as a runnable jar.
Put the jar on a standalone or pseudo-distributed installation and run it with the hadoop script under hadoop/bin.
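For example (the jar name and paths below are made up; substitute your own):

bin/hadoop jar max-temper.jar NewDomain /input/temperature.txt /output/max-temper

The output directory must not exist before the run, or the job will refuse to start; afterwards the result sits in a part-r-00000 file inside it, one tab-separated year/max pair per line.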