MapReduce can process not only text data but also binary data.
A Hadoop sequence file is a binary sequence of key-value pairs. To use sequence files as MapReduce input, set SequenceFileInputFormat as the job's input format. The key and value types are fixed by the sequence file itself, so you only need to make sure the Mapper's input types match them: if the file stores IntWritable-IntWritable pairs, the Mapper's input key and value types must be IntWritable as well.
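The job below reads F:/java1/seq/1.seq; a file like it can be produced with SequenceFile.Writer. A minimal sketch, assuming a local filesystem; the class name and the (year, temperature) sample values are illustrative, not part of the original code:

package hadoop.mr.sequenfileinputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;

/**
 * SeqFileWriter: writes an IntWritable-IntWritable sequence file
 * that can serve as input for the job below (hypothetical helper).
 */
public class SeqFileWriter {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("F:/java1/seq/1.seq");
        SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(path),
                SequenceFile.Writer.keyClass(IntWritable.class),
                SequenceFile.Writer.valueClass(IntWritable.class));
        try {
            // key = year, value = temperature reading (made-up sample data)
            writer.append(new IntWritable(1970), new IntWritable(30));
            writer.append(new IntWritable(1970), new IntWritable(35));
            writer.append(new IntWritable(1971), new IntWritable(28));
        } finally {
            writer.close();
        }
    }
}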
1. MaxTempMapper
package hadoop.mr.sequenfileinputformat;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * MaxTempMapper: forwards each (year, temperature) pair unchanged.
 */
public class MaxTempMapper extends Mapper<IntWritable, IntWritable, IntWritable, IntWritable> {
    @Override
    protected void map(IntWritable key, IntWritable value, Context context)
            throws IOException, InterruptedException {
        // The sequence file already supplies IntWritable keys and values,
        // so the mapper simply passes them through to the shuffle.
        context.write(key, value);
    }
}
2. MaxTempReducer
package hadoop.mr.sequenfileinputformat;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * MaxTempReducer: emits the maximum temperature observed for each key.
 */
public class MaxTempReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int max = Integer.MIN_VALUE;
        for (IntWritable iw : values) {
            max = Math.max(max, iw.get());
        }
        context.write(key, new IntWritable(max));
    }
}
3. App
package hadoop.mr.sequenfileinputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * App: driver that wires the job together and runs it locally.
 */
public class App {
    public static void main(String[] args) throws Exception {
        args = new String[]{
                "F:/java1/seq/1.seq", "f:/java1/seq/out"
        };
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Delete the output directory if it already exists,
        // otherwise the job fails on startup.
        if (fs.exists(new Path(args[1]))) {
            fs.delete(new Path(args[1]), true);
        }
        Job job = Job.getInstance(conf);
        job.setJobName("MaxTemp");
        job.setJarByClass(App.class);
        job.setMapperClass(MaxTempMapper.class);
        job.setReducerClass(MaxTempReducer.class);
        // Add the input path
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // Set the output path
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Input format: read binary key-value pairs from a sequence file
        job.setInputFormatClass(SequenceFileInputFormat.class);
        // Declare the map and reduce output key/value types
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(2);
        // Manually set the number of map tasks (requires importing MRJobConfig)
        // job.getConfiguration().set(MRJobConfig.NUM_MAPS, "3");
        job.waitForCompletion(true);
    }
}
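Because setNumReduceTasks(2) is set, the finished job leaves two output files under f:/java1/seq/out, part-r-00000 and part-r-00001; the default HashPartitioner decides which keys land in which file.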
Source-code trace:
1. SequenceFileInputFormat<K, V> extends FileInputFormat<K, V>.
2. The framework calls FileInputFormat's getSplits method,
3. which returns the split information.
4. It then calls SequenceFileInputFormat's createRecordReader method to create a SequenceFileRecordReader.
5. The Mapper's run method is invoked; its while (context.nextKeyValue()) loop drives one map call per record (see the sketch after this list).
6. Each nextKeyValue call is delegated to SequenceFileRecordReader's nextKeyValue method.
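Step 5 can be seen directly in Hadoop's Mapper class; its run method (simplified from the Hadoop 2.x source) looks like this:

// Simplified from org.apache.hadoop.mapreduce.Mapper (Hadoop 2.x).
public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
        // Each nextKeyValue() call is delegated to the record reader,
        // here a SequenceFileRecordReader (step 6).
        while (context.nextKeyValue()) {
            map(context.getCurrentKey(), context.getCurrentValue(), context);
        }
    } finally {
        cleanup(context);
    }
}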