hadoop权威指南第二章笔记

最新推荐文章于 2021-07-28 22:18:08 发布

walter1990

最新推荐文章于 2021-07-28 22:18:08 发布

阅读量823

点赞数

CC 4.0 BY-SA版权

本文链接：https://blog.youkuaiyun.com/suichen1/article/details/38070059

本文介绍了一个使用Hadoop MapReduce实现的温度数据分析应用案例。该案例通过自定义Mapper和Reducer处理气象数据，计算每年最高气温。文章详细展示了Java代码实现细节，包括如何过滤无效数据、解析数据字段及执行最大值计算。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class MaxTemperatureMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable>{
	private static final int MISSING = 9999;
	public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException{
		String line = value.toString();
		String year = line.substring(15,19);
		int airTemperature;
		
		if(line.charAt(87) == '+') {
			airTemperature = Integer.parseInt(line.substring(88,92));
		} else {
			airTemperature = Integer.parseInt(line.substring(87,92));
		}
		
		String quality = line.substring(92,93);
		if(airTemperature != MISSING && quality.matches("[01459]")) {
			output.collect(new Text(year), new IntWritable(airTemperature));
		}
	}
	public static void main(String[] args) {
		// TODO Auto-generated method stub

	}

}

Mapper接口是一个泛型类型，它有四个形参类型，分别指定map函数的输入键、输入值、输出键和输出值的类型。Hadoop自身提供了一套可优化网络序列化传输的基本类型，而不直接使用Java内置的类型。这些类型均可在org.apache.hadoop.io包中找到。LongWritable类型相当于Java中的Long，Text类型相当于Java中的String，IntWritable类型相当于Java中的Integer。

map()方法还提供了OutputCollector实例用于输出内容的写入。

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.io.*;
public class MaxTemperatureReducer extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable>{
	public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException{
		int maxValue = Integer.MIN_VALUE;
		while(values.hasNext()) {
			maxValue = Math.max(maxValue, values.next().get());
		}
		output.collect(key, new IntWritable(maxValue));
	}
	public static void main(String[] args) {
		// TODO Auto-generated method stub

	}

}

reduce函数也有四个形式参数类型，用于指定其输入和输出类型。reduce函数的输入类型必须与map函数的输出类型相匹配。

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.FileOutputFormat;

import com.sun.jersey.core.impl.provider.entity.XMLJAXBElementProvider.Text;

public class MaxTemperature {

	public static void main(String[] args) throws IOException{
		// TODO Auto-generated method stub
		if(args.length != 2) {
			System.out.println("Usage: MaxTemperature <input path> <output path>");
			System.exit(-1);
		}
		
		JobConf conf = new JobConf(MaxTemperature.class);
		conf.setJobName("Max temperature");
		
		FileInputFormat.addInputPath(conf, new Path(args[0]));
		FileOutputFormat.setOutputPath(conf, new Path(args[1]));
		conf.setMapperClass(MaxTemperatureMapper.class);
		conf.setReducerClass(MaxTemperatureReducer.class);
		conf.setOutputKeyClass(Text.class);
		conf.setOutputValueClass(IntWritable.class);
		
		JobClient.runJob(conf);
		
	}

}

在hadoop集群上运行这个作业时，我们需要将代码打包成一个JAR文件(Hadoop会在集群上发布这个文件)。我们无须明确指定JAR文件的名称，而只需在JobConf的构造函数中传递一个类，Hadoop将通过该类查找包含有该类的JAR文件。

构造JobConf对象后，需要指定输入和输出数据的路径。调用FileInputFormat类的静态函数addInputPath()来定义输入数据的路径。

通过调用FileOutputFormat类中的静态函数setOutputPath()来指定输出路径。该函数指定了reduce函数输出文件的写入目录。在运行该任务前目录不应该存在。

接着通过setMapperClass()和setReducerClass()指定map和reduce类型。

setOutputKeyClass()和setOutputValueClass()控制map和reduce函数的输出类型。

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class NewMaxTemperature {
	static class NewMaxTemperatureMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
		private static final int MISSING = 9999;
		public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			String line = value.toString();
			String year = line.substring(15,19);
			int airTemperature;
			
			if(line.charAt(87) == '+') {
				airTemperature = Integer.parseInt(line.substring(88,92));
			} else {
				airTemperature = Integer.parseInt(line.substring(87,92));
			}
			String quality = line.substring(92,93);
			if(airTemperature != MISSING && quality.matches("[01459]")) {
				context.write(new Text(year), new IntWritable(airTemperature));
			}
		}
	}
	
	static class NewMaxTemperatureReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
		public void reducer(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
			int maxValue = Integer.MIN_VALUE;
			
			for(IntWritable value:values) {
				maxValue = Math.max(maxValue, value.get());
			}
			
			context.write(key, new IntWritable(maxValue));
		}
	}
	public static void main(String[] args) throws Exception{
		// TODO Auto-generated method stub
		if(args.length != 2) {
			System.out.println("Usage: MaxTemperature <input path> <output path>");
			System.exit(-1);
		}
		
		Job job = new Job();
		job.setJarByClass(NewMaxTemperature.class);
		
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		job.setMapperClass(NewMaxTemperatureMapper.class);
		job.setReducerClass(NewMaxTemperatureReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

}

有两类结点控制着作业的执行过程：一个jobtracker以及一系列tasktracker。jobtracker通过调度tasktracker上运行的任务，来协调所有运行在系统上的作业。tasktracker在运行任务的同时将运行进度报告给jobtracker,jobtracker由此记录每项作业任务的整体进度情况。如果其中一个任务失败，jobtracker可以在另外一个tasktracker节点上重新调度该任务。

hadoop将MapReduce的输入数据划分成等长的小数据块，称为输入分片，简称分片。Hadoop为每个分片构建一个map任务，并由该任务来运行用户自定义的map函数从而处理分片中的每条记录。

拥有许多分片，意味着处理每个分片所需要的时间少于处理整个输入数据所花的时间。

Hadoop在存储有输入数据(HDFS中的数据)的结点上运行map任务，可以获得最佳性能，