1. Analyze the raw data
0067011990999991950051507004888888889999999N9+00001+9999999999999999999999
0067011990999991950051512004888888889999999N9+00221+9999999999999999999999
0067011990999991950051518004888888889999999N9-00111+9999999999999999999999
0067011990999991949032412004888888889999999N9+01111+9999999999999999999999
0067011990999991950032418004888888880500001N9+00001+9999999999999999999999
0067011990999991950051507004888888880500001N9+00781+9999999999999999999999
Data format notes:
1. Characters 15-18 (0-based, i.e. substring(15, 19)) hold the year.
2. Positions 45-50 hold the signed temperature: '+' means above zero and '-' means below zero. The value 9999 marks an abnormal (missing) reading and must not be used, and the quality code at position 50 may only be 0, 1, 4, 5 or 9.
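To make the field offsets concrete, here is a minimal standalone sketch (FieldProbe is a made-up helper class, not part of the job) that pulls the year, temperature and quality code out of the first sample record using the same offsets the Mapper below relies on:

package cn.edu.bjut.temperautre;

public class FieldProbe {

    public static void main(String[] args) {
        // first sample record from the raw data above
        String line = "0067011990999991950051507004888888889999999N9+00001+9999999999999999999999";

        String year = line.substring(15, 19);          // "1950"
        char sign = line.charAt(45);                   // '+' or '-'
        int temperature = ('+' == sign)
                ? Integer.parseInt(line.substring(46, 50))   // skip the leading '+'
                : Integer.parseInt(line.substring(45, 50));  // keep the '-' so the value parses as negative
        String quality = line.substring(50, 51);       // must be one of 0, 1, 4, 5, 9

        System.out.println(year + " " + temperature + " " + quality);   // prints: 1950 0 1
    }
}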
2. Write the Mapper:
package cn.edu.bjut.temperautre;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class TemperatureMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // 9999 marks a missing/invalid temperature reading
    private static final int ERROR_TEMPER = 9999;

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String content = value.toString();
        String year = content.substring(15, 19);    // extract the year
        int temperature;                            // extract the temperature
        if ('+' == content.charAt(45)) {
            temperature = Integer.parseInt(content.substring(46, 50));
        } else {
            temperature = Integer.parseInt(content.substring(45, 50));
        }
        // skip the 9999 error value and records with a bad quality code
        if (temperature != ERROR_TEMPER && content.substring(50, 51).matches("[01459]")) {
            context.write(new Text(year), new IntWritable(temperature));
        }
    }
}
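Before packaging, the Mapper can be sanity-checked with a small unit test. The sketch below assumes the Apache MRUnit and JUnit jars are available on the classpath (the test class name is made up for illustration):

package cn.edu.bjut.temperautre;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;

public class TemperatureMapperTest {

    @Test
    public void parsesValidRecord() throws Exception {
        // first sample record: year 1950, temperature +0000, quality code 1
        Text line = new Text(
                "0067011990999991950051507004888888889999999N9+00001+9999999999999999999999");

        MapDriver.newMapDriver(new TemperatureMapper())
                .withInput(new LongWritable(0), line)
                .withOutput(new Text("1950"), new IntWritable(0))
                .runTest();
    }
}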
3. Write the Reducer:
package cn.edu.bjut.temperautre;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class TemperatureReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int maxTemperature = Integer.MIN_VALUE;
        for (IntWritable intWritable : values) {
            maxTemperature = Math.max(maxTemperature, intWritable.get());
        }
        context.write(key, new IntWritable(maxTemperature));
    }
}
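Because taking a maximum is associative and commutative, this same Reducer can also be reused as a combiner to cut down the data shuffled between the map and reduce phases. This is an optional addition, not part of the original job; if you want it, add one line when configuring the Job in the driver below:

job.setCombinerClass(TemperatureReducer.class);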
4. Write the driver (main program):
package cn.edu.bjut.temperautre;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MainJob {

    public static void main(String[] args) throws Exception {
        if (2 != args.length) {
            System.out.println("Usage: MainJob <input path> <output path>");
            System.exit(-1);
        }
        Configuration conf = new Configuration();
        Job job = new Job(conf, "temperature");
        job.setJarByClass(MainJob.class);

        job.setMapperClass(TemperatureMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(TemperatureReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));

        // delete the output directory if it already exists, otherwise the job fails
        Path outPath = new Path(args[1]);
        FileSystem fileSystem = FileSystem.get(conf);
        if (fileSystem.exists(outPath)) {
            fileSystem.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // exit with 0 on success, 1 on failure
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
5. Package the compiled classes into a jar (only the corresponding source files need to be included) and upload it to the Linux server.
6. Create a file source.txt and copy the raw records to be analyzed into it, then run hadoop fs -put source.txt /user/root/data/1/source.txt to store the file in HDFS. (The directory shown here is only an example; you can choose your own.)
7. Run the job: hadoop jar one.jar /user/root/data/1 /result_one (if the jar's manifest does not declare a main class, pass the driver class explicitly, e.g. hadoop jar one.jar cn.edu.bjut.temperautre.MainJob /user/root/data/1 /result_one).
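For the six sample records above, the job writes its result under /result_one (with a single reduce task the file is typically named part-r-00000), one year and its maximum temperature per line, separated by a tab. Working the sample values out by hand (in the NCDC format the readings are tenths of a degree Celsius), the expected content is roughly:

1949	111
1950	78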