MapReduce Feature Implementation 7 --- Composite Example (Chaining Multiple Jobs to Compute an Average)

MapReduce Feature Implementation Series
      MapReduce Feature Implementation 1: Converting Data Between HBase and HDFS
      MapReduce Feature Implementation 2: Sorting
      MapReduce Feature Implementation 3: Top N
      MapReduce Feature Implementation 4: Composite Example (Read Data from HBase, Aggregate, and Output the Top 3 in Descending Order to HDFS)
      MapReduce Feature Implementation 5: Deduplication (Distinct) and Counting (Count)
      MapReduce Feature Implementation 6: Maximum (Max), Sum, and Average (Avg)
      MapReduce Feature Implementation 7: Composite Example (Chaining Multiple Jobs to Compute an Average)
      MapReduce Feature Implementation 8: Partitioning (Partition)
      MapReduce Feature Implementation 9: PV and UV
      MapReduce Feature Implementation 10: Inverted Index
      MapReduce Feature Implementation 11: Join
 

  In complex MapReduce processing it is often necessary to break the work down into several simple Jobs, where the output of the first Job becomes the input of the second, so the Jobs depend on one another. Take computing an average as an example; it decomposes into three steps (see the sketch after this list):

  1. Compute the Sum
  2. Compute the Count
  3. Compute the average from the two
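Sketched as a data flow, Job1 and Job2 both read the raw input independently, and Job3 reads the outputs of both (the paths are the ones used in the code below):

  /input/ceshi.txt -> Job1 (Sum)   -> /output/sum/
  /input/ceshi.txt -> Job2 (Count) -> /output/count/
  /output/sum/ + /output/count/ -> Job3 (Avg) -> /output/avg/
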
[hadoop@h71 q1]$ vi ceshi.txt
2
8
8
3
2
3
5
3
0
2
7
[hadoop@h71 q1]$ hadoop fs -put ceshi.txt /input

  Each step is treated as one Job. Job3 must wait for Job1 and Job2 to finish and takes their outputs as its input. The code below shows how to chain the three Jobs together:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 
import java.io.IOException;
 
public class Avg2 {
 
    public static boolean deleteFile(Configuration conf, String remoteFilePath, boolean recursive) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        boolean result = fs.delete(new Path(remoteFilePath), recursive);
        fs.close();
        return result;
    }
 
    public static boolean deleteFile(Configuration conf, String remoteFilePath) throws IOException {
        return deleteFile(conf, remoteFilePath, true);
    }
	
    private static final Text TEXT_SUM = new Text("SUM");
    private static final Text TEXT_COUNT = new Text("COUNT");
    private static final Text TEXT_AVG = new Text("AVG");
 
    //Job 1: compute the Sum (each mapper accumulates a partial sum and emits it once in cleanup())
    public static class SumMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        public long sum = 0;
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            sum += Long.parseLong(value.toString());
        }
        protected void cleanup(Context context) throws IOException, InterruptedException {
            context.write(TEXT_SUM, new LongWritable(sum));
        }
    }
 
    public static class SumReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        public long sum = 0;
        public void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            for (LongWritable v : values) {
                sum += v.get();
            }
            context.write(TEXT_SUM, new LongWritable(sum));
        }
    }
 
    //Job 2: compute the Count (each mapper counts its records and emits the count once in cleanup())
    public static class CountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        public long count = 0;
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            count += 1;
        }
        protected void cleanup(Context context) throws IOException, InterruptedException {
            context.write(TEXT_COUNT, new LongWritable(count));
        }
    }
 
    public static class CountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        public long count = 0;
        public void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            for (LongWritable v : values) {
                count += v.get();
            }
            context.write(TEXT_COUNT, new LongWritable(count));
        }
    }
 
    //Job 3: compute the Avg (read the SUM and COUNT lines produced by Job 1 and Job 2, emit (sum, count) in cleanup())
    public static class AvgMapper extends Mapper<LongWritable, Text, LongWritable, LongWritable> {
        public long count = 0;
        public long sum = 0;
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] v = value.toString().split("\t");
            if (v[0].equals("COUNT")) {
                count = Long.parseLong(v[1]);
            } else if (v[0].equals("SUM")) {
                sum = Long.parseLong(v[1]);
            }
        }
        protected void cleanup(Context context) throws IOException, InterruptedException {
            context.write(new LongWritable(sum), new LongWritable(count));
        }
    }
 
 
    public static class AvgReducer extends Reducer<LongWritable, LongWritable, Text, DoubleWritable> {
        public long sum = 0;
        public long count = 0;
        public void reduce(LongWritable key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            sum += key.get();
            for (LongWritable v : values) {
                count += v.get();
            }
        }
        protected void cleanup(Context context) throws IOException, InterruptedException {
            context.write(TEXT_AVG, new DoubleWritable((double) sum / count));
        }
    }
 
 
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
 
        String inputPath = "/input/ceshi.txt";
        String sumOutputPath = "/output/sum/";
        String countOutputPath = "/output/count/";
        String avgOutputPath = "/output/avg/";
 
        //Delete the output directories first (optional; avoids "output directory already exists" errors when re-running)
        Avg2.deleteFile(conf, sumOutputPath);
        Avg2.deleteFile(conf, countOutputPath);
        Avg2.deleteFile(conf, avgOutputPath);
 
        Job job1 = Job.getInstance(conf, "Sum");
        job1.setJarByClass(Avg2.class);
        job1.setMapperClass(SumMapper.class);
        job1.setCombinerClass(SumReducer.class);
        job1.setReducerClass(SumReducer.class);
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(LongWritable.class);
        FileInputFormat.addInputPath(job1, new Path(inputPath));
        FileOutputFormat.setOutputPath(job1, new Path(sumOutputPath));
 
 
        Job job2 = Job.getInstance(conf, "Count");
        job2.setJarByClass(Avg2.class);
        job2.setMapperClass(CountMapper.class);
        job2.setCombinerClass(CountReducer.class);
        job2.setReducerClass(CountReducer.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(LongWritable.class);
        FileInputFormat.addInputPath(job2, new Path(inputPath));
        FileOutputFormat.setOutputPath(job2, new Path(countOutputPath));
 
 
        Job job3 = Job.getInstance(conf, "Average");
        job3.setJarByClass(Avg2.class);
        job3.setMapperClass(AvgMapper.class);
        job3.setReducerClass(AvgReducer.class);
        job3.setMapOutputKeyClass(LongWritable.class);
        job3.setMapOutputValueClass(LongWritable.class);
        job3.setOutputKeyClass(Text.class);
        job3.setOutputValueClass(DoubleWritable.class);
 
        //Use the outputs of job1 and job2 as the input of job3
        FileInputFormat.addInputPath(job3, new Path(sumOutputPath));
        FileInputFormat.addInputPath(job3, new Path(countOutputPath));
        FileOutputFormat.setOutputPath(job3, new Path(avgOutputPath));
 
        //Submit job1 and job2, wait for them to finish, then run job3
        if (job1.waitForCompletion(true) && job2.waitForCompletion(true)) {
            System.exit(job3.waitForCompletion(true) ? 0 : 1);
        }
    }
}
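
In the listing above the dependency is expressed only by the order of the waitForCompletion() calls, so Job1 and Job2 actually run one after the other. Hadoop also ships the org.apache.hadoop.mapreduce.lib.jobcontrol package (JobControl/ControlledJob), which lets the dependency be declared explicitly so the two independent jobs can be submitted together. A minimal sketch, assuming job1, job2 and job3 are configured exactly as in Avg2.main() above (the class name Avg2JobControl and the method runChain are made up for illustration):

import java.util.Arrays;
 
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
 
public class Avg2JobControl {
 
    //Run job1 and job2 (independent of each other) and job3 (depends on both) via JobControl
    public static boolean runChain(Job job1, Job job2, Job job3) throws Exception {
        ControlledJob cjob1 = new ControlledJob(job1, null);
        ControlledJob cjob2 = new ControlledJob(job2, null);
        //job3 is only submitted once both of its dependencies have succeeded
        ControlledJob cjob3 = new ControlledJob(job3, Arrays.asList(cjob1, cjob2));
 
        JobControl control = new JobControl("avg-chain");
        control.addJob(cjob1);
        control.addJob(cjob2);
        control.addJob(cjob3);
 
        //JobControl is a Runnable: run it in a background thread and poll until all jobs finish
        Thread t = new Thread(control);
        t.setDaemon(true);
        t.start();
        while (!control.allFinished()) {
            Thread.sleep(1000);
        }
        control.stop();
        return control.getFailedJobList().isEmpty();
    }
}

With this variant, main() would end with System.exit(Avg2JobControl.runChain(job1, job2, job3) ? 0 : 1); instead of the waitForCompletion() block.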

Compile and run the code on Linux:

[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/javac Avg2.java 
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/jar cvf xx.jar Avg2*class
[hadoop@h71 q1]$ hadoop jar xx.jar Avg2
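
(If javac cannot resolve the Hadoop classes on your installation, the Hadoop jars need to be on the compile classpath, e.g. /usr/jdk1.7.0_25/bin/javac -cp $(hadoop classpath) Avg2.java.)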
[hadoop@h71 q1]$ hadoop fs -lsr /output
drwxr-xr-x   - hadoop supergroup          0 2017-03-18 22:45 /output/avg
-rw-r--r--   2 hadoop supergroup          0 2017-03-18 22:45 /output/avg/_SUCCESS
-rw-r--r--   2 hadoop supergroup         22 2017-03-18 22:45 /output/avg/part-r-00000
drwxr-xr-x   - hadoop supergroup          0 2017-03-18 22:44 /output/count
-rw-r--r--   2 hadoop supergroup          0 2017-03-18 22:44 /output/count/_SUCCESS
-rw-r--r--   2 hadoop supergroup          9 2017-03-18 22:44 /output/count/part-r-00000
drwxr-xr-x   - hadoop supergroup          0 2017-03-18 22:44 /output/sum
-rw-r--r--   2 hadoop supergroup          0 2017-03-18 22:44 /output/sum/_SUCCESS
-rw-r--r--   2 hadoop supergroup          7 2017-03-18 22:44 /output/sum/part-r-00000

[hadoop@h71 q1]$ hadoop fs -cat /output/sum/part-r-00000
SUM     43
[hadoop@h71 q1]$ hadoop fs -cat /output/count/part-r-00000
COUNT   11
[hadoop@h71 q1]$ hadoop fs -cat /output/avg/part-r-00000
AVG     3.909090909090909
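
As a quick sanity check: the eleven input values sum to 2+8+8+3+2+3+5+3+0+2+7 = 43, and 43 / 11 = 3.9090..., which matches the SUM, COUNT and AVG outputs above.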

Reference: http://www.cnblogs.com/yjmyzz/p/4540469.html
