Preface:
This is just a gripe about Hadoop: The Definitive Guide and can be skipped. The issue genuinely drove me up the wall. The secondary sort example in section 9.2.4 of the Chinese 4th edition finds the maximum temperature for each year, with years sorted ascending and temperatures descending. For the composite key, the book says a custom partitioner only guarantees that each reducer receives all the records for a given year, and that within a partition the reducer still groups records by key, so a custom grouping comparator is needed. Yet the result figures before and after adding the custom grouping are exactly identical!? What is that supposed to demonstrate? I was also puzzled why, after partitioning and grouping, simply calling context.write(key, NullWritable.get()) in the reduce method outputs the maximum temperature.
MapReduce implementation:
1. Define a composite key combining the natural key and the natural value; the class must implement WritableComparable.
import java.io.*;
import org.apache.hadoop.io.*;

public class IntPair implements WritableComparable<IntPair> {

  private int first;
  private int second;

  public IntPair() {
  }

  public IntPair(int first, int second) {
    set(first, second);
  }

  public void set(int first, int second) {
    this.first = first;
    this.second = second;
  }

  public int getFirst() {
    return first;
  }

  public int getSecond() {
    return second;
  }

  @Override
  public void write(DataOutput out) throws IOException {
    // Writable contract: serialize the fields in a fixed order
    out.writeInt(first);
    out.writeInt(second);
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    // deserialize in the same order the fields were written
    first = in.readInt();
    second = in.readInt();
  }

  @Override
  public int hashCode() {
    return first * 163 + second;
  }

  @Override
  public boolean equals(Object o) {
    if (o instanceof IntPair) {
      IntPair ip = (IntPair) o;
      return first == ip.first && second == ip.second;
    }
    return false;
  }

  @Override
  public String toString() {
    return first + "\t" + second;
  }

  @Override
  public int compareTo(IntPair ip) {
    // default ordering: both fields ascending (overridden by KeyComparator below)
    int cmp = compare(first, ip.first);
    if (cmp != 0) {
      return cmp;
    }
    return compare(second, ip.second);
  }

  /**
   * Convenience method for comparing two ints.
   */
  public static int compare(int a, int b) {
    return (a < b ? -1 : (a == b ? 0 : 1));
  }
}
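Before moving on, a quick sanity check of the Writable contract — my own snippet, not from the book — showing that write() and readFields() round-trip an IntPair:

import java.io.*;

// Illustrative only: round-trip an IntPair through its Writable serialization.
public class IntPairRoundTrip {
  public static void main(String[] args) throws IOException {
    IntPair original = new IntPair(1900, 35);

    // serialize to a byte buffer, exactly as the framework would to disk/network
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    original.write(new DataOutputStream(bytes));

    // deserialize into a fresh instance created via the no-arg constructor
    IntPair copy = new IntPair();
    copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

    System.out.println(copy + " equals original: " + original.equals(copy)); // true
  }
}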
2. The job driver, wiring up the custom partitioner, key comparator, and grouping comparator:
import java.io.IOException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MaxTemperatureUsingSecondarySort
    extends Configured implements Tool {

  static class MaxTemperatureMapper
      extends Mapper<LongWritable, Text, IntPair, NullWritable> {

    // NcdcRecordParser is a helper class from the book's example code
    private NcdcRecordParser parser = new NcdcRecordParser();

    @Override
    protected void map(LongWritable key, Text value,
        Context context) throws IOException, InterruptedException {
      parser.parse(value);
      if (parser.isValidTemperature()) {
        // the whole (year, temperature) pair is the key; the value is empty
        context.write(new IntPair(parser.getYearInt(),
            parser.getAirTemperature()), NullWritable.get());
      }
    }
  }

  static class MaxTemperatureReducer
      extends Reducer<IntPair, NullWritable, IntPair, NullWritable> {

    @Override
    protected void reduce(IntPair key, Iterable<NullWritable> values,
        Context context) throws IOException, InterruptedException {
      // the first key of each group is the maximum temperature of that year
      context.write(key, NullWritable.get());
    }
  }

  public static class FirstPartitioner
      extends Partitioner<IntPair, NullWritable> {

    @Override
    public int getPartition(IntPair key, NullWritable value, int numPartitions) {
      // partition on the year only; multiply by 127 to perform some mixing
      return Math.abs(key.getFirst() * 127) % numPartitions;
    }
  }

  public static class KeyComparator extends WritableComparator {
    protected KeyComparator() {
      // true tells WritableComparator to instantiate IntPairs to
      // deserialize the raw bytes into before comparing (see point 8 below)
      super(IntPair.class, true);
    }
    @Override
    public int compare(WritableComparable w1, WritableComparable w2) {
      IntPair ip1 = (IntPair) w1;
      IntPair ip2 = (IntPair) w2;
      int cmp = IntPair.compare(ip1.getFirst(), ip2.getFirst());
      if (cmp != 0) {
        return cmp;
      }
      return -IntPair.compare(ip1.getSecond(), ip2.getSecond()); // reverse
    }
  }

  public static class GroupComparator extends WritableComparator {
    protected GroupComparator() {
      super(IntPair.class, true);
    }
    @Override
    public int compare(WritableComparable w1, WritableComparable w2) {
      // group on the year (first field) only
      IntPair ip1 = (IntPair) w1;
      IntPair ip2 = (IntPair) w2;
      return IntPair.compare(ip1.getFirst(), ip2.getFirst());
    }
  }

  @Override
  public int run(String[] args) throws Exception {
    // JobBuilder is a helper class from the book's example code
    Job job = JobBuilder.parseInputAndOutput(this, getConf(), args);
    if (job == null) {
      return -1;
    }

    job.setMapperClass(MaxTemperatureMapper.class);
    job.setPartitionerClass(FirstPartitioner.class);
    job.setSortComparatorClass(KeyComparator.class);
    job.setGroupingComparatorClass(GroupComparator.class);
    job.setReducerClass(MaxTemperatureReducer.class);
    job.setOutputKeyClass(IntPair.class);
    job.setOutputValueClass(NullWritable.class);

    return job.waitForCompletion(true) ? 0 : 1;
  }

  public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new MaxTemperatureUsingSecondarySort(), args);
    System.exit(exitCode);
  }
}
Flow:
1. TextInputFormat uses a RecordReader to read the input text line by line, with the byte offset of each line as the key and the line's content as the value. The map function turns each <LongWritable, Text> pair into an <IntPair, NullWritable> pair, producing a list of <IntPair, NullWritable> records.
2. At the end of the map side, this list is first partitioned using the class set by job.setPartitionerClass, and each partition maps to one reducer. Within each partition, records are then sorted by the KeyComparator set by job.setSortComparatorClass (which overrides the ordering defined in IntPair.compareTo), yielding the first column in ascending order and the second column in descending order.
3. After a reducer has received all of the map outputs assigned to it, it merge-sorts them, again using the KeyComparator set by job.setSortComparatorClass, so IntPairs with the same first field end up adjacent. It then builds the value iterator for each key, and this is where the grouping comparator set by job.setGroupingComparatorClass comes in.
4. In the reducer's context (actually the ReduceContextImpl class), the GroupComparator checks whether the current intermediate key (an IntPair object in this example) is the same as the next one. While the comparator returns 0 (0 means same, non-zero means different), the reducer's run() keeps invoking it inside the context's nextKey()/nextKeyValue() methods; each call to nextKeyValue() overwrites the context's current key/value pair. When the comparator returns non-zero, run() calls the reduce method with the context's current key/value pair. So in this example, where the intermediate output is sorted by year ascending and temperature descending, the context keeps calling the GroupComparator and updating the current key/value until a non-zero result ends the group. (As long as the comparator says two keys are equal, they belong to the same group; see the GroupingDemo sketch after this list.)
5. The reduce method is then entered; its input is each group's key together with an iterator over that group's values.
6. Partitioning decides, based on the key, which reducer each map output record is sent to; the key comparator determines how keys are sorted; the grouping comparator determines which records form one group and therefore enter a single reduce() call together.
7. The key comparator has another important role: if the job does not set a grouping comparator, the key comparator (if one is set) also acts as the grouping comparator, i.e. it is used in both the map phase and the reduce phase.
8. When implementing a comparator, the constructor must call super(XXX.class, true), where XXX is the class being compared and true tells WritableComparator to create instances of that class internally. During comparison, the comparator deserializes the raw bytes of the inputs into those internal XXX objects and then passes them to its compare() method. Without this call, you get a NullPointerException at runtime.
9. Back to the original question: why does context.write(key, NullWritable.get()) output the maximum temperature? It is not that grouping collapses each group into a single record. What reaches the reduce method is a group whose keys are [1900,35], [1900,34], [1900,33], and so on, together with an iterator of null values. Because [1900,35] sorts first, and context.write(key, NullWritable.get()) is called only once per group, the first (i.e. maximum) key is what gets written.
10. So, to get the top K of each group instead, do the following (the original leaves k undefined; a constant is used here as an example):
static class MaxTemperatureReducer
    extends Reducer<IntPair, NullWritable, IntPair, NullWritable> {

  private static final int K = 3; // e.g. take the top 3 per year

  @Override
  protected void reduce(IntPair key, Iterable<NullWritable> values,
      Context context) throws IOException, InterruptedException {
    int i = 0;
    for (NullWritable value : values) {
      i++;
      if (i <= K) {
        // advancing the value iterator also updates `key` to the next
        // IntPair in the group (see step 4), so each write emits a new pair
        context.write(key, NullWritable.get());
      } else {
        break;
      }
    }
  }
}
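To make step 4 concrete, here is a minimal, self-contained simulation of how the grouping comparator carves the sorted key stream into reduce() calls. It is my own illustration, not the Hadoop source; it reuses the IntPair class from above, and its comparator compares years only, just like GroupComparator:

import java.util.*;

public class GroupingDemo {
  public static void main(String[] args) {
    // map output after the key comparator: year ascending, temperature descending
    List<IntPair> sorted = Arrays.asList(
        new IntPair(1900, 35), new IntPair(1900, 34), new IntPair(1900, 33),
        new IntPair(1901, 36), new IntPair(1901, 30));

    // grouping comparator: compare the first field (year) only
    Comparator<IntPair> group = Comparator.comparingInt(IntPair::getFirst);

    IntPair current = null;
    for (IntPair key : sorted) {
      if (current == null || group.compare(current, key) != 0) {
        // non-zero result: a new group begins, so the framework would call
        // reduce() here, with this key as the group's first (current) key
        System.out.println("reduce() sees first key: " + key);
      }
      current = key; // nextKeyValue() overwrites the context's current key
    }
  }
}

Running it prints 1900 35 and 1901 36: the first key of each group is the per-year maximum, which is exactly why the book's reducer can get away with a single context.write(key, NullWritable.get()).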
Top N in Hive
Hive provides dedicated ranking functions, so the overall implementation is very simple:
row_number() over ([partition by col1] [order by col2])
rank() over ([partition by col1] [order by col2])
dense_rank() over ([partition by col1] [order by col2])
All three group rows by col1, sort each group by col2, and assign each sorted row a number starting from 1 and increasing.
col1 and col2 can each be several columns, separated by ','.
Differences:
1) row_number: the number keeps increasing regardless of ties in col2; e.g. two rows with equal values still get distinct numbers, one first and one second.
2) rank: rows with equal col2 values share the same number, and the next distinct col2 value jumps ahead by N (N being the number of ties); e.g. two rows tie for first, the next row is third, and there is no second.
3) dense_rank: rows with equal col2 values share the same number, and the next distinct value increases by just 1; e.g. two rows tie for first and the next row is second.
row_number can additionally be combined with between ... and ... for pagination (see the sketch after the queries below).
create table yt (
  year string,
  temperature int
) row format delimited fields terminated by '\t';

-- top 3 temperatures for each year
select * from (
  select *, row_number() over (partition by year order by temperature desc) as tem
  from yt
) t where tem <= 3;

-- rank of 35 degrees in the year 1900
select tem from (
  select *, rank() over (partition by year order by temperature desc) as tem
  from yt
) t where year = '1900' and temperature = 35;
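To make the difference between the three functions concrete, and to show the pagination trick mentioned above, here is a sketch against the same yt table (the aliases rn, rk, and drk are mine):

-- side-by-side comparison of the three ranking functions per year
select year, temperature,
       row_number() over (partition by year order by temperature desc) as rn,
       rank()       over (partition by year order by temperature desc) as rk,
       dense_rank() over (partition by year order by temperature desc) as drk
from yt;

-- pagination with row_number: rows 4-6 per year ("page 2" with a page size of 3)
select * from (
  select *, row_number() over (partition by year order by temperature desc) as rn
  from yt
) t where rn between 4 and 6;

For a year with temperatures 35, 35, 34, the (rn, rk, drk) columns would read (1,1,1), (2,1,1), (3,3,2), matching the differences listed above.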
Group-wise top N in MySQL
In MySQL, the top-N problem turns into computing each row's rank among all records of its group: if N rows have a larger value in the relevant column than the current row, the current row's rank is N+1 (equal values share a rank). For example, among the type-2 views values {877, 678, 300, 99} in the data below, the row with 300 has two larger rows, so its rank is 3.
Data preparation:
DROP TABLE IF EXISTS `info`;
CREATE TABLE `info` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
  `views` int(255) DEFAULT NULL,
  `info_type_id` int(11) DEFAULT NULL,
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
-- ----------------------------
-- Records of info
-- ----------------------------
INSERT INTO `info` VALUES (1, '罗志祥黑眼圈', 10, 4);
INSERT INTO `info` VALUES (2, '新冠肺炎', 22, 4);
INSERT INTO `info` VALUES (3, '流浪地球电影大火', 188, 1);
INSERT INTO `info` VALUES (4, '葛优瘫', 99, 2);
INSERT INTO `info` VALUES (5, '周杰伦出新歌了', 877, 2);
INSERT INTO `info` VALUES (6, '蔡依林西安演唱会', 86, 1);
INSERT INTO `info` VALUES (7, '中纪委调盐', 67, 3);
INSERT INTO `info` VALUES (8, '人民大会堂', 109, 3);
INSERT INTO `info` VALUES (9, '重庆称为网红城市', 202, 1);
INSERT INTO `info` VALUES (10, '胡歌结婚了', 300, 2);
INSERT INTO `info` VALUES (11, 'ipone15马上上市', 678, 2);
INSERT INTO `info` VALUES (12, '中国探月成功', 54, 4);
INSERT INTO `info` VALUES (13, 'Trump演说家', 67, 4);
DROP TABLE IF EXISTS `info_type`;
CREATE TABLE `info_type` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `name` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
-- ----------------------------
-- Records of info_type
-- ----------------------------
INSERT INTO `info_type` VALUES (1, '娱乐');
INSERT INTO `info_type` VALUES (2, '八卦');
INSERT INTO `info_type` VALUES (3, '政治');
Requirement: produce a list of hot items, keeping only the top 3 per category.
SELECT t.* FROM (
  SELECT
    t1.*,
    (SELECT count(*) + 1 FROM info t2
     WHERE t2.info_type_id = t1.info_type_id AND t2.views > t1.views) AS top
  FROM info t1
) t WHERE top <= 3 ORDER BY t.info_type_id, top;
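For comparison, on MySQL 8.0 or later the same requirement can be expressed with a window function instead of the correlated subquery; RANK() reproduces the count(*)+1 tie handling above (the rk alias is mine):

-- requires MySQL 8.0+; RANK() gives tied rows the same rank, like count(*)+1
SELECT * FROM (
  SELECT t1.*,
         RANK() OVER (PARTITION BY info_type_id ORDER BY views DESC) AS rk
  FROM info t1
) t
WHERE rk <= 3
ORDER BY info_type_id, rk;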