mysql源码解读-执行计划之test_quick_select_mysql quick select-优快云博客

本文链接：https://blog.youkuaiyun.com/changetemper/article/details/144988987

test_quick_select

用于评估是否可以使用索引进行范围扫描（range scan）或索引合并扫描（index merge scan），并选择成本最低的访问方法。

还会考虑全表扫描（table scan 、index scan）的可能性，并在必要时优先选择范围扫描，并不是一定选择范围扫描。

int test_quick_select(THD *thd, 
                      Key_map keys_to_use,  //是当前的query可用的索引的位图信息
                      table_map prev_tables,
                      ha_rows limit, //limit的值
                      bool force_quick_range/* 强制使用范围扫描 */,
                      const enum_order interesting_order,/*范围访问方法必须能够提供的排序顺序，升序、降序、无所谓*/
                      const QEP_shared_owner *tab, //表相关信息
                      Item *cond,//条件
                      Key_map *needed_reg, /* used in make_join_query_block() 哪些索引可能被使用*/
                      QUICK_SELECT_I **quick,
                      bool ignore_table_scan, /*是否忽略全表扫描*/
                      Query_block *query_block);

函数流程

(1) 计算全表扫描的代价 table_scan_cost

const Cost_model_server *const cost_model = thd->cost_model();
TABLE *const head = tab->table();
ha_rows records = head->file->stats.records;//表统计信息 总记录数
if (!records) records++;
double scan_time = //全表扫描的cpu_cost
    cost_model->row_evaluate_cost(static_cast<double>(records)) + 1;
Cost_estimate cost_est = head->file->table_scan_cost();//全表扫描的io_cost
cost_est.add_io(1.1);
cost_est.add_cpu(scan_time);
if (ignore_table_scan) {//如果选择了忽略全表扫描 设置最大cost值 确保不会使用
    scan_time = DBL_MAX;
    cost_est.set_max_cost();
}
if (limit < records) { //如果语句中没有limit 该值必定limit > records
    cost_est.reset();//强制使用index_scan
    cost_est.add_io(head->cost_model()->page_read_cost(static_cast<double>(records)) + 1);
    cost_est.add_cpu(scan_time);
} else if (cost_est.total_cost() <= 2.0 && !force_quick_range)
    //如果全表扫描的成本非常低，并且没有强制使用快速选择 那么全表扫描可以认为就是代价最低的方式
    return 0; /* No need for quick select */

(2) 初始化PARAM结构体，在未找到最佳的table_read_plan前，后。续会根据条件cond与索引信息param构建一个range tree，查找相对应的range_scan扫描方式。

(3) 对可用索引进行选择

key_info = head->key_info;
//遍历表的所有索引，并根据给定条件决定哪些索引可以使用
for (uint idx = 0; idx < head->s->keys; idx++, key_info++) {
    KEY_PART_INFO *key_part_info;

    if (!keys_to_use.is_set(idx)) {//当前索引不可用
        continue;
    }

    if (hint_key_state(thd, head->pos_in_table_list, idx,
                       NO_RANGE_HINT_ENUM, 0)) {//检查是否有用户提供的查询提示（hints）禁止对当前索引进行范围优化
        continue;
    }

    if (key_info->flags & HA_FULLTEXT) { //full_text不适用于范围扫描
        continue;
    }

    param.key[param.keys] = key_parts;
    key_part_info = key_info->key_part;
    for (uint part = 0; part < actual_key_parts(key_info);//当前索引可用 遍历索引的每一个part
         part++, key_parts++, key_part_info++) {
        key_parts->key = param.keys;
        key_parts->part = part;
        key_parts->length = key_part_info->length;
        key_parts->store_length = key_part_info->store_length;
        key_parts->field = key_part_info->field;
        key_parts->null_bit = key_part_info->null_bit;
        key_parts->image_type = (part < key_info->user_defined_key_parts &&
                                 key_info->flags & HA_SPATIAL) ? Field::itMBR : Field::itRAW;
        key_parts->flag = key_part_info->key_part_flag;
    }
    param.real_keynr[param.keys++] = idx;
}

(4) 是否有可选择的覆盖索引进行查询

    /*在查询优化过程中，计算使用最短的覆盖索引进行全索引扫描，因为键长度最短的那个索引相应的IO_COST要小*/
    if (!head->covering_keys.is_clear_all()) {//当前表中是否存在覆盖索引
      int key_for_use = find_shortest_key(head, &head->covering_keys);
      // find_shortest_key() should return a valid key:
      assert(key_for_use != MAX_KEY);
 	 //使用覆盖索引进行index_scan
      Cost_estimate key_read_time = param.table->file->index_scan_cost(
          key_for_use, 1, static_cast<double>(records));
      key_read_time.add_cpu(
          cost_model->row_evaluate_cost(static_cast<double>(records)));
      bool chosen = false;
      if (key_read_time < cost_est) { //cost_est 全表扫描的代价 评估代价 选择相应的扫描方式
        cost_est = key_read_time;
        chosen = true;
      }
    }

(5) 考虑各种range scan

tree = get_mm_tree(&param, cond); //根据 where on 等条件 构建查找范围访问的备选方案


//对于group by的语句 尝试构建QUICK_GROUP_MIN_MAX_SELECT
group_trp = get_best_group_min_max(&param, tree, &best_cost);
    if (group_trp) {
      DBUG_EXECUTE_IF("force_lis_for_group_by", group_trp->cost_est.reset(););
      param.table->quick_condition_rows =
          min(group_trp->records, head->file->stats.records);
      Opt_trace_object grp_summary(trace, "best_group_range_summary",
                                   Opt_trace_context::RANGE_OPTIMIZER);
      if (unlikely(trace->is_started()))
        group_trp->trace_basic_info(&param, &grp_summary);
      if (group_trp->cost_est < best_cost) {
        grp_summary.add("chosen", true);
        best_trp = group_trp;
        best_cost = best_trp->cost_est;
      } else
        grp_summary.add("chosen", false).add_alnum("cause", "cost");
    }

//对于单表的非group by 检查是否有强制使用skip_scan的hint 
force_skip_scan = hint_table_state(
        param.thd, param.table->pos_in_table_list, SKIP_SCAN_HINT_ENUM, 0);

(6) 尝试选择代价更低的range scan扫描方法，包括可能的索引合并index_merge

/*
当前还存在可行的其他范围查找方法 
并且best_trp还没有指定或者没有指定force hint 
可通过get_key_scans_params探寻一个更优的范围扫描方法 但也可能不存在
*/
if (tree && (!best_trp || !best_trp->is_forced_by_hint())) { //best_trp, the best table_read_plan 
     //It is possible to use a range-based quick select (but it might be slower than 'all' table scan).
    {
        /*
          Calculate cost of single index range scan and possible intersections of these
        */
        TRP_RANGE *range_trp; // range_scan
        TRP_ROR_INTERSECT *rori_trp;  // ROR扫描

        /* Get best 'range' plan and prepare data for making other plans */
        if ((range_trp = // get_key_scans_params会对param中所有的key 计算扫描行数以及相应的cost 选择代价更低的range_scan
             get_key_scans_params(&param, tree, false, true, &best_cost))) {
            best_trp = range_trp;
            best_cost = best_trp->cost_est;
        }

        /*
          Simultaneous key scans and row deletes on several handler
          objects are not allowed so don't use ROR-intersection for
          table deletes. Also, ROR-intersection cannot return rows in
          descending order
        */
        if ((thd->lex->sql_command != SQLCOM_DELETE) && //非delete
            (param.index_merge_allowed || // 允许索引合并 或者 用户给了相关的提示
             hint_table_state(param.thd, param.table->pos_in_table_list,
                              INDEX_MERGE_HINT_ENUM, 0)) &&
            interesting_order != ORDER_DESC) { //无需以降序返回结果
            //Get best non-covering ROR-intersection plan and prepare data for building covering ROR-intersection.
            if ((rori_trp =
                 get_best_ror_intersect(&param, tree, &best_cost, true))) {
                best_trp = rori_trp;
                best_cost = best_trp->cost_est;
            }
        }
    }

    // Here we calculate cost of union index merge
    if (!tree->merges.is_empty()) {
        // Cannot return rows in descending order.
        if ((param.index_merge_allowed ||
             hint_table_state(param.thd, param.table->pos_in_table_list,
                              INDEX_MERGE_HINT_ENUM, 0)) &&
            interesting_order != ORDER_DESC &&
            param.table->file->stats.records) {
            /* Try creating index_merge/ROR-union scan. */
            SEL_IMERGE *imerge;
            TABLE_READ_PLAN *best_conj_trp = nullptr, *new_conj_trp = nullptr;
            List_iterator_fast<SEL_IMERGE> it(tree->merges);
            Opt_trace_array trace_idx_merge(trace, "analyzing_index_merge_union",
                                            Opt_trace_context::RANGE_OPTIMIZER);
            while ((imerge = it++)) {
                new_conj_trp = get_best_disjunct_quick(&param, imerge, &best_cost);
                if (new_conj_trp)
                    param.table->quick_condition_rows =
                    min(param.table->quick_condition_rows, new_conj_trp->records);
                if (!best_conj_trp ||
                    (new_conj_trp &&
                     new_conj_trp->cost_est < best_conj_trp->cost_est)) {
                    best_conj_trp = new_conj_trp;
                }
            }
            if (best_conj_trp) best_trp = best_conj_trp;
        }
    }
}

(7) 根据找到的最佳扫描计划，构建quick_select

/* 
If we got a read plan, create a quick select from it.
Only create a quick select if the storage engine supports using indexes for access.
*/
if (best_trp && (head->file->ha_table_flags() & HA_NO_INDEX_ACCESS) == 0) {
    QUICK_SELECT_I *qck;
    records = best_trp->records;
    if (!(qck = best_trp->make_quick(&param, true)) || qck->init())
        qck = nullptr;
    *quick = qck;
}

get_key_scans_params

//获取最佳的range_scan方法
static TRP_RANGE *get_key_scans_params(PARAM *param, SEL_TREE *tree, //test_quick_select关于使用的索引的信息
                                       bool index_read_must_be_used, //if true <=> index_only 无需回表查询
                                       bool update_tbl_stats,
                                       const Cost_estimate *cost_est);//当前最佳扫描方法的代价 table_scan or index_scan

函数流程

对所有可用索引进行遍历

//遍历当前所有可用索引的数量 并不是表中的所有索引  
for (idx = 0; idx < param->keys; idx++) {
    key = tree->keys[idx];
    if (key) {}
}

对当前使用的索引构建查询计划

check_quick_select中在已知扫描范围的情况下，计算该扫描方式下的cost与rows，通过函数multi_range_read_info_const调用相关的代价函数与records_in_range进行行数预估。

ha_rows found_records;
Cost_estimate cost;
uint mrr_flags = 0, buf_size = 0;
uint keynr = param->real_keynr[idx];//索引id
if (key->type == SEL_ROOT::Type::MAYBE_KEY || key->root->maybe_flag) //表示该索引依赖于其他表的数据（join），优化器需要考虑这种依赖关系
    param->needed_reg->set_bit(keynr);

bool read_index_only =  //是否是无需回表的index_only读取
    index_read_must_be_used
    ? true
    : (bool)param->table->covering_keys.is_set(keynr); //判断当前查询条件下 该索引是不是覆盖索引

//评估当前执行计划的可行性 需要扫描的记录数量 + cost 如果返回值found_records = HA_POS_ERROR 表示无法使用该索引
found_records = 
    check_quick_select(param, idx, read_index_only, key, update_tbl_stats, //关键点
                       &mrr_flags, &buf_size, &cost);

if (!compound_hint_key_enabled(param->table, keynr,
                               INDEX_MERGE_HINT_ENUM)) { //检查是否启用了复合提示USE INDEX 、IGNORE INDEX 并且影响到了当前的执行计划
    continue;
}

对于check_quick_select的结果，是否是更优的索引选择

if (found_records != HA_POS_ERROR && //找到了有效的记录数
    (read_cost > cost || //并且当前索引的成本比已知的最佳成本更低
     (force_index_merge && (!use_cheapest_index_merge || !key_to_read)))) { //强制使用索引合并 且 没有指定代价最小的索引或尚未选择任何索引
    read_cost = cost; //更新相关的索引信息
    best_records = found_records;
    key_to_read = key;
    best_idx = idx;
    best_mrr_flags = mrr_flags;
    best_buf_size = buf_size;
    is_best_idx_imerge_scan = param->is_imerge_scan;
} else {}

//根据已知信息 返回最佳的执行计划
if (key_to_read) {
    if ((read_plan = new (param->mem_root)
         TRP_RANGE(key_to_read, best_idx, best_mrr_flags))) {
        read_plan->records = best_records;
        read_plan->is_ror = tree->ror_scans_map.is_set(best_idx);
        read_plan->is_imerge = is_best_idx_imerge_scan;
        read_plan->cost_est = read_cost;
        read_plan->mrr_buf_size = best_buf_size;
    }
}

return read_plan;
}

check_quick_select

//评估使用给定索引进行范围扫描时预期检索的记录数，并计算相关成本
static ha_rows check_quick_select(PARAM *param, 
                                  uint idx, // 索引id
                                  bool index_only, // 是否index_only读取
                                  SEL_ROOT *tree, 
                                  bool update_tbl_stats, // 是否更新统计信息
                                  uint *mrr_flags, uint *bufsize,
                                  Cost_estimate *cost); // 当前要对比的执行计划的代价


//在已知扫描范围的情况下，计算该扫描方式下的cost与rows 调用代价函数与records_in_range
rows = file->multi_range_read_info_const(keynr, &seq_if, (void *)&seq, 0,
                                         bufsize, mrr_flags, cost);
if (rows != HA_POS_ERROR) {//有合理的行数预估结果
    param->table->quick_rows[keynr] = rows;
    if (update_tbl_stats) { //根据当前所选择的索引 更新相关的信息
        param->table->quick_keys.set_bit(keynr);
        param->table->quick_key_parts[keynr] = param->max_key_part + 1;
        param->table->quick_n_ranges[keynr] = param->range_count;
        param->table->quick_condition_rows =
            min(param->table->quick_condition_rows, rows);
    }
    param->table->possible_quick_keys.set_bit(keynr);
}

handler::multi_range_read_info_const

ha_rows handler::multi_range_read_info_const(
    uint keyno, RANGE_SEQ_IF *seq, void *seq_init_param,
    uint n_ranges_arg MY_ATTRIBUTE((unused)), uint *bufsz, uint *flags,
    Cost_estimate *cost) {
  KEY_MULTI_RANGE range;
  range_seq_t seq_it;
  ha_rows rows, total_rows = 0;
  uint n_ranges = 0;
  THD *thd = current_thd;

  /* Default MRR implementation doesn't need buffer */
  *bufsz = 0;

  DBUG_EXECUTE_IF("bug13822652_2", thd->killed = THD::KILL_QUERY;);

  seq_it = seq->init(seq_init_param, n_ranges, *flags);
  while (!seq->next(seq_it, &range)) {
    if (unlikely(thd->killed != 0)) return HA_POS_ERROR;

    n_ranges++;
    key_range *min_endp, *max_endp;
    if (range.range_flag & GEOM_FLAG) {
      min_endp = &range.start_key;
      max_endp = nullptr;
    } else {
      min_endp = range.start_key.length ? &range.start_key : nullptr;
      max_endp = range.end_key.length ? &range.end_key : nullptr;
    }
  
    int keyparts_used = 0;
    if ((range.range_flag & UNIQUE_RANGE) &&  // 1) 该索引是unique index
        !(range.range_flag & NULL_RANGE)) //且 该索引不是is null查询 因为即使是在unique index上 也是可以存在多个null值的
      rows = 1; /* there can be at most one row */
    else if (range.range_flag & SKIP_RECORDS_IN_RANGE &&  // 2)
             !(range.range_flag & NULL_RANGE)) { //注意 对于is null查询 需要调用records_in_range() 去进行行数预估
      if ((range.range_flag & EQ_RANGE) && //当前不是唯一索引列查询 并且是等值查询 意味着可能匹配到多行数据 
          (keyparts_used = my_count_bits(range.start_key.keypart_map)) &&
          table->key_info[keyno].has_records_per_key(keyparts_used - 1)) {
            //该种情况下 根据你使用的索引的实际part数 估计该等值查询下的records_per_key作为行数预估
        rows = static_cast<ha_rows>(
            table->key_info[keyno].records_per_key(keyparts_used - 1));
      } else {
        /*
          Return HA_POS_ERROR if the range does not use all key parts and
          the key cannot use partial key searches.
        */
        if ((index_flags(keyno, 0, false) & HA_ONLY_WHOLE_INDEX)) {
          assert(
              (range.range_flag & EQ_RANGE) &&
              !table->key_info[keyno].has_records_per_key(keyparts_used - 1));
          total_rows = HA_POS_ERROR;
          break;
        }
        /*
          Since records_in_range has not been called, set the rows to 1.
          FORCE INDEX has been used, cost model values will be ignored anyway.
        */
        rows = 1;
      }
    } else { // 使用records_in_range()对索引的上下界进行判断 估计扫描行数
      DBUG_EXECUTE_IF("crash_records_in_range", DBUG_SUICIDE(););
      assert(min_endp || max_endp);
      if (HA_POS_ERROR ==
          (rows = this->records_in_range(keyno, min_endp, max_endp))) {
        /* Can't scan one range => can't do MRR scan at all */
        total_rows = HA_POS_ERROR;
        break;
      }
    }
    total_rows += rows;
  }
  //估计该索引扫描下的代价
  if (total_rows != HA_POS_ERROR) {
    const Cost_model_table *const cost_model = table->cost_model();

    /* The following calculation is the same as in multi_range_read_info(): */
    *flags |= (HA_MRR_USE_DEFAULT_IMPL | HA_MRR_SUPPORT_SORTED);

    assert(cost->is_zero());
    if (*flags & HA_MRR_INDEX_ONLY)//是否需要回表
      *cost = index_scan_cost(keyno, static_cast<double>(n_ranges),//无需回表
                              static_cast<double>(total_rows));
    else
      *cost = read_cost(keyno, static_cast<double>(n_ranges),//需要回表
                        static_cast<double>(total_rows));
    cost->add_cpu(
        cost_model->row_evaluate_cost(static_cast<double>(total_rows)) + 0.01);
  }
  return total_rows;
}