The struct blk_mq_tag_set structure
include/linux/blk-mq.h
struct blk_mq_tag_set {
	unsigned int *mq_map;       /* per-CPU map: CPU id -> hw queue index */
	const struct blk_mq_ops *ops;
	unsigned int nr_hw_queues;  /* number of hardware queues */
	unsigned int queue_depth;   /* max hw supported */
	unsigned int reserved_tags;
	unsigned int cmd_size;      /* per-request extra data */
	int numa_node;
	unsigned int timeout;
	unsigned int flags;         /* BLK_MQ_F_* */
	void *driver_data;
	struct blk_mq_tags **tags;  /* one blk_mq_tags per hardware queue */
	struct mutex tag_list_lock;
	struct list_head tag_list;  /* request queues sharing this tag set */
};
The main job of this structure is to record hardware-level attributes; these values then serve as the basis for memory allocations and for initializing members of other structures.
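As a concrete illustration, here is a minimal sketch of how a driver might fill in the set before calling blk_mq_alloc_tag_set(). Everything named my_* (my_tag_set, my_driver_setup, my_mq_ops, struct my_cmd) is hypothetical, and the field values are illustrative only; a real driver such as nvme derives them from hardware capabilities.
static struct blk_mq_tag_set my_tag_set;  /* static, so zero-initialized */

static int my_driver_setup(void)
{
	my_tag_set.ops = &my_mq_ops;            /* must provide at least .queue_rq */
	my_tag_set.nr_hw_queues = 4;            /* e.g. one per hardware submission queue */
	my_tag_set.queue_depth = 128;           /* max in-flight requests per hw queue */
	my_tag_set.reserved_tags = 1;           /* tags held back for special commands */
	my_tag_set.cmd_size = sizeof(struct my_cmd); /* per-request driver payload */
	my_tag_set.numa_node = NUMA_NO_NODE;
	my_tag_set.timeout = 30 * HZ;           /* request timeout in jiffies */
	my_tag_set.flags = BLK_MQ_F_SHOULD_MERGE;

	return blk_mq_alloc_tag_set(&my_tag_set);
}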
Analysis of the blk_mq_alloc_tag_set function
/*
* Alloc a tag set to be associated with one or more request queues.
* May fail with EINVAL for various error conditions. May adjust the
* requested depth down, if it's too large. In that case, the set
* value will be stored in set->queue_depth.
*/
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{
int ret;
BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
if (!set->nr_hw_queues)
return -EINVAL;
if (!set->queue_depth)
return -EINVAL;
if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
return -EINVAL;
if (!set->ops->queue_rq)
return -EINVAL;
if (!set->ops->get_budget ^ !set->ops->put_budget) // must be provided together, or not at all
return -EINVAL;
if (set->queue_depth > BLK_MQ_MAX_DEPTH) { // cap the queue depth at its maximum
pr_info("blk-mq: reduced tag depth to %u\n", BLK_MQ_MAX_DEPTH);
set->queue_depth = BLK_MQ_MAX_DEPTH;
}
/*
* If a crashdump is active, then we are potentially in a very
* memory constrained environment. Limit us to 1 queue and
* 64 tags to prevent using too much memory.
*/
if (is_kdump_kernel()) {
set->nr_hw_queues = 1;
set->queue_depth = min(64U, set->queue_depth);
}
/*
* There is no use for more h/w queues than cpus.
*/
if (set->nr_hw_queues > nr_cpu_ids) //https://www.coder.work/article/7496368
set->nr_hw_queues = nr_cpu_ids;
/*
set->tags[0] = struct blk_mq_tags *
...
set->tags[nr_cpu_ids - 1] = struct blk_mq_tags *
*/
set->tags = kcalloc_node(nr_cpu_ids, sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node);
if (!set->tags)
return -ENOMEM;
ret = -ENOMEM;
set->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*set->mq_map), GFP_KERNEL, set->numa_node);
if (!set->mq_map)
goto out_free_tags;
ret = blk_mq_update_queue_map(set); // mainly fills in set->mq_map
if (ret)
goto out_free_mq_map;
ret = blk_mq_alloc_rq_maps(set); // allocates and populates the entries of set->tags
if (ret)
goto out_free_mq_map;
mutex_init(&set->tag_list_lock);
INIT_LIST_HEAD(&set->tag_list);
return 0;
out_free_mq_map:
kfree(set->mq_map);
set->mq_map = NULL;
out_free_tags:
kfree(set->tags);
set->tags = NULL;
return ret;
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);
On entry, this function first sanity-checks the required input values, then allocates and initializes the tags and mq_map members.
The tags member is a double pointer, and the allocation covers nr_cpu_ids elements, i.e. one per CPU. Think of it as an array of pointers in which every element points to a struct blk_mq_tags. The small demo below illustrates the idea:
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	int **p;   /* pointer to an array of int pointers */
	int x, y;

	/* note: we allocate 4 pointers, so sizeof(int *), not sizeof(int) */
	p = malloc(sizeof(int *) * 4);
	if (!p)
		return 1;
	p[0] = &x;
	p[1] = &y;
	x = 10;
	y = 20;
	printf("*p[0] = %d\n", *(*(p + 0)));  /* prints 10 */
	free(p);
	return 0;
}
The mq_map member (a single-level pointer) is likewise allocated with one entry per CPU.
blk_mq_update_queue_map
Now let's step into this function:
static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
{
if (set->ops->map_queues) {
/*
* transport .map_queues is usually done in the following
* way:
*
* for (queue = 0; queue < set->nr_hw_queues; queue++) {
* mask = get_cpu_mask(queue)
* for_each_cpu(cpu, mask)
* set->mq_map[cpu] = queue;
* }
*
* When we need to remap, the table has to be cleared for
* killing stale mapping since one CPU may not be mapped
* to any hw queue.
*/
blk_mq_clear_mq_map(set);
return set->ops->map_queues(set);
} else
return blk_mq_map_queues(set);
}
The nvme driver does not set set->ops->map_queues, so we analyze blk_mq_map_queues instead.
int blk_mq_map_queues(struct blk_mq_tag_set *set)
{
unsigned int *map = set->mq_map;
unsigned int nr_queues = set->nr_hw_queues;
unsigned int cpu, first_sibling;
for_each_possible_cpu(cpu) {
/*
* First do sequential mapping between CPUs and queues.
* In case we still have CPUs to map, and we have some number of
* threads per cores then map sibling threads to the same queue for
* performance optimizations.
*/
if (cpu < nr_queues) {
map[cpu] = cpu_to_queue_index(nr_queues, cpu);
} else {
first_sibling = get_first_sibling(cpu);
if (first_sibling == cpu)
map[cpu] = cpu_to_queue_index(nr_queues, cpu);
else
map[cpu] = map[first_sibling];
}
}
return 0;
}
EXPORT_SYMBOL_GPL(blk_mq_map_queues);
As noted earlier, mq_map is a pointer (effectively a one-dimensional array), and here we finally see where its values come from. As the comment makes clear, the array is indexed by CPU number, and each element holds a hardware queue number. That is the whole job of this function: populating mq_map.
Looking at the comment again: the if branch assigns queues sequentially, e.g. map[0] = 0, ..., map[n] = n. The else branch is a performance optimization: hyper-thread siblings are mapped to the same hardware queue, so threads that share a physical core also share a queue. We won't go deeper into it here; a small sketch of the resulting mapping follows.
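To make the mapping concrete, here is a user-space sketch of the same logic. The topology is hypothetical: 8 CPUs and 4 hardware queues, with CPU i and CPU i+4 as hyper-thread siblings. get_first_sibling() below is a stand-in for the kernel's topology lookup, and cpu_to_queue_index() is modeled as a simple modulo.
#include <stdio.h>

#define NR_CPUS   8
#define NR_QUEUES 4

/* hypothetical sibling relation: CPU i and CPU i+4 share a core */
static unsigned int get_first_sibling(unsigned int cpu)
{
	return cpu % (NR_CPUS / 2);
}

int main(void)
{
	unsigned int map[NR_CPUS];
	unsigned int cpu, first_sibling;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (cpu < NR_QUEUES) {
			map[cpu] = cpu % NR_QUEUES;  /* sequential mapping */
		} else {
			first_sibling = get_first_sibling(cpu);
			if (first_sibling == cpu)
				map[cpu] = cpu % NR_QUEUES;
			else
				map[cpu] = map[first_sibling]; /* share the sibling's queue */
		}
	}

	/* prints map[0..3] = 0..3, then map[4..7] = 0..3 */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		printf("map[%u] = %u\n", cpu, map[cpu]);
	return 0;
}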
blk_mq_alloc_rq_maps
Next, the analysis of the blk_mq_alloc_rq_maps function.
Stepping into it, there are several layers of nested calls; the main job is first to allocate set->tags[idx], and then to fill in the many structure members inside each set->tags[idx].
/*
* Allocate the request maps associated with this tag_set. Note that this
* may reduce the depth asked for, if memory is tight. set->queue_depth
* will be updated to reflect the allocated depth.
*/
static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
unsigned int depth;
int err;
depth = set->queue_depth; // remember the requested queue depth
do {
err = __blk_mq_alloc_rq_maps(set);
if (!err)
break; // on success we exit here; on failure, retry with a smaller depth
set->queue_depth >>= 1;
if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
err = -ENOMEM;
break;
}
} while (set->queue_depth);
if (!set->queue_depth || err) {
pr_err("blk-mq: failed to allocate request map\n");
return -ENOMEM;
}
if (depth != set->queue_depth)
pr_info("blk-mq: reduced tag depth (%u -> %u)\n", depth, set->queue_depth);
return 0;
}
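For example, if the requested depth is 1024 and __blk_mq_alloc_rq_maps() fails for lack of memory, the loop retries at 512, then 256, and so on, finally giving up with -ENOMEM once the depth would drop below reserved_tags + BLK_MQ_TAG_MIN.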
static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
int i;
for (i = 0; i < set->nr_hw_queues; i++) // one allocation per hardware queue
if (!__blk_mq_alloc_rq_map(set, i)) // returns true on success
goto out_unwind;
return 0;
out_unwind:
while (--i >= 0)
blk_mq_free_rq_map(set->tags[i]);
return -ENOMEM;
}
static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx) // hctx_idx: hardware queue index
{
int ret = 0;
/*
hctx_idx was checked earlier, so it cannot exceed nr_cpu_ids
*/
set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx, set->queue_depth, set->reserved_tags);
if (!set->tags[hctx_idx])
return false;
ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx, set->queue_depth);
if (!ret)
return true; // the normal path returns here
blk_mq_free_rq_map(set->tags[hctx_idx]);
set->tags[hctx_idx] = NULL;
return false;
}
Having arrived at __blk_mq_alloc_rq_map, let's first look at the call to blk_mq_alloc_rq_map.
struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int nr_tags, unsigned int reserved_tags)
{
struct blk_mq_tags *tags;
int node;
node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
tags = blk_mq_init_tags(nr_tags, reserved_tags, node, BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); // nr_tags is the queue depth
if (!tags)
return NULL;
// rqs is a double pointer; size: queue depth * sizeof(struct request *)
tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node);
if (!tags->rqs) {
blk_mq_free_tags(tags);
return NULL;
}
// static_rqs is likewise a double pointer; size: queue depth * sizeof(struct request *)
tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node);
if (!tags->static_rqs) {
kfree(tags->rqs);
blk_mq_free_tags(tags);
return NULL;
}
return tags;
}
Then comes blk_mq_alloc_rqs, which allocates the actual struct request objects; the code is fairly involved, so it is not reproduced here. Loosely speaking, static_rqs ends up holding the preallocated requests themselves, while rqs is filled in at run time so that an in-flight request can be looked up by its tag.
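As a rough illustration of the idea only (the real kernel code allocates page by page, handles NUMA placement, and deals with partial failure), here is a simplified user-space sketch: carve one allocation into queue_depth slots, each holding a struct request followed by cmd_size bytes of driver-private data, and record each request in static_rqs. All names here are hypothetical stand-ins.
#include <stdlib.h>

struct request { int tag; /* stand-in for the real struct request */ };

/* simplified stand-in for struct blk_mq_tags; the caller must have
 * allocated static_rqs[depth] already, as blk_mq_alloc_rq_map did */
struct demo_tags {
	struct request **static_rqs;
};

static int demo_alloc_rqs(struct demo_tags *tags,
			  unsigned int depth, unsigned int cmd_size)
{
	/* each slot is a struct request followed by cmd_size bytes of
	 * driver-private data; the kernel additionally rounds this up
	 * to a cache-line multiple, approximated here with 8 bytes */
	size_t rq_size = (sizeof(struct request) + cmd_size + 7) & ~(size_t)7;
	char *buf = calloc(depth, rq_size);
	unsigned int i;

	if (!buf)
		return -1;
	for (i = 0; i < depth; i++) {
		struct request *rq = (struct request *)(buf + i * rq_size);
		rq->tag = i;
		tags->static_rqs[i] = rq;  /* the preallocated request table */
	}
	return 0;
}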
And that is the overall flow: parameter validation, memory allocation, and member initialization.