nvme 驱动详解 之1
http://blog.youkuaiyun.com/qqqqqq999999/article/details/47732319
首先打开drivers/block下的Kconfig文件,其中定义了BLK_DEV_NVME这个config,如下。
config BLK_DEV_NVME
tristate "NVM Express block device"
depends on PCI
---help---
The NVM Express driver is for solid state drives directly
connected to the PCI or PCI Express bus. If you know you
don't have one of these, it is safe to answer N.
To compile this driver as a module, choose M here: the
module will be called nvme.
通过console,输入make menuconfig,搜索BLK_DEV_NVME,得到如下依赖关系。
Symbol: BLK_DEV_NVME [=m]
| Type : tristate
| Prompt: NVM Express block device
| Location:
| -> Device Drivers
| (1) -> Block devices (BLK_DEV [=y])
| Defined at drivers/block/Kconfig:313
| Depends on: BLK_DEV [=y] && PCI [=y]
可以看到nvme依赖于BLK_DEV和PCI。
打开drivers/block/Makefile,搜索NVME,可以看到:
obj-$(CONFIG_BLK_DEV_NVME) += nvme.o
nvme-y := nvme-core.o nvme-scsi.o
关于和BLK相关的文件,打开block/Makefile:
obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
partitions/
哇塞,是不是很多?不要担心,NVME也只是用了BLOCK层的一些函数而已,不用把所有与BLOCK相关的文件都看了,除非你有精力去研究。
好了,到目前为止,我们知道了要看哪些文件了:nvme-core.c和nvme-scsi.c是必须的,剩下的就是当我们的driver调用到block层的哪些函数时,再去研究对应的文件。
打开nvme-core,查看入口函数,module_init(nvme_init);
/*
 * Module entry point (hooked up with module_init(nvme_init)).
 *
 * Sets up, in this order: the kthread wait queue head, the "nvme"
 * single-threaded workqueue, the "nvme" block device registration and
 * finally the PCI driver. If a later step fails, the earlier steps are
 * undone in reverse order before the error is returned.
 *
 * Returns 0 on success, -ENOMEM when the workqueue cannot be created,
 * otherwise the error code from register_blkdev() or
 * pci_register_driver().
 */
static int __init nvme_init(void)
{
	int ret;

	/* Initialise the wait queue head. */
	init_waitqueue_head(&nvme_kthread_wait);

	/* Create the driver's workqueue. */
	nvme_workq = create_singlethread_workqueue("nvme");
	if (!nvme_workq)
		return -ENOMEM;

	/* Register the block device; a positive return value becomes the major. */
	ret = register_blkdev(nvme_major, "nvme");
	if (ret < 0)
		goto err_destroy_workq;
	if (ret > 0)
		nvme_major = ret;

	/* Register the PCI driver; on success there is nothing left to do. */
	ret = pci_register_driver(&nvme_driver);
	if (!ret)
		return 0;

	unregister_blkdev(nvme_major, "nvme");
err_destroy_workq:
	destroy_workqueue(nvme_workq);
	return ret;
}
注册pci driver后,会调用nvme_driver中的probe函数。发现开始总是美好的,函数是如此的简洁,不要高兴的太早,痛苦的经历正在逼近。
static int nvme_probe(struct pci_devpdev, const struct pci_device_id id)
{
int node, result = -ENOMEM;
struct nvme_dev *dev;
node = dev_to_node(&pdev->dev);//获取node节点,与NUMA系统有关。
if (node == NUMA_NO_NODE)
set_dev_node(&pdev->dev, 0);
dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
if (!dev)
return -ENOMEM;
dev->entry = kzalloc_node(num_possible_cpus() *sizeof(*dev->entry),//分配msix-entry
GFP_KERNEL,node);
if (!dev->entry)
goto free;
dev->queues = kzalloc_node((num_possible_cpus() + 1) *sizeof(void *),//分配queues 资源,
GFP_KERNEL,node);//这里之所以多1,是因为有admin-queues
if (!dev->queues)
goto free;
INIT_LIST_HEAD(&dev->namespaces);//初始化namespaces链表。
dev->reset_workfn = nvme_reset_failed_dev;
INIT_WORK(&dev->reset_work, nvme_reset_workfn);
dev->pci_dev = pci_dev_get(pdev);
pci_set_drvdata(pdev, dev);
result = nvme_set_instance(dev);//设置pci设备的句柄instance,代表该设备。
if (result)
goto put_pci;
result = nvme_setup_prp_pools(dev);//设置dma需要的prp内存池。
if (result)
goto release;
kref_init(&dev->kref);
result = nvme_dev_start(dev);//创建admin queue、 io queue 、request irq
if (result)
goto release_pools;
if (dev->online_queues > 1)
result = nvme_dev_add(dev);//初始化mq,并增加一个实际可用的nvme dev,并且admin_queue可以发送cmd。
if (result)
goto shutdown;
scnprintf(dev->name, sizeof(dev->name),"nvme%d", dev->instance);
dev->miscdev.minor = MISC_DYNAMIC_MINOR;
dev->miscdev.parent = &pdev->dev;
dev->miscdev.name = dev->name;
dev->miscdev.fops = &nvme_dev_fops;
result = misc_register(&dev->miscdev);//注册一个misc设备
if (result)
goto remove;
nvme_set_irq_hints(dev);
dev->initialized = 1;
return 0;
remove:
nvme_dev_remove(dev);
nvme_dev_remove_admin(dev);
nvme_free_namespaces(dev);
shutdown:
nvme_dev_shutdown(dev);
release_pools:
nvme_free_queues(dev, 0);
nvme_release_prp_pools(dev);
release:
nvme_release_instance(dev);
put_pci:
pci_dev_put(dev->pci_dev);
free:
kfree(dev->queues);
kfree(dev->entry);
kfree(dev);
return result;
}
上面每一个主要功能的函数都简单地注释了一下,描述了它们做了哪些工作,下面具体看看这些函数是怎么实现的。
static int nvme_set_instance(structnvme_dev *dev)
{
int instance, error;
do {
if (!ida_pre_get(&nvme_instance_ida,GFP_KERNEL))
return -ENODEV;
spin_lock(&dev_list_lock);
error = ida_get_new(&nvme_instance_ida,&instance);
spin_unlock(&dev_list_lock);
} while (error == -EAGAIN);
if (error)
return -ENODEV;
dev->instance = instance;//该函数获得设备的instance,相当于该设备的id,代表着该设备。
return 0;
}
nvme_setup_prp_pools用来创建DMA时所用的内存池,prp_page_pool是内核虚拟地址。
static int nvme_setup_prp_pools(structnvme_dev *dev)
{
struct device *dmadev = &dev->pci_dev->dev;
dev->prp_page_pool = dma_pool_create("prp listpage", dmadev,
PAGE_SIZE,PAGE_SIZE, 0);
if (!dev->prp_page_pool)
return -ENOMEM;
/* Optimisation for I/Os between 4k and 128k */
dev->prp_small_pool = dma_pool_create("prp list256", dmadev,
256, 256, 0);
if (!dev->prp_small_pool) {
dma_pool_destroy(dev->prp_page_pool);
return -ENOMEM;
}
return 0;
}
下面是重量级的函数之一:nvme_dev_start。
static intnvme_dev_start(struct nvme_dev *dev)
{
int result;
bool start_thread = false;
result = nvme_dev_map(dev);
if (result)
return result;
result = nvme_configure_admin_queue(dev);//配置adminsubmit queue 和complete queue,64 depth
if (result)
goto unmap;
spin_lock(&dev_list_lock);
if (list_empty(&dev_list) &&IS_ERR_OR_NULL(nvme_thread)) {
start_thread = true;
nvme_thread = NULL;
}
list_add(&dev->node, &dev_list);