NVMe is the upper-layer protocol used by SSDs; its physical layer is the PCIe interface.
The NVMe driver's entry function lives in drivers/nvme/host/pci.c:
static int __init nvme_init(void)
{
        int result;

        nvme_workq = alloc_workqueue("nvme", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
        if (!nvme_workq)
                return -ENOMEM;

        result = pci_register_driver(&nvme_driver);
        if (result)
                destroy_workqueue(nvme_workq);
        return result;
}
module_init(nvme_init);
In nvme_init, a global workqueue is allocated first, and then pci_register_driver is called to register a PCI driver; at that point nvme_init's job is done. When a matching PCIe device is detected on the PCIe bus, nvme_probe is called. The NVMe PCIe devices currently supported are:
static const struct pci_device_id nvme_id_table[] = {
        { PCI_VDEVICE(INTEL, 0x0953),
                .driver_data = NVME_QUIRK_STRIPE_SIZE |
                               NVME_QUIRK_DISCARD_ZEROES, },
        { PCI_VDEVICE(INTEL, 0x0a53),
                .driver_data = NVME_QUIRK_STRIPE_SIZE |
                               NVME_QUIRK_DISCARD_ZEROES, },
        { PCI_VDEVICE(INTEL, 0x0a54),
                .driver_data = NVME_QUIRK_STRIPE_SIZE |
                               NVME_QUIRK_DISCARD_ZEROES, },
        { PCI_VDEVICE(INTEL, 0x5845),   /* Qemu emulated controller */
                .driver_data = NVME_QUIRK_IDENTIFY_CNS, },
        { PCI_DEVICE(0x1c58, 0x0003),   /* HGST adapter */
                .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
        { PCI_DEVICE(0x1c5f, 0x0540),   /* Memblaze Pblaze4 adapter */
                .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
        { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
        { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
        { 0, }
};
As you can see, most of the explicitly listed devices are Intel's; the PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) entry additionally matches any controller that advertises the standard NVMe class code.
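To make the registration side concrete, here is a minimal, self-contained sketch of the same pci_driver pattern (the demo_* names are made up for illustration; the class match is the generic NVMe entry from the table above):

#include <linux/module.h>
#include <linux/pci.h>

static int demo_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
        /* the PCI core calls this for every device matching demo_id_table */
        dev_info(&pdev->dev, "matched an NVMe class device\n");
        return 0;
}

static const struct pci_device_id demo_id_table[] = {
        { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
        { 0, }
};
MODULE_DEVICE_TABLE(pci, demo_id_table);

static struct pci_driver demo_driver = {
        .name     = "nvme_demo",
        .id_table = demo_id_table,
        .probe    = demo_probe,
};

static int __init demo_init(void)
{
        /* walks devices already on the bus and calls .probe on matches */
        return pci_register_driver(&demo_driver);
}

static void __exit demo_exit(void)
{
        pci_unregister_driver(&demo_driver);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

With the driver registered, the PCI core eventually invokes nvme_probe for each matching controller: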
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
        int node, result = -ENOMEM;
        struct nvme_dev *dev;

        /* dev_to_node() returns this pci_dev's NUMA node; if none was
         * specified, fall back to first_memory_node, i.e. the first
         * NUMA node that has memory */
        node = dev_to_node(&pdev->dev);
        if (node == NUMA_NO_NODE)
                set_dev_node(&pdev->dev, first_memory_node);

        /* allocate an nvme_dev: the kernel represents an NVMe device
         * with struct nvme_dev */
        dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
        if (!dev)
                return -ENOMEM;

        /* allocate an array of queue pointers: one per possible CPU,
         * plus one for the admin queue */
        dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
                                   GFP_KERNEL, node);
        if (!dev->queues)
                goto free;

        dev->dev = get_device(&pdev->dev);
        pci_set_drvdata(pdev, dev);

        result = nvme_dev_map(dev);
        if (result)
                goto free;

        /* initialize two work items; both end up running on the global
         * workqueue nvme_workq */
        INIT_WORK(&dev->reset_work, nvme_reset_work);
        INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
        /* set up the watchdog timer */
        setup_timer(&dev->watchdog_timer, nvme_watchdog_timer,
                    (unsigned long)dev);
        mutex_init(&dev->shutdown_lock);
        /* initialize a completion */
        init_completion(&dev->ioq_wait);

        /* allocate the prp_page_pool and prp_small_pool DMA pools */
        result = nvme_setup_prp_pools(dev);
        if (result)
                goto put_pci;

        /* initialize the NVMe controller structures */
        result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
                                id->driver_data);
        if (result)
                goto release_pools;

        /* this line shows up in the boot log */
        dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));

        /* queue dev->reset_work on nvme_workq for execution */
        queue_work(nvme_workq, &dev->reset_work);
        return 0;

release_pools:
        nvme_release_prp_pools(dev);
put_pci:
        put_device(dev->dev);
        nvme_dev_unmap(dev);
free:
        kfree(dev->queues);
        kfree(dev);
        return result;
}
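A side note on the watchdog: setup_timer is the classic (pre-4.15) kernel timer API, where the callback receives a cookie passed as an unsigned long. A minimal sketch of the pattern, with a hypothetical my_dev standing in for nvme_dev:

#include <linux/timer.h>
#include <linux/jiffies.h>

struct my_dev {
        struct timer_list watchdog_timer;
};

/* the cookie set up below comes back here; cast it to the device */
static void my_watchdog(unsigned long data)
{
        struct my_dev *dev = (struct my_dev *)data;

        /* ... check controller health, maybe kick reset_work ... */
        mod_timer(&dev->watchdog_timer, jiffies + HZ);  /* re-arm in 1s */
}

static void my_dev_init(struct my_dev *dev)
{
        setup_timer(&dev->watchdog_timer, my_watchdog, (unsigned long)dev);
        mod_timer(&dev->watchdog_timer, jiffies + HZ);  /* first tick */
}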
nvme_probe in turn calls nvme_dev_map, nvme_setup_prp_pools, and nvme_init_ctrl; let's look at them one by one.
static int nvme_dev_map(struct nvme_dev *dev)
{
        struct pci_dev *pdev = to_pci_dev(dev->dev);

        if (pci_request_mem_regions(pdev, "nvme"))
                return -ENODEV;

        dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
        if (!dev->bar)
                goto release;

        return 0;
release:
        pci_release_mem_regions(pdev);
        return -ENODEV;
}
nvme_dev_map uses dev->dev to get the struct device embedded in nvme_dev, then to_pci_dev to convert that device back into a pci_dev.
It then calls pci_request_mem_regions to claim the device's memory regions under the name "nvme", and maps the first 8 KB of BAR 0, where the controller's registers live, with ioremap(pci_resource_start(pdev, 0), 8192).
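Once dev->bar is mapped, the controller registers can be read with the normal MMIO accessors. A small sketch (demo_print_version is hypothetical; NVME_REG_VS is the version register offset from include/linux/nvme.h):

#include <linux/kernel.h>
#include <linux/io.h>
#include <linux/nvme.h>

/* VS encodes the major version in bits 31:16 and the minor in 15:8 */
static void demo_print_version(void __iomem *bar)
{
        u32 vs = readl(bar + NVME_REG_VS);

        pr_info("NVMe controller version %u.%u\n",
                vs >> 16, (vs >> 8) & 0xff);
}

Back in the probe path, nvme_setup_prp_pools comes next: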
static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
        dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
                                             PAGE_SIZE, PAGE_SIZE, 0);
        if (!dev->prp_page_pool)
                return -ENOMEM;

        /* Optimisation for I/Os between 4k and 128k */
        dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
                                              256, 256, 0);
        if (!dev->prp_small_pool) {
                dma_pool_destroy(dev->prp_page_pool);
                return -ENOMEM;
        }
        return 0;
}
nvme_setup_prp_pools creates two DMA pools: a PAGE_SIZE (4K) pool named "prp list page" and a 256-byte pool named "prp list 256". As mentioned before, when the DMA buffers you need fit within a single page, dma_pool_create is the recommended way to allocate them.
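For reference, a hedged sketch of how such a pool is consumed (the demo_* helpers are made up; the real PRP list handling lives elsewhere in pci.c):

#include <linux/dmapool.h>
#include <linux/gfp.h>
#include <linux/types.h>

/* one PRP list page: the CPU address is where PRP entries get written,
 * and *dma receives the bus address the previous PRP entry points to */
static __le64 *demo_alloc_prp_list(struct dma_pool *pool, dma_addr_t *dma)
{
        return dma_pool_alloc(pool, GFP_ATOMIC, dma);
}

static void demo_free_prp_list(struct dma_pool *pool, __le64 *list,
                               dma_addr_t dma)
{
        dma_pool_free(pool, list, dma);
}

The last of the three helpers is nvme_init_ctrl: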
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
                   const struct nvme_ctrl_ops *ops, unsigned long quirks)
{
        int ret;

        ctrl->state = NVME_CTRL_NEW;
        spin_lock_init(&ctrl->lock);
        INIT_LIST_HEAD(&ctrl->namespaces);
        mutex_init(&ctrl->namespaces_mutex);
        kref_init(&ctrl->kref);
        ctrl->dev = dev;
        ctrl->ops = ops;
        ctrl->quirks = quirks;

        /* initialize two more work items */
        INIT_WORK(&ctrl->scan_work, nvme_scan_work);
        INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);

        ret = nvme_set_instance(ctrl);
        if (ret)
                goto out;

        /* create the nvme device in sysfs; with a single NVMe controller
         * this is nvme0, the next one nvme1, and so on */
        ctrl->device = device_create_with_groups(nvme_class, ctrl->dev,
                                MKDEV(nvme_char_major, ctrl->instance),
                                ctrl, nvme_dev_attr_groups,
                                "nvme%d", ctrl->instance);
        if (IS_ERR(ctrl->device)) {
                ret = PTR_ERR(ctrl->device);
                goto out_release_instance;
        }
        get_device(ctrl->device);
        ida_init(&ctrl->ns_ida);

        spin_lock(&dev_list_lock);
        /* add this controller to nvme_ctrl_list: every nvme controller
         * in the system lives on this list */
        list_add_tail(&ctrl->node, &nvme_ctrl_list);
        spin_unlock(&dev_list_lock);

        return 0;
out_release_instance:
        nvme_release_instance(ctrl);
out:
        return ret;
}
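The scan/async-event works follow the same deferred-work idiom as reset_work in nvme_probe. A minimal self-contained sketch (the my_* names are invented), showing the container_of trick the real handlers use to get from the work_struct back to the controller:

#include <linux/kernel.h>
#include <linux/workqueue.h>

struct my_ctrl {
        struct work_struct scan_work;
};

static void my_scan_work(struct work_struct *work)
{
        /* recover the owning controller from the embedded work_struct */
        struct my_ctrl *ctrl = container_of(work, struct my_ctrl, scan_work);

        /* ... scan ctrl's namespaces here ... */
}

static void my_ctrl_init(struct my_ctrl *ctrl, struct workqueue_struct *wq)
{
        INIT_WORK(&ctrl->scan_work, my_scan_work);
        queue_work(wq, &ctrl->scan_work);   /* my_scan_work runs later */
}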