A device belongs to one iommu_group, and the device is attached to an iommu_domain; a domain represents one set of I/O page tables (one mapping).
When a device's driver is bound to vfio, vfio allocates a fresh domain for the device using iommu_domain_alloc(), with type IOMMU_DOMAIN_UNMANAGED, and attaches the device to it; the device stays in its original iommu_group.
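The allocation helper itself is tiny; simplified from drivers/iommu/iommu.c of this era, it hard-codes the unmanaged type:

	/* Simplified from drivers/iommu/iommu.c (~4.19/5.10):
	 * the domain VFIO allocates is UNMANAGED, meaning VFIO itself manages
	 * the iova mappings rather than the kernel DMA API. */
	struct iommu_domain *iommu_domain_alloc(struct bus_type *bus)
	{
		return __iommu_domain_alloc(bus, IOMMU_DOMAIN_UNMANAGED);
	}

The trace below shows this path when qemu-kvm attaches a passthrough device: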
[ 658.270026] ===arm_smmu_init_domain_context 0 domain ffff00ff8da47758 type 1 dev 0000:0b:10.1 pgtbl_ops 0
[ 658.279573] CPU: 31 PID: 7196 Comm: qemu-kvm Kdump: loaded Tainted: G E 5.10.134.anolis+ #24
[ 658.289264] Hardware name: PHYTIUM LTD Phytium S2500/64/Phytium S2500/64, BIOS V2.2 Feb 9 2021
[ 658.297918] Call trace:
[ 658.300358] dump_backtrace+0x0/0x210
[ 658.304002] show_stack+0x20/0x30
[ 658.307302] dump_stack+0xd8/0x130
[ 658.310689] arm_smmu_attach_dev+0x250/0x998
[ 658.314938] __iommu_attach_device+0x2c/0xd8
[ 658.319185] __iommu_attach_group+0x64/0xa0
[ 658.323346] iommu_attach_group+0x3c/0x60
[ 658.327341] vfio_iommu_attach_group.isra.28+0x44/0x50 [vfio_iommu_type1]
[ 658.334096] vfio_iommu_type1_attach_group+0x204/0x8e0 [vfio_iommu_type1]
[ 658.340853] vfio_fops_unl_ioctl+0x190/0x298 [vfio]
[ 658.345709] __arm64_sys_ioctl+0xb0/0xf0
[ 658.349612] el0_svc_common+0xbc/0x210
[ 658.353341] do_el0_svc+0x30/0xa0
[ 658.356640] el0_svc+0x20/0x30
[ 658.359679] el0_sync_handler+0x90/0xb8
[ 658.363496] el0_sync+0x160/0x180
[ 658.366819] ===arm_smmu_init_domain_context 1 domain ffff00ff8da47758 dev 0000:0b:10.1 pgtbl_ops ffff00ff8da47658
[ 658.377158] ===arm_smmu_init_domain_context 2 domain ffff00ff8da47758 dev 0000:0b:10.1 pgtbl_ops ffff00ff8da47658
Earlier we saw that a dma_map_ops is associated with each device, and that drivers call the DMA map API to set up iova-to-phys mappings. For a passthrough device, those map calls actually happen inside the guest OS; presumably they are intercepted by qemu/kvm, which then performs the map itself (via the VFIO_IOMMU_MAP_DMA ioctl shown below) and manages the iova and vaddr.
By default the vfio driver creates the miscdevice /dev/vfio/vfio.
The other, numerically named nodes stand for iommu_group numbers and match the entries under /sys/kernel/iommu_groups/; there is a single one here because one device is passed through to the VM.
[root@localhost ~]# ls /dev/vfio/ -l
total 0
crw------- 1 root root 244, 0 Feb 9 16:24 1
crw-rw-rw- 1 root root 10, 196 Feb 9 16:24 vfio
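The /dev/vfio/vfio node is a plain misc device registered at module init. From vfio.c (4.19 era), which also explains the 10,196 device number and the crw-rw-rw- mode in the listing above:

	static struct miscdevice vfio_dev = {
		.minor = VFIO_MINOR,		/* 196, fixed misc minor */
		.name = "vfio",
		.fops = &vfio_fops,
		.nodename = "vfio/vfio",	/* creates /dev/vfio/vfio */
		.mode = S_IRUGO | S_IWUGO,	/* crw-rw-rw- */
	};

	/* registered from vfio_init(): misc_register(&vfio_dev); */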
The VM's map flow. Note the call chain: iommu_map() is reached from vfio_iommu_type1_attach_group(), i.e. this is the "replay mappings on new domains" step (vfio_iommu_replay(), inlined here) re-installing already-registered mappings onto the newly attached domain:
[ 230.740469] ===__arm_lpae_map iova 10000 paddr 379cd470000 size 65536
[ 230.747060] CPU: 120 PID: 7459 Comm: qemu-kvm Kdump: loaded Not tainted 4.19.0l+ #44
[ 230.754767] Hardware name: PHYTIUM LTD Phytium S2500/64/Phytium S2500/64, BIOS V2.2 Feb 9 2021
[ 230.763423] Call trace:
[ 230.765860] dump_backtrace+0x0/0x1c0
[ 230.769506] show_stack+0x24/0x30
[ 230.772806] dump_stack+0x9c/0xbc
[ 230.776107] __arm_lpae_map+0x94/0x2f0
[ 230.779839] __arm_lpae_map+0x190/0x2f0
[ 230.783657] arm_lpae_map+0xf8/0x128
[ 230.787216] arm_smmu_map+0x90/0xb0
[ 230.790689] iommu_map+0xdc/0x280
[ 230.793991] vfio_iommu_type1_attach_group+0x4d8/0x738 [vfio_iommu_type1]
[ 230.800748] vfio_fops_unl_ioctl+0x16c/0x290 [vfio]
[ 230.805604] do_vfs_ioctl+0xc4/0x838
[ 230.809164] ksys_ioctl+0x84/0xb8
[ 230.812464] __arm64_sys_ioctl+0x28/0x38
[ 230.816368] el0_svc_handler+0xac/0xf8
[ 230.820099] el0_svc+0x8/0xc
static long vfio_iommu_type1_ioctl(void *iommu_data,
				   unsigned int cmd, unsigned long arg)
{
	..............
	} else if (cmd == VFIO_IOMMU_MAP_DMA) {
		struct vfio_iommu_type1_dma_map map;
		uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&map, (void __user *)arg, minsz))
			return -EFAULT;

		printk("===vfio_iommu_type1_ioctl cmd VFIO_IOMMU_MAP_DMA iova %llx vaddr %llx size %llx\n",
		       map.iova, map.vaddr, map.size);

		if (map.argsz < minsz || map.flags & ~mask)
			return -EINVAL;

		return vfio_dma_do_map(iommu, &map);
	.................................
}
The parameter passed in:
struct vfio_iommu_type1_dma_map {
	__u32	argsz;
	__u32	flags;
#define VFIO_DMA_MAP_FLAG_READ (1 << 0)		/* readable from device */
#define VFIO_DMA_MAP_FLAG_WRITE (1 << 1)	/* writable from device */
	__u64	vaddr;				/* Process virtual address */
	__u64	iova;				/* IO virtual address */
	__u64	size;				/* Size of mapping (bytes) */
};
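Filled in from userspace it looks like the following minimal sketch (container_fd and buf are placeholder names; the full flow appears in the usage example further down):

	/* Sketch: map 1 MiB of anonymous memory at IOVA 0 for device DMA.
	 * container_fd is an open /dev/vfio/vfio fd, buf comes from mmap(). */
	struct vfio_iommu_type1_dma_map map = {
		.argsz = sizeof(map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.vaddr = (__u64)buf,
		.iova  = 0,
		.size  = 1024 * 1024,
	};

	if (ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map))
		perror("VFIO_IOMMU_MAP_DMA");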
Opening /dev/vfio/1:
[ 3015.631677] ===vfio_group_fops_open
[ 3015.635267] CPU: 18 PID: 8276 Comm: cat Kdump: loaded Not tainted 4.19.0l+ #44
[ 3015.642456] Hardware name: PHYTIUM LTD Phytium S2500/64/Phytium S2500/64, BIOS V2.2 Feb 9 2021
[ 3015.651112] Call trace:
[ 3015.653554] dump_backtrace+0x0/0x1c0
[ 3015.657200] show_stack+0x24/0x30
[ 3015.660502] dump_stack+0x9c/0xbc
[ 3015.663810] vfio_group_fops_open+0x38/0x150 [vfio]
[ 3015.668665] chrdev_open+0xcc/0x238
[ 3015.672138] do_dentry_open+0x11c/0x370
[ 3015.675956] vfs_open+0x38/0x48
[ 3015.679082] do_last+0x23c/0x830
[ 3015.682295] path_openat+0x88/0x258
[ 3015.685766] do_filp_open+0x88/0x100
[ 3015.689324] do_sys_open+0x180/0x210
[ 3015.692883] __arm64_sys_openat+0x2c/0x38
[ 3015.696874] el0_svc_handler+0xac/0xf8
[ 3015.700605] el0_svc+0x8/0xc
Opening /dev/vfio/vfio:
[ 3040.453592] ===vfop_fops_open
[ 3040.456642] CPU: 19 PID: 8303 Comm: cat Kdump: loaded Not tainted 4.19.0l+ #44
[ 3040.463829] Hardware name: PHYTIUM LTD Phytium S2500/64/Phytium S2500/64, BIOS V2.2 Feb 9 2021
[ 3040.472485] Call trace:
[ 3040.474926] dump_backtrace+0x0/0x1c0
[ 3040.478571] show_stack+0x24/0x30
[ 3040.481873] dump_stack+0x9c/0xbc
[ 3040.485180] vfio_fops_open+0x28/0x94 [vfio]
[ 3040.489432] misc_open+0x114/0x1c0
[ 3040.492819] chrdev_open+0xcc/0x238
[ 3040.496293] do_dentry_open+0x11c/0x370
[ 3040.500110] vfs_open+0x38/0x48
[ 3040.503237] do_last+0x23c/0x830
[ 3040.506449] path_openat+0x88/0x258
[ 3040.509921] do_filp_open+0x88/0x100
[ 3040.513479] do_sys_open+0x180/0x210
[ 3040.517038] __arm64_sys_openat+0x2c/0x38
[ 3040.521028] el0_svc_handler+0xac/0xf8
[ 3040.524759] el0_svc+0x8/0xc
Reference on how to use vfio: "VFIO — exposing DMA mappings to user space" (Song Baohua's blog on CSDN).
The vfio driver:
vfio.c is the core-layer code, used by vfio-pci.
It defines a single global static variable, vfio, which tracks all the vfio_groups, the registered vfio_iommu_drivers, and so on:
static struct vfio {
	struct class			*class;
	struct list_head		iommu_drivers_list;
	struct mutex			iommu_drivers_lock;
	struct list_head		group_list;
	struct idr			group_idr;
	struct mutex			group_lock;
	struct cdev			group_cdev;
	dev_t				group_devt;
	wait_queue_head_t		release_q;
} vfio;
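The group_idr/group_devt/group_cdev fields back the numeric /dev/vfio/<N> nodes. Simplified from vfio_create_group() in vfio.c (4.19 era), each group's node is named after its iommu group id, which is why it matches /sys/kernel/iommu_groups/:

	/* Simplified from vfio_create_group() (vfio.c) */
	minor = vfio_alloc_group_minor(group);	/* idr_alloc() on vfio.group_idr */
	dev = device_create(vfio.class, NULL,
			    MKDEV(MAJOR(vfio.group_devt), minor),
			    group, "%s%d", group->noiommu ? "noiommu-" : "",
			    iommu_group_id(iommu_group));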
Other structures:
struct vfio_container {
	struct kref			kref;
	struct list_head		group_list;
	struct rw_semaphore		group_lock;
	struct vfio_iommu_driver	*iommu_driver;
	void				*iommu_data;
	bool				noiommu;
};
vfio_group wraps an iommu_group. Just as an iommu_group can contain multiple devices, vfio_group keeps them on its device_list:
struct vfio_group {
	struct kref			kref;
	int				minor;
	atomic_t			container_users;
	struct iommu_group		*iommu_group;
	struct vfio_container		*container;
	struct list_head		device_list;
	struct mutex			device_lock;
	struct device			*dev;
	struct notifier_block		nb;
	struct list_head		vfio_next;
	struct list_head		container_next;
	struct list_head		unbound_list;
	struct mutex			unbound_lock;
	atomic_t			opened;
	wait_queue_head_t		container_q;
	bool				noiommu;
	struct kvm			*kvm;
	struct blocking_notifier_head	notifier;
};
The wrapper around a device:
struct vfio_device {
	struct kref			kref;
	struct device			*dev;
	const struct vfio_device_ops	*ops;
	struct vfio_group		*group;
	struct list_head		group_next;
	void				*device_data;
};
vfio-pci.c is the vfio driver for PCI devices. Binding a device to it, for example:
$ lspci -n -s 0000:06:0d.0
06:0d.0 0401: 1102:0002 (rev 08)
# echo 0000:06:0d.0 > /sys/bus/pci/devices/0000:06:0d.0/driver/unbind
# echo 1102 0002 > /sys/bus/pci/drivers/vfio-pci/new_id
Once vfio-pci is bound, vfio_pci_probe() calls vfio_add_group_dev():
int vfio_add_group_dev(struct device *dev,
		       const struct vfio_device_ops *ops, void *device_data)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;	/* vfio_group wraps the iommu_group */
	struct vfio_device *device;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return -EINVAL;

	group = vfio_group_get_from_iommu(iommu_group);	/* look up the vfio_group by iommu_group */
	if (!group) {
		group = vfio_create_group(iommu_group);	/* new iommu_group: create its vfio_group */
		if (IS_ERR(group)) {
			iommu_group_put(iommu_group);
			return PTR_ERR(group);
		}
	} else {
		/*
		 * A found vfio_group already holds a reference to the
		 * iommu_group. A created vfio_group keeps the reference.
		 */
		iommu_group_put(iommu_group);
	}

	/* check whether a vfio_device for this device already exists in the vfio_group */
	device = vfio_group_get_device(group, dev);
	if (device) {
		WARN(1, "Device %s already exists on group %d\n",
		     dev_name(dev), iommu_group_id(iommu_group));
		vfio_device_put(device);
		vfio_group_put(group);
		return -EBUSY;
	}

	/* create a vfio_device for the device and add it to the vfio_group's list */
	device = vfio_group_create_device(group, dev, ops, device_data);
	if (IS_ERR(device)) {
		vfio_group_put(group);
		return PTR_ERR(device);
	}

	/*
	 * Drop all but the vfio_device reference. The vfio_device holds
	 * a reference to the vfio_group, which holds a reference to the
	 * iommu_group.
	 */
	vfio_group_put(group);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_add_group_dev);
Printing the device and the related pointers in this function produced the logs below.
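The debug line was along these lines (a hypothetical reconstruction; note that the 5.10 kernel used here names the function vfio_register_group_dev rather than vfio_add_group_dev):

	/* hypothetical debug print, placed before the final vfio_group_put() */
	printk("===vfio_register_group_dev %s vfio_device %px vfio_group %px iommu_group %px\n",
	       dev_name(dev), device, group, iommu_group);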
[ 509.421557] ===vfio_register_group_dev 0000:0b:00.0 vfio_device ffff00ff90f41000 vfio_group ffff00ff8d85b000 iommu_group ffff00ff8d7dc100
[ 509.453471] ===vfio_register_group_dev 0000:0b:00.1 vfio_device ffff00ff90f45400 vfio_group ffff00ff8d85b400 iommu_group ffff00ff90930100
[ 509.485476] ===vfio_register_group_dev 0000:0b:00.2 vfio_device ffff00ff90f44c00 vfio_group ffff00ff8d85ba00 iommu_group ffff00ff90930400
A process normally owns one container; a container can hold multiple vfio_groups, and they all share the same set of page tables. Walking through the canonical usage example, with annotations:
	int container, group, device, i;
	struct vfio_group_status group_status =
					{ .argsz = sizeof(group_status) };
	struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
	struct vfio_iommu_type1_dma_map dma_map = { .argsz = sizeof(dma_map) };
	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };

	/* Create a new container */
	container = open("/dev/vfio/vfio", O_RDWR);
	/* -> vfio_fops_open() allocates a struct vfio_container */

	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
		/* Unknown API version */

	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
		/* Doesn't support the IOMMU driver we want. */

	/* Open the group */
	group = open("/dev/vfio/26", O_RDWR);
	/* -> vfio_group_fops_open() stores the vfio_group in filep->private_data */

	/* Test the group is viable and available */
	ioctl(group, VFIO_GROUP_GET_STATUS, &group_status);

	if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE))
		/* Group is not viable (ie, not all devices bound for vfio) */

	/* Add the group to the container */
	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
	/* -> vfio_group_set_container() links the vfio_group into the
	 *    vfio_container and sets group->container = container */

	/* Enable the IOMMU model we want */
	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
	/* -> vfio_ioctl_set_iommu() -> vfio_iommu_type1_open() allocates a
	 *    struct vfio_iommu and stores it in the container:
	 *    container->iommu_driver = driver;
	 *    container->iommu_data = data; */

	/* Get addition IOMMU info */
	ioctl(container, VFIO_IOMMU_GET_INFO, &iommu_info);

	/* Allocate some space and setup a DMA mapping */
	dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE,
			     MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
	dma_map.size = 1024 * 1024;
	dma_map.iova = 0; /* 1MB starting at 0x0 from device view */
	dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;

	ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);

	/* Get a file descriptor for the device */
	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
	/* -> the group ioctl returns a device fd, created via
	 *    vfio_device_ops->open (provided by vfio-pci); it serves the
	 *    device-level ioctls below */

	/* Test and setup the device */
	ioctl(device, VFIO_DEVICE_GET_INFO, &device_info);
	/* -> vfio_pci_ioctl() in vfio_pci.c */

	for (i = 0; i < device_info.num_regions; i++) {
		struct vfio_region_info reg = { .argsz = sizeof(reg) };

		reg.index = i;

		ioctl(device, VFIO_DEVICE_GET_REGION_INFO, &reg);

		/* Setup mappings... read/write offsets, mmaps
		 * For PCI devices, config space is a region */
	}

	for (i = 0; i < device_info.num_irqs; i++) {
		struct vfio_irq_info irq = { .argsz = sizeof(irq) };

		irq.index = i;

		ioctl(device, VFIO_DEVICE_GET_IRQ_INFO, &irq);

		/* Setup IRQs... eventfds, VFIO_DEVICE_SET_IRQS */
	}

	/* Gratuitous device reset and go... */
	ioctl(device, VFIO_DEVICE_RESET);
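Teardown is the reverse (a minimal sketch; closing the fds is enough, since each close drops a reference, and the explicit unset is optional):

	/* sketch: release in reverse order of acquisition */
	close(device);					/* drop the device fd */
	ioctl(group, VFIO_GROUP_UNSET_CONTAINER);	/* optional explicit detach */
	close(group);
	close(container);	/* last reference frees the vfio_container */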
The related ioctl commands:
/dev/vfio/vfio: ioctls on the open container fd.
Handled by the core:
VFIO_GET_API_VERSION
VFIO_CHECK_EXTENSION
VFIO_SET_IOMMU
Everything else is forwarded to the vfio_iommu_driver's ops->ioctl:
VFIO_CHECK_EXTENSION
VFIO_IOMMU_GET_INFO
VFIO_IOMMU_MAP_DMA
VFIO_IOMMU_UNMAP_DMA
/dev/vfio/${GROUP}: ioctls on the open group fd.
VFIO_GROUP_GET_STATUS
VFIO_GROUP_SET_CONTAINER — add the group to a container
VFIO_GROUP_UNSET_CONTAINER
VFIO_GROUP_GET_DEVICE_FD — get a device fd for the subsequent device-level ioctls, e.g. reading the PCI config space.
VFIO_GROUP_SET_CONTAINER:
Adds the vfio_group to a container. The handler is vfio_group_set_container():
static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
	struct fd f;
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (atomic_read(&group->container_users))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	f = fdget(container_fd);
	if (!f.file)
		return -EBADF;

	/* Sanity check, is this really our fd? */
	if (f.file->f_op != &vfio_fops) {
		fdput(f);
		return -EINVAL;
	}

	container = f.file->private_data;
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

	down_write(&container->group_lock);

	/* Real groups and fake groups cannot mix */
	if (!list_empty(&container->group_list) &&
	    container->noiommu != group->noiommu) {
		ret = -EPERM;
		goto unlock_out;
	}

	driver = container->iommu_driver;
	if (driver) {	/* non-NULL only after VFIO_SET_IOMMU; calls the
			 * attach_group hook of vfio_iommu_driver_ops */
		ret = driver->ops->attach_group(container->iommu_data,
						group->iommu_group);
		if (ret)
			goto unlock_out;
	}

	group->container = container;
	container->noiommu = group->noiommu;
	list_add(&group->container_next, &container->group_list); /* link into the container's group_list */

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);
	atomic_inc(&group->container_users);

unlock_out:
	up_write(&container->group_lock);
	fdput(f);
	return ret;
}
The vfio_iommu_driver_ops table, from vfio_iommu_type1.c:
static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
	.name			= "vfio-iommu-type1",
	.owner			= THIS_MODULE,
	.open			= vfio_iommu_type1_open,
	.release		= vfio_iommu_type1_release,
	.ioctl			= vfio_iommu_type1_ioctl,
	.attach_group		= vfio_iommu_type1_attach_group,
	.detach_group		= vfio_iommu_type1_detach_group,
	.pin_pages		= vfio_iommu_type1_pin_pages,
	.unpin_pages		= vfio_iommu_type1_unpin_pages,
	.register_notifier	= vfio_iommu_type1_register_notifier,
	.unregister_notifier	= vfio_iommu_type1_unregister_notifier,
};
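This ops table is handed to the vfio core at module load; the type1 module init is essentially just:

	/* from vfio_iommu_type1.c: register with the vfio core, which
	 * appends the driver to vfio.iommu_drivers_list */
	static int __init vfio_iommu_type1_init(void)
	{
		return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
	}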
vfio_iommu_type1_attach_group
The first parameter, iommu_data, is the container's iommu_data, so this attaches one group belonging to that container. The function does the following:
1: allocates a new domain, of type UNMANAGED, for the group's iommu_group;
2: when a later group of the same container attaches, it tries to attach that group's devices to the domain created for the first group and, on success, frees the domain it just allocated. The net result is that all groups (and their devices) in one container share a single domain.
static int vfio_iommu_type1_attach_group(void *iommu_data,
					 struct iommu_group *iommu_group)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_group *group;
	struct vfio_domain *domain, *d;
	struct bus_type *bus = NULL, *mdev_bus;
	int ret;
	bool resv_msi, msi_remap;
	phys_addr_t resv_msi_base;

	mutex_lock(&iommu->lock);

	list_for_each_entry(d, &iommu->domain_list, next) {
		if (find_iommu_group(d, iommu_group)) {
			mutex_unlock(&iommu->lock);
			return -EINVAL;
		}
	}

	if (iommu->external_domain) {
		if (find_iommu_group(iommu->external_domain, iommu_group)) {
			mutex_unlock(&iommu->lock);
			return -EINVAL;
		}
	}

	group = kzalloc(sizeof(*group), GFP_KERNEL);	/* a fresh vfio_group: type1 has its own
							 * private struct vfio_group, distinct
							 * from the one in vfio.c */
	domain = kzalloc(sizeof(*domain), GFP_KERNEL);	/* allocate a vfio_domain */
	if (!group || !domain) {
		ret = -ENOMEM;
		goto out_free;
	}

	group->iommu_group = iommu_group;

	/* Determine bus_type in order to allocate a domain */
	ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
	if (ret)
		goto out_free;

	mdev_bus = symbol_get(mdev_bus_type);

	if (mdev_bus) {		/* not taken in this test setup (no mdev devices) */
		if ((bus == mdev_bus) && !iommu_present(bus)) {
			symbol_put(mdev_bus_type);
			if (!iommu->external_domain) {
				INIT_LIST_HEAD(&domain->group_list);
				iommu->external_domain = domain;
			} else
				kfree(domain);

			list_add(&group->next,
				 &iommu->external_domain->group_list);
			mutex_unlock(&iommu->lock);
			return 0;
		}
		symbol_put(mdev_bus_type);
	}

	domain->domain = iommu_domain_alloc(bus);	/* allocate the new iommu_domain */
	if (!domain->domain) {
		ret = -EIO;
		goto out_free;
	}

	if (iommu->nesting) {
		int attr = 1;

		ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING,
					    &attr);
		if (ret)
			goto out_domain;
	}

	ret = iommu_attach_group(domain->domain, iommu_group);	/* walks the iommu_group's devices and
								 * calls arm_smmu_ops->attach_dev on each */
	if (ret)
		goto out_domain;

	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);

	INIT_LIST_HEAD(&domain->group_list);
	list_add(&group->next, &domain->group_list);

	msi_remap = irq_domain_check_msi_remap() ||
		    iommu_capable(bus, IOMMU_CAP_INTR_REMAP);

	if (!allow_unsafe_interrupts && !msi_remap) {
		pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
			__func__);
		ret = -EPERM;
		goto out_detach;
	}

	if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
		domain->prot |= IOMMU_CACHE;

	/*
	 * Try to match an existing compatible domain. We don't want to
	 * preclude an IOMMU driver supporting multiple bus_types and being
	 * able to include different bus_types in the same IOMMU domain, so
	 * we test whether the domains use the same iommu_ops rather than
	 * testing if they're on the same bus_type.
	 */
	/*
	 * For the container's first group this loop does nothing: the domain
	 * is only added to domain_list at the end. For the second and later
	 * groups, if an existing domain has the same ops and prot, the new
	 * domain is detached and the group attached to the existing one.
	 *   attach: switch the iommu_group's domain to the new domain and
	 *           call attach_dev
	 *   detach: switch the iommu_group's domain back to default_domain
	 *           and call attach_dev
	 */
	list_for_each_entry(d, &iommu->domain_list, next) {
		if (d->domain->ops == domain->domain->ops &&
		    d->prot == domain->prot) {
			iommu_detach_group(domain->domain, iommu_group);
			if (!iommu_attach_group(d->domain, iommu_group)) { /* 0 means success */
				list_add(&group->next, &d->group_list);
				iommu_domain_free(domain->domain);
				kfree(domain);
				mutex_unlock(&iommu->lock);
				return 0;
			}

			/* normally not reached: the new domain is re-attached
			 * only if attaching to the existing one failed */
			ret = iommu_attach_group(domain->domain, iommu_group);
			if (ret)
				goto out_domain;
		}
	}

	vfio_test_domain_fgsp(domain);

	/* replay mappings on new domains */
	ret = vfio_iommu_replay(iommu, domain);
	if (ret)
		goto out_detach;

	if (resv_msi) {
		ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
		if (ret)
			goto out_detach;
	}

	/* add the new domain to the iommu's domain_list */
	list_add(&domain->next, &iommu->domain_list);

	mutex_unlock(&iommu->lock);

	return 0;

out_detach:
	iommu_detach_group(domain->domain, iommu_group);
out_domain:
	iommu_domain_free(domain->domain);
out_free:
	kfree(domain);
	kfree(group);
	mutex_unlock(&iommu->lock);
	return ret;
}
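For reference, iommu_attach_group() itself (simplified from drivers/iommu/iommu.c of this era) just iterates the group's devices and hands each one to the driver's attach_dev, which is where the arm_smmu_attach_dev frames in the traces above come from:

	/* simplified from drivers/iommu/iommu.c (~4.19/5.10) */
	static int iommu_group_do_attach_device(struct device *dev, void *data)
	{
		struct iommu_domain *domain = data;

		return __iommu_attach_device(domain, dev);	/* -> domain->ops->attach_dev() */
	}

	static int __iommu_attach_group(struct iommu_domain *domain,
					struct iommu_group *group)
	{
		int ret;

		if (group->default_domain && group->domain != group->default_domain)
			return -EBUSY;	/* already attached to a non-default domain */

		ret = __iommu_group_for_each_dev(group, domain,
						 iommu_group_do_attach_device);
		if (ret == 0)
			group->domain = domain;

		return ret;
	}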