和内核一样,qemu也需要支持virtqueue,VRing相关结构体以及VirtQueue的定义如下:
/* Alignment constant for virtio-PCI vrings; presumably in bytes -- TODO confirm (not referenced in this excerpt). */
#define VIRTIO_PCI_VRING_ALIGN 4096
/*
 * One entry of the descriptor table.
 *
 * The vring_desc_*() accessors read individual fields of entry i straight
 * from guest memory using sizeof(VRingDesc) and offsetof(), so the field
 * order and widths here define the in-memory layout and must not change.
 */
typedef struct VRingDesc
{
uint64_t addr; /* buffer address; stored by virtqueue_pop into in_addr/out_addr */
uint32_t len; /* buffer length; becomes iov_len in virtqueue_pop */
uint16_t flags; /* tested against VRING_DESC_F_INDIRECT / VRING_DESC_F_WRITE */
uint16_t next; /* read by vring_desc_next(); presumably the index of the chained descriptor */
} VRingDesc;
/*
 * Header of the available ring, followed in guest memory by a run of
 * 16-bit entries.
 *
 * The struct is only used for offsetof() computations by the
 * vring_avail_*() accessors. vring_used_event() additionally reads the
 * slot at index vring.num, i.e. the 16-bit word just past the regular
 * entries.
 */
typedef struct VRingAvail
{
    uint16_t flags;
    uint16_t idx;
    /* C99 flexible array member (replaces the GNU zero-length ring[0]);
     * size and member offsets of the struct are unchanged. */
    uint16_t ring[];
} VRingAvail;
/*
 * One entry of the used ring. virtqueue_fill() writes both fields through
 * vring_used_ring_id() / vring_used_ring_len().
 */
typedef struct VRingUsedElem
{
uint32_t id; /* set to elem->index (head descriptor index) by virtqueue_fill */
uint32_t len; /* set to the len argument of virtqueue_fill */
} VRingUsedElem;
/*
 * Header of the used ring, followed in guest memory by a run of
 * VRingUsedElem entries.
 *
 * Only used for offsetof() computations by the vring_used_*() accessors.
 */
typedef struct VRingUsed
{
    uint16_t flags;
    uint16_t idx;
    /* C99 flexible array member (replaces the GNU zero-length ring[0]);
     * size and member offsets of the struct are unchanged. */
    VRingUsedElem ring[];
} VRingUsed;
/*
 * Per-queue vring metadata: the ring size plus the base addresses that the
 * accessor helpers add field offsets to.
 */
typedef struct VRing
{
unsigned int num; /* ring size; used as the descriptor limit and as the modulus for used-ring slots */
unsigned int align; /* not referenced in this excerpt; presumably the vring alignment -- TODO confirm */
hwaddr desc; /* base address of the descriptor table */
hwaddr avail; /* base address of the available ring (VRingAvail) */
hwaddr used; /* base address of the used ring (VRingUsed) */
} VRing;
/*
 * Device-side state of one virtqueue.
 */
struct VirtQueue
{
VRing vring; /* vring metadata */
hwaddr pa; /* actual memory address of the vring */
/* Next avail-ring position to consume; post-incremented by virtqueue_pop */
uint16_t last_avail_idx;
/* Last used index value we have signalled on */
uint16_t signalled_used;
/* Whether signalled_used currently holds a valid value (presumably);
 * virtqueue_flush may reset this to false */
bool signalled_used_valid;
/* Notification enabled? */
bool notification;
uint16_t queue_index;
/* Incremented by virtqueue_pop, decreased by count in virtqueue_flush */
int inuse;
uint16_t vector;
/* Callback taking the device and this queue; presumably invoked when the
 * guest makes buffers available -- TODO confirm against the caller */
void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq);
/* Owning device; passed to every guest-memory accessor */
VirtIODevice *vdev;
EventNotifier guest_notifier;
EventNotifier host_notifier;
};
可以看出,VRing相关结构体的定义在qemu和内核之间保持了ABI上的一致。virtqueue_init用于初始化vring的元数据,同时qemu提供了一系列接口来读写vring的不同成员,例如:
/*
 * Read the 64-bit addr field of descriptor i from the descriptor table
 * located at desc_pa.
 */
static inline uint64_t vring_desc_addr(VirtIODevice *vdev, hwaddr desc_pa,
                                       int i)
{
    const hwaddr entry = desc_pa + sizeof(VRingDesc) * i;

    return virtio_ldq_phys(vdev, entry + offsetof(VRingDesc, addr));
}
/*
 * Read the 32-bit len field of descriptor i from the descriptor table
 * located at desc_pa.
 */
static inline uint32_t vring_desc_len(VirtIODevice *vdev, hwaddr desc_pa,
                                      int i)
{
    const hwaddr entry = desc_pa + sizeof(VRingDesc) * i;

    return virtio_ldl_phys(vdev, entry + offsetof(VRingDesc, len));
}
/*
 * Read the 16-bit flags field of descriptor i from the descriptor table
 * located at desc_pa.
 */
static inline uint16_t vring_desc_flags(VirtIODevice *vdev, hwaddr desc_pa,
                                        int i)
{
    const hwaddr entry = desc_pa + sizeof(VRingDesc) * i;

    return virtio_lduw_phys(vdev, entry + offsetof(VRingDesc, flags));
}
/*
 * Read the 16-bit next field of descriptor i from the descriptor table
 * located at desc_pa.
 */
static inline uint16_t vring_desc_next(VirtIODevice *vdev, hwaddr desc_pa,
                                       int i)
{
    const hwaddr entry = desc_pa + sizeof(VRingDesc) * i;

    return virtio_lduw_phys(vdev, entry + offsetof(VRingDesc, next));
}
/* Read the flags word of the queue's available ring. */
static inline uint16_t vring_avail_flags(VirtQueue *vq)
{
    const hwaddr flags_pa = vq->vring.avail + offsetof(VRingAvail, flags);

    return virtio_lduw_phys(vq->vdev, flags_pa);
}
/* Read the idx word of the queue's available ring. */
static inline uint16_t vring_avail_idx(VirtQueue *vq)
{
    const hwaddr idx_pa = vq->vring.avail + offsetof(VRingAvail, idx);

    return virtio_lduw_phys(vq->vdev, idx_pa);
}
/* Read the 16-bit entry stored in slot i of the queue's available ring. */
static inline uint16_t vring_avail_ring(VirtQueue *vq, int i)
{
    const hwaddr slot_pa = vq->vring.avail + offsetof(VRingAvail, ring[i]);

    return virtio_lduw_phys(vq->vdev, slot_pa);
}
/*
 * Read the used_event value kept in the available ring: it lives in the
 * slot at index vring.num, i.e. right after the regular ring entries.
 */
static inline uint16_t vring_used_event(VirtQueue *vq)
{
    const unsigned int past_end = vq->vring.num;

    return vring_avail_ring(vq, past_end);
}
/* Store val into the id field of used-ring element i. */
static inline void vring_used_ring_id(VirtQueue *vq, int i, uint32_t val)
{
    const hwaddr id_pa = vq->vring.used + offsetof(VRingUsed, ring[i].id);

    virtio_stl_phys(vq->vdev, id_pa, val);
}
/* Store val into the len field of used-ring element i. */
static inline void vring_used_ring_len(VirtQueue *vq, int i, uint32_t val)
{
    const hwaddr len_pa = vq->vring.used + offsetof(VRingUsed, ring[i].len);

    virtio_stl_phys(vq->vdev, len_pa, val);
}
/* Read the idx word of the queue's used ring. */
static uint16_t vring_used_idx(VirtQueue *vq)
{
    const hwaddr idx_pa = vq->vring.used + offsetof(VRingUsed, idx);

    return virtio_lduw_phys(vq->vdev, idx_pa);
}
/* Overwrite the idx word of the queue's used ring with val. */
static inline void vring_used_idx_set(VirtQueue *vq, uint16_t val)
{
    const hwaddr idx_pa = vq->vring.used + offsetof(VRingUsed, idx);

    virtio_stw_phys(vq->vdev, idx_pa, val);
}
/*
 * Set the bits given by mask in the used ring's flags word
 * (read the current value, OR in mask, write it back).
 */
static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask)
{
    const hwaddr flags_pa = vq->vring.used + offsetof(VRingUsed, flags);

    virtio_stw_phys(vq->vdev, flags_pa,
                    virtio_lduw_phys(vq->vdev, flags_pa) | mask);
}
/*
 * Clear the bits given by mask in the used ring's flags word
 * (read the current value, AND with ~mask, write it back).
 */
static inline void vring_used_flags_unset_bit(VirtQueue *vq, int mask)
{
    const hwaddr flags_pa = vq->vring.used + offsetof(VRingUsed, flags);

    virtio_stw_phys(vq->vdev, flags_pa,
                    virtio_lduw_phys(vq->vdev, flags_pa) & ~mask);
}
同时后端也提供了一系列接口来处理avail ring和used ring,例如:
virtqueue_pop主要用于从descriptor table中找到available ring中添加的buffer,即guest新添加并让后端处理的buffer
/*
 * Take the next buffer chain the guest has made available and describe it
 * in *elem.
 *
 * vq:   queue to pop from; last_avail_idx is advanced and inuse is
 *       incremented when an element is returned.
 * elem: filled with the chain's addresses/lengths (in_* for descriptors
 *       flagged VRING_DESC_F_WRITE, out_* for the others) and with the
 *       head descriptor index in elem->index.
 *
 * Returns 0 when virtqueue_num_heads() reports nothing pending, otherwise
 * the total number of descriptors collected (in_num + out_num).
 * Malformed rings are fatal: the function reports the error and exit(1)s.
 */
int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem)
{
    unsigned int i, head, max;
    hwaddr desc_pa = vq->vring.desc;
    VirtIODevice *vdev = vq->vdev;

    /* Nothing new since last_avail_idx: no buffer to process. */
    if (!virtqueue_num_heads(vq, vq->last_avail_idx))
        return 0;

    /* When we start there are none of either input nor output. */
    elem->out_num = elem->in_num = 0;

    max = vq->vring.num;

    /* Head descriptor index for the avail-ring slot being consumed. */
    i = head = virtqueue_get_head(vq, vq->last_avail_idx++);

    /* If the guest enabled VIRTIO_RING_F_EVENT_IDX, publish the current
     * avail idx through vring_avail_event(). */
    if (vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX)) {
        vring_avail_event(vq, vring_avail_idx(vq));
    }

    if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_INDIRECT) {
        /* An indirect descriptor's len must be a whole number of
         * VRingDesc entries. */
        if (vring_desc_len(vdev, desc_pa, i) % sizeof(VRingDesc)) {
            error_report("Invalid size for indirect buffer table");
            exit(1);
        }

        /* loop over the indirect descriptor table */
        max = vring_desc_len(vdev, desc_pa, i) / sizeof(VRingDesc);
        desc_pa = vring_desc_addr(vdev, desc_pa, i);
        i = 0;
    }

    /* Collect all the descriptors */
    do {
        struct iovec *sg;

        if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_WRITE) {
            if (elem->in_num >= ARRAY_SIZE(elem->in_sg)) {
                error_report("Too many write descriptors in indirect table");
                exit(1);
            }
            elem->in_addr[elem->in_num] = vring_desc_addr(vdev, desc_pa, i);
            sg = &elem->in_sg[elem->in_num++];
        } else {
            if (elem->out_num >= ARRAY_SIZE(elem->out_sg)) {
                error_report("Too many read descriptors in indirect table");
                exit(1);
            }
            elem->out_addr[elem->out_num] = vring_desc_addr(vdev, desc_pa, i);
            sg = &elem->out_sg[elem->out_num++];
        }

        /* Only the length is recorded here; the address was stored in
         * in_addr/out_addr above and iov_base is left to the mapping step. */
        sg->iov_len = vring_desc_len(vdev, desc_pa, i);

        /* If we've got too many, that implies a descriptor loop. */
        if ((elem->in_num + elem->out_num) > max) {
            error_report("Looped descriptor");
            exit(1);
        }
    } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max);

    /* Now map what we have collected (virtqueue_map_sg is defined
     * elsewhere; presumably it fills in each sg's iov_base). */
    virtqueue_map_sg(elem->in_sg, elem->in_addr, elem->in_num, 1);
    virtqueue_map_sg(elem->out_sg, elem->out_addr, elem->out_num, 0);

    elem->index = head;

    vq->inuse++;

    trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num);
    return elem->in_num + elem->out_num;
}
virtqueue_fill用于在virtio host端(qemu/vhost)处理完guest放入avail ring中的buffer之后,解除buffer的内存映射,并把它放入used ring
/*
 * Release the mappings of a processed element and record it in the used
 * ring.
 *
 * vq:   queue the element belongs to.
 * elem: element being returned.
 * len:  length written to the used entry; it also bounds the access
 *       length passed when unmapping the in buffers.
 * idx:  offset added to the current used idx to choose the slot.
 *
 * The used ring's idx word itself is not modified here.
 */
void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem,
                    unsigned int len, unsigned int idx)
{
    unsigned int consumed = 0;
    int n;

    trace_virtqueue_fill(vq, elem, len, idx);

    /* In buffers: each one is unmapped with whatever part of len is still
     * unaccounted for, capped at the buffer's own length. */
    for (n = 0; n < elem->in_num; n++) {
        size_t chunk = MIN(len - consumed, elem->in_sg[n].iov_len);

        cpu_physical_memory_unmap(elem->in_sg[n].iov_base,
                                  elem->in_sg[n].iov_len,
                                  1, chunk);
        consumed += chunk;
    }

    /* Out buffers: unmapped with their full length. */
    for (n = 0; n < elem->out_num; n++) {
        cpu_physical_memory_unmap(elem->out_sg[n].iov_base,
                                  elem->out_sg[n].iov_len,
                                  0, elem->out_sg[n].iov_len);
    }

    /* Slot = (current used idx + idx) modulo the ring size. */
    idx = (idx + vring_used_idx(vq)) % vq->vring.num;

    /* Fill the slot: id is the element's head descriptor index. */
    vring_used_ring_id(vq, idx, elem->index);
    vring_used_ring_len(vq, idx, len);
}
virtqueue_flush用于更新used ring的idx
/*
 * Advance the used ring's idx by count and reduce vq->inuse by the same
 * amount.
 *
 * If the distance from signalled_used to the new idx (taken as a signed
 * 16-bit value) is smaller than the distance just advanced,
 * signalled_used_valid is reset to false.
 */
void virtqueue_flush(VirtQueue *vq, unsigned int count)
{
    uint16_t prev_idx, next_idx;

    /* Make sure buffer is written before we update index. */
    smp_wmb();

    trace_virtqueue_flush(vq, count);

    prev_idx = vring_used_idx(vq);
    next_idx = prev_idx + count;
    vring_used_idx_set(vq, next_idx);

    vq->inuse -= count;

    if (unlikely((int16_t)(next_idx - vq->signalled_used) <
                 (uint16_t)(next_idx - prev_idx))) {
        vq->signalled_used_valid = false;
    }
}