vhost is another virtio scheme: it bypasses qemu and removes the context-switch overhead between qemu and the kernel, which is especially noticeable for network I/O. vhost currently comes in two flavors, kernel-space and user-space; this article focuses on the kernel-space vhost.
The vhost kernel module handles only the data plane; the control plane remains in qemu. vhost's data structures are as follows:
struct vhost_dev {
MemoryListener memory_listener; /* MemoryListener is the set of callbacks for physical-memory operations */
struct vhost_memory *mem;
int n_mem_sections;
MemoryRegionSection *mem_sections;
struct vhost_virtqueue *vqs; /* array of vhost_virtqueues; nvqs is its length */
int nvqs;
/* the first virtqueue which would be used by this vhost dev */
int vq_index;
unsigned long long features; /* features supported by the vhost device */
unsigned long long acked_features; /* features acked by the guest */
unsigned long long backend_features; /* features supported by the backend, e.g. a tap device */
bool started;
bool log_enabled;
vhost_log_chunk_t *log;
unsigned long long log_size;
Error *migration_blocker;
bool force;
bool memory_changed;
hwaddr mem_changed_start_addr;
hwaddr mem_changed_end_addr;
const VhostOps *vhost_ops; /* VhostOps has different implementations for the kernel and user variants of vhost; the kernel one ends up issuing ioctls */
void *opaque;
};
struct vhost_virtqueue {
int kick;
int call;
void *desc;
void *avail;
void *used;
int num;
unsigned long long used_phys;
unsigned used_size;
void *ring;
unsigned long long ring_phys;
unsigned ring_size;
EventNotifier masked_notifier;
};
vhost's memory layout is likewise made up of a set of vhost_memory_regions:
struct vhost_memory_region {
__u64 guest_phys_addr;
__u64 memory_size; /* bytes */
__u64 userspace_addr;
__u64 flags_padding; /* No flags are currently specified. */
};
/* All region addresses and sizes must be 4K aligned. */
#define VHOST_PAGE_SIZE 0x1000
struct vhost_memory {
__u32 nregions;
__u32 padding;
struct vhost_memory_region regions[0];
};
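As a hedged illustration of the userspace side (this is not qemu code; vhost_fd, ram_ptr and ram_size are placeholders), a one-region table would be built and handed to the kernel via the VHOST_SET_MEM_TABLE ioctl roughly like this:
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>
static int set_mem_table(int vhost_fd, void *ram_ptr, uint64_t ram_size)
{
    /* one region: guest physical [0, ram_size) maps to ram_ptr in our address space */
    size_t sz = offsetof(struct vhost_memory, regions) +
                sizeof(struct vhost_memory_region);
    struct vhost_memory *mem = calloc(1, sz);
    int r;
    if (!mem)
        return -1;
    mem->nregions = 1;
    mem->regions[0].guest_phys_addr = 0;
    mem->regions[0].memory_size = ram_size;
    mem->regions[0].userspace_addr = (uint64_t)(uintptr_t)ram_ptr;
    r = ioctl(vhost_fd, VHOST_SET_MEM_TABLE, mem);
    free(mem);
    return r;
}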
The vhost control plane is driven by qemu, which operates the vhost_xxx kernel module through ioctls, e.g.
long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg)
{
void __user *argp = (void __user *)arg;
struct file *eventfp, *filep = NULL;
struct eventfd_ctx *ctx = NULL;
u64 p;
long r;
int i, fd;
/* If you are not the owner, you can become one */
if (ioctl == VHOST_SET_OWNER) {
r = vhost_dev_set_owner(d);
goto done;
}
/* You must be the owner to do anything else */
r = vhost_dev_check_owner(d);
if (r)
goto done;
switch (ioctl) {
case VHOST_SET_MEM_TABLE:
r = vhost_set_memory(d, argp);
break;
...
default:
r = vhost_set_vring(d, ioctl, argp);
break;
}
done:
return r;
}
VHOST_SET_OWNER associates the current guest's qemu process with a vhost kernel thread:
/* Caller should have device mutex */
static long vhost_dev_set_owner(struct vhost_dev *dev)
{
struct task_struct *worker;
int err;
/* Is there an owner already? */
if (dev->mm) {
err = -EBUSY;
goto err_mm;
}
/* No owner, become one */
dev->mm = get_task_mm(current); /* grab the qemu process's mm_struct, i.e. the guest's memory layout */
worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid); /* create the vhost worker thread */
if (IS_ERR(worker)) {
err = PTR_ERR(worker);
goto err_worker;
}
dev->worker = worker;
wake_up_process(worker); /* avoid contributing to loadavg */
err = vhost_attach_cgroups(dev);
if (err)
goto err_cgroup;
err = vhost_dev_alloc_iovecs(dev); /* allocate iovec storage for the vhost_virtqueues */
if (err)
goto err_cgroup;
return 0;
err_cgroup:
kthread_stop(worker);
dev->worker = NULL;
err_worker:
if (dev->mm)
mmput(dev->mm);
dev->mm = NULL;
err_mm:
return err;
}
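From userspace this handshake is minimal; a sketch (error handling trimmed):
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>
static int vhost_open_and_own(void)
{
    int vhost_fd = open("/dev/vhost-net", O_RDWR);
    if (vhost_fd < 0)
        return -1;
    /* VHOST_SET_OWNER takes no argument; it records current->mm and
     * spawns the vhost-<pid> worker thread shown above */
    if (ioctl(vhost_fd, VHOST_SET_OWNER, NULL) < 0)
        return -1;
    return vhost_fd;
}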
VHOST_SET_MEM_TABLE initializes the vhost_memory member of vhost_dev:
static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
{
struct vhost_memory mem, *newmem, *oldmem;
unsigned long size = offsetof(struct vhost_memory, regions);
if (copy_from_user(&mem, m, size))
return -EFAULT;
if (mem.padding)
return -EOPNOTSUPP;
if (mem.nregions > VHOST_MEMORY_MAX_NREGIONS)
return -E2BIG;
newmem = kmalloc(size + mem.nregions * sizeof *m->regions, GFP_KERNEL); /* allocate the vhost_memory_region array */
if (!newmem)
return -ENOMEM;
memcpy(newmem, &mem, size);
if (copy_from_user(newmem->regions, m->regions,
mem.nregions * sizeof *m->regions)) {
kfree(newmem);
return -EFAULT;
}
if (!memory_access_ok(d, newmem, vhost_has_feature(d, VHOST_F_LOG_ALL))) {
kfree(newmem);
return -EFAULT;
}
oldmem = d->memory;
rcu_assign_pointer(d->memory, newmem);
synchronize_rcu();
kfree(oldmem);
return 0;
}
VHOST_GET_FEATURES and VHOST_SET_FEATURES read/write the features vhost supports; currently only the vhost_net module uses them:
enum {
VHOST_FEATURES = (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
(1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
(1ULL << VIRTIO_RING_F_EVENT_IDX) |
(1ULL << VHOST_F_LOG_ALL) |
(1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
(1ULL << VIRTIO_NET_F_MRG_RXBUF),
};
static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
unsigned long arg)
{
....
case VHOST_GET_FEATURES:
features = VHOST_FEATURES;
if (copy_to_user(featurep, &features, sizeof features))
return -EFAULT;
return 0;
case VHOST_SET_FEATURES:
if (copy_from_user(&features, featurep, sizeof features))
return -EFAULT;
if (features & ~VHOST_FEATURES)
return -EOPNOTSUPP;
return vhost_net_set_features(n, features);
....
}
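The userspace half of the negotiation is just a pair of ioctls; a sketch (vhost_fd as opened earlier, headers as in the previous sketches):
static int negotiate_features(int vhost_fd)
{
    uint64_t features;
    if (ioctl(vhost_fd, VHOST_GET_FEATURES, &features) < 0) /* what the module offers */
        return -1;
    features &= ~(1ULL << VHOST_F_LOG_ALL); /* e.g. drop dirty-page logging if unwanted */
    return ioctl(vhost_fd, VHOST_SET_FEATURES, &features); /* what we will actually use */
}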
VHOST_SET_VRING_CALL sets up the irqfd through which interrupts are injected into the guest.
VHOST_SET_VRING_KICK sets up the ioeventfd through which guest notifications arrive:
case VHOST_SET_VRING_KICK:
if (copy_from_user(&f, argp, sizeof f)) {
r = -EFAULT;
break;
}
eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
if (IS_ERR(eventfp)) {
r = PTR_ERR(eventfp);
break;
}
if (eventfp != vq->kick) { /* eventfp differs from vq->kick: stop polling vq->kick and start polling eventfp */
pollstop = filep = vq->kick;
pollstart = vq->kick = eventfp;
} else
filep = eventfp; /* identical fds, no stop & start needed */
break;
case VHOST_SET_VRING_CALL:
if (copy_from_user(&f, argp, sizeof f)) {
r = -EFAULT;
break;
}
eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
if (IS_ERR(eventfp)) {
r = PTR_ERR(eventfp);
break;
}
if (eventfp != vq->call) { /* eventfp differs from vq->call: stop vq->call and start eventfp */
filep = vq->call;
ctx = vq->call_ctx;
vq->call = eventfp;
vq->call_ctx = eventfp ?
eventfd_ctx_fileget(eventfp) : NULL;
} else
filep = eventfp;
break;
if (pollstop && vq->handle_kick)
vhost_poll_stop(&vq->poll);
if (ctx)
eventfd_ctx_put(ctx); /* after pollstop, release the previously held ctx */
if (filep)
fput(filep); /* after pollstop, release the previously held filep */
if (pollstart && vq->handle_kick)
vhost_poll_start(&vq->poll, vq->kick);
mutex_unlock(&vq->mutex);
if (pollstop && vq->handle_kick)
vhost_poll_flush(&vq->poll);
return r;
Now let's look at the vhost data path. vhost and the kvm module communicate through eventfds: kick events in the guest-to-host direction are delivered via ioeventfd, and call events in the host-to-guest direction via irqfd.
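On the qemu side both fds are handed to vhost through these ioctls; a condensed, illustrative sketch (in qemu of this era the kick fd is set in vhost_virtqueue_start, while the call fd initially points at the masked notifier set up in vhost_virtqueue_init):
struct vhost_vring_file file = { .index = idx };
/* kick: the ioeventfd that kvm signals when the guest writes the queue-notify register */
file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_KICK, &file);
/* call: the eventfd vhost signals; with KVM_IRQFD it becomes an irqfd and kvm
 * injects the interrupt directly, otherwise qemu reads it and raises the interrupt itself */
file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_CALL, &file);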
Host-to-guest direction
The host first fills the used ring; then, provided KVM_IRQFD was set up successfully, the kvm module injects the interrupt into the guest through the irqfd. qemu configures the kvm module's irqfd via virtio_pci_set_guest_notifiers -> kvm_virtio_pci_vector_use -> kvm_virtio_pci_irqfd_use -> kvm_irqchip_add_irqfd_notifier -> kvm_irqchip_assign_irqfd, which ultimately calls kvm_vm_ioctl; the irqfd carries a write fd and an optional read fd:
static int kvm_virtio_pci_vector_use(VirtIOPCIProxy *proxy, int nvqs)
{
PCIDevice *dev = &proxy->pci_dev;
VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
unsigned int vector;
int ret, queue_no;
MSIMessage msg;
for (queue_no = 0; queue_no < nvqs; queue_no++) {
if (!virtio_queue_get_num(vdev, queue_no)) {
break;
}
vector = virtio_queue_vector(vdev, queue_no);
if (vector >= msix_nr_vectors_allocated(dev)) {
continue;
}
msg = msix_get_message(dev, vector);
ret = kvm_virtio_pci_vq_vector_use(proxy, queue_no, vector, msg);
if (ret < 0) {
goto undo;
}
/* If guest supports masking, set up irqfd now.
* Otherwise, delay until unmasked in the frontend.
*/
if (k->guest_notifier_mask) {
ret = kvm_virtio_pci_irqfd_use(proxy, queue_no, vector);
if (ret < 0) {
kvm_virtio_pci_vq_vector_release(proxy, vector);
goto undo;
}
}
}
return 0;
undo:
while (--queue_no >= 0) {
vector = virtio_queue_vector(vdev, queue_no);
if (vector >= msix_nr_vectors_allocated(dev)) {
continue;
}
if (k->guest_notifier_mask) {
kvm_virtio_pci_irqfd_release(proxy, queue_no, vector);
}
kvm_virtio_pci_vq_vector_release(proxy, vector);
}
return ret;
}
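kvm_irqchip_assign_irqfd itself boils down to filling in a struct kvm_irqfd and issuing the KVM_IRQFD ioctl; a condensed sketch, where rfd is the optional read (resample) fd mentioned above:
struct kvm_irqfd irqfd = {
    .fd = fd,     /* the write fd: signaling it injects gsi into the guest */
    .gsi = virq,
    .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
};
if (rfd != -1) {  /* the read fd: lets the guest's EOI re-arm a level-triggered source */
    irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
    irqfd.resamplefd = rfd;
}
return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);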
If no irqfd has been set up, the guest-notifier fd instead wakes the qemu process waiting on it; qemu enters the registered handler virtio_queue_guest_notifier_read, which calls virtio_irq and finally virtio_pci_notify:
static void virtio_queue_guest_notifier_read(EventNotifier *n)
{
VirtQueue *vq = container_of(n, VirtQueue, guest_notifier);
if (event_notifier_test_and_clear(n)) {
virtio_irq(vq);
}
}
void virtio_irq(VirtQueue *vq)
{
trace_virtio_irq(vq);
vq->vdev->isr |= 0x01;
virtio_notify_vector(vq->vdev, vq->vector);
}
static void virtio_notify_vector(VirtIODevice *vdev, uint16_t vector)
{
BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
if (k->notify) {
k->notify(qbus->parent, vector);
}
}
static void virtio_pci_notify(DeviceState *d, uint16_t vector)
{
VirtIOPCIProxy *proxy = to_virtio_pci_proxy_fast(d);
if (msix_enabled(&proxy->pci_dev))
msix_notify(&proxy->pci_dev, vector);
else {
VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
pci_set_irq(&proxy->pci_dev, vdev->isr & 1);
}
}
The whole process is illustrated in the figure below (taken from http://royluo.org/2014/08/22/vhost/).
Guest-to-host direction
To notify the host, the guest writes the device's queue-notify register in its PCI I/O space; this causes a VMEXIT, which kvm intercepts and converts into a notification on the registered fd:
kvm_init:
memory_listener_register(&kvm_memory_listener, &address_space_memory);
memory_listener_register(&kvm_io_listener, &address_space_io);
static MemoryListener kvm_memory_listener = {
.region_add = kvm_region_add,
.region_del = kvm_region_del,
.log_start = kvm_log_start,
.log_stop = kvm_log_stop,
.log_sync = kvm_log_sync,
.log_global_start = kvm_log_global_start,
.log_global_stop = kvm_log_global_stop,
.eventfd_add = kvm_mem_ioeventfd_add,
.eventfd_del = kvm_mem_ioeventfd_del,
.coalesced_mmio_add = kvm_coalesce_mmio_region,
.coalesced_mmio_del = kvm_uncoalesce_mmio_region,
.priority = 10,
};
static MemoryListener kvm_io_listener = {
.eventfd_add = kvm_io_ioeventfd_add,
.eventfd_del = kvm_io_ioeventfd_del,
.priority = 10,
};
static void kvm_io_ioeventfd_add(MemoryListener *listener,
MemoryRegionSection *section,
bool match_data, uint64_t data,
EventNotifier *e)
{
int fd = event_notifier_get_fd(e);
int r;
r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
data, true, int128_get64(section->size),
match_data);
if (r < 0) {
fprintf(stderr, "%s: error adding ioeventfd: %s\n",
__func__, strerror(-r));
abort();
}
}
kvm_io_ioeventfd_add ends up calling kvm_set_ioeventfd_pio, which issues kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick) and enters kvm.ko:
static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
bool assign, uint32_t size, bool datamatch)
{
struct kvm_ioeventfd kick = {
.datamatch = datamatch ? val : 0,
.addr = addr,
.flags = KVM_IOEVENTFD_FLAG_PIO,
.len = size,
.fd = fd,
};
int r;
if (!kvm_enabled()) {
return -ENOSYS;
}
if (datamatch) {
kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
}
if (!assign) {
kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
}
r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
if (r < 0) {
return r;
}
return 0;
}
The KVM_IOEVENTFD ioctl lands in kvm's kvm_ioeventfd function, which dispatches to kvm_assign_ioeventfd or kvm_deassign_ioeventfd:
int
kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
return kvm_deassign_ioeventfd(kvm, args);
return kvm_assign_ioeventfd(kvm, args);
}
static int
kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
struct _ioeventfd *p; /* ioeventfd: translate a PIO/MMIO memory write to an eventfd signal. */
struct eventfd_ctx *eventfd; /* mostly wait_queue_head_t */
int ret;
/* must be natural-word sized */
switch (args->len) {
case 1:
case 2:
case 4:
case 8:
break;
default:
return -EINVAL;
}
/* check for range overflow */
if (args->addr + args->len < args->addr)
return -EINVAL;
/* check for extra flags that we don't understand */
if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
return -EINVAL;
eventfd = eventfd_ctx_fdget(args->fd); /* file->private_data */
if (IS_ERR(eventfd))
return PTR_ERR(eventfd);
p = kzalloc(sizeof(*p), GFP_KERNEL); /* allocate an _ioeventfd and bind the address, length and eventfd_ctx to it */
if (!p) {
ret = -ENOMEM;
goto fail;
}
INIT_LIST_HEAD(&p->list);
p->addr = args->addr;
p->length = args->len;
p->eventfd = eventfd;
/* The datamatch feature is optional, otherwise this is a wildcard */
if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
p->datamatch = args->datamatch;
else
p->wildcard = true;
mutex_lock(&kvm->slots_lock);
/* Verify that there isn't a match already */
if (ioeventfd_check_collision(kvm, p)) {
ret = -EEXIST;
goto unlock_fail;
}
kvm_iodevice_init(&p->dev, &ioeventfd_ops);
ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev); /* register on kvm's PIO or MMIO bus */
if (ret < 0)
goto unlock_fail;
list_add_tail(&p->list, &kvm->ioeventfds); /* add to kvm.ko's ioeventfds list */
mutex_unlock(&kvm->slots_lock);
return 0;
unlock_fail:
mutex_unlock(&kvm->slots_lock);
fail:
kfree(p);
eventfd_ctx_put(eventfd);
return ret;
}
In kvm_assign_ioeventfd, a pio/mmio address range is registered together with an fd; from then on, a VMEXIT caused by a write to that range is converted inside kvm.ko into an event notification on the fd:
static const struct kvm_io_device_ops ioeventfd_ops = {
.write = ioeventfd_write,
.destructor = ioeventfd_destructor,
};
/* MMIO/PIO writes trigger an event if the addr/val match */
static int
ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
const void *val)
{
struct _ioeventfd *p = to_ioeventfd(this);
if (!ioeventfd_in_range(p, addr, len, val))
return -EOPNOTSUPP;
eventfd_signal(p->eventfd, 1);
return 0;
}
The event notification finally goes through eventfd_signal, which wakes the vhost thread; the overall flow is shown in the figure below.
The vhost control plane and data plane are shown in the figure below.
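Before moving on, here is roughly what the worker thread created by VHOST_SET_OWNER runs (simplified from drivers/vhost/vhost.c; the flush/sequence bookkeeping is omitted). It sleeps until vhost_poll_wakeup queues a vhost_work, then invokes the work's fn callback:
static int vhost_worker(void *data)
{
    struct vhost_dev *dev = data;
    struct vhost_work *work;
    use_mm(dev->mm); /* run inside the qemu process's address space */
    for (;;) {
        set_current_state(TASK_INTERRUPTIBLE);
        if (kthread_should_stop())
            break;
        spin_lock_irq(&dev->work_lock);
        if (!list_empty(&dev->work_list)) {
            work = list_first_entry(&dev->work_list,
                                    struct vhost_work, node);
            list_del_init(&work->node);
        } else
            work = NULL;
        spin_unlock_irq(&dev->work_lock);
        if (work) {
            __set_current_state(TASK_RUNNING);
            work->fn(work); /* e.g. handle_tx_kick or handle_rx_net */
        } else
            schedule(); /* sleep until vhost_work_queue wakes us */
    }
    __set_current_state(TASK_RUNNING);
    unuse_mm(dev->mm);
    return 0;
}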
Finally, let's take vhost-net as an example to walk through the initialization and the packet transmit/receive paths of vhost networking, e.g.
qemu selects a vhost backend at NIC creation time via -netdev tap,...,vhost=on; net_init_tap then calls net_init_tap_one for every queue to set vhost up, and the actual initialization is done by vhost_net_init.
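A typical command line looks like this (interface and id names are placeholders):
qemu-system-x86_64 ... \
    -netdev tap,id=hostnet0,ifname=tap0,script=no,vhost=on \
    -device virtio-net-pci,netdev=hostnet0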
typedef struct VhostNetOptions {
VhostBackendType backend_type; /* vhost kernel or userspace */
NetClientState *net_backend; /* TAPState device */
void *opaque; /* ioctl vhostfd, /dev/vhost-net */
bool force;
} VhostNetOptions;
static int net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer,
const char *model, const char *name,
const char *ifname, const char *script,
const char *downscript, const char *vhostfdname,
int vnet_hdr, int fd)
{
...
if (tap->has_vhost ? tap->vhost :
vhostfdname || (tap->has_vhostforce && tap->vhostforce)) {
VhostNetOptions options;
options.backend_type = VHOST_BACKEND_TYPE_KERNEL;
options.net_backend = &s->nc;
options.force = tap->has_vhostforce && tap->vhostforce;
if ((tap->has_vhostfd || tap->has_vhostfds)) {
vhostfd = monitor_handle_fd_param(cur_mon, vhostfdname);
if (vhostfd == -1) {
return -1;
}
} else {
vhostfd = open("/dev/vhost-net", O_RDWR); /* open /dev/vhost-net for ioctl usage */
if (vhostfd < 0) {
error_report("tap: open vhost char device failed: %s",
strerror(errno));
return -1;
}
}
qemu_set_cloexec(vhostfd);
options.opaque = (void *)(uintptr_t)vhostfd;
s->vhost_net = vhost_net_init(&options); /* initialize struct vhost_net */
if (!s->vhost_net) {
error_report("vhost-net requested but could not be initialized");
return -1;
}
}
...
}
struct vhost_net {
struct vhost_dev dev;
struct vhost_virtqueue vqs[2];
int backend;
NetClientState *nc;
};
struct vhost_net *vhost_net_init(VhostNetOptions *options)
{
int r;
bool backend_kernel = options->backend_type == VHOST_BACKEND_TYPE_KERNEL;
struct vhost_net *net = g_malloc(sizeof *net);
if (!options->net_backend) {
fprintf(stderr, "vhost-net requires net backend to be setup\n");
goto fail;
}
if (backend_kernel) {
r = vhost_net_get_fd(options->net_backend);
if (r < 0) {
goto fail;
}
net->dev.backend_features = qemu_has_vnet_hdr(options->net_backend)
? 0 : (1 << VHOST_NET_F_VIRTIO_NET_HDR);
net->backend = r; /* backend is the fd of the NetClientState */
} else {
net->dev.backend_features = 0;
net->backend = -1;
}
net->nc = options->net_backend; /* nc points at the NetClientState */
net->dev.nvqs = 2; /* a TX queue and an RX queue */
net->dev.vqs = net->vqs; /* vhost_dev and vhost_net share the vhost_virtqueues */
r = vhost_dev_init(&net->dev, options->opaque,
options->backend_type, options->force); /* initialize vhost_dev; the VHOST_SET_OWNER ioctl issued here creates the vhost kthread */
if (r < 0) {
goto fail;
}
if (!qemu_has_vnet_hdr_len(options->net_backend,
sizeof(struct virtio_net_hdr_mrg_rxbuf))) {
net->dev.features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF);
}
if (backend_kernel) {
if (~net->dev.features & net->dev.backend_features) {
fprintf(stderr, "vhost lacks feature mask %" PRIu64
" for backend\n",
(uint64_t)(~net->dev.features & net->dev.backend_features));
vhost_dev_cleanup(&net->dev);
goto fail;
}
}
/* Set sane init value. Override when guest acks. */
vhost_net_ack_features(net, 0);
return net;
fail:
g_free(net);
return NULL;
}
Once the guest has booted, qemu configures vhost accordingly: virtio_net_set_status enables/disables the virtio-net device and its queues, calling vhost_net_start to bring the vhost queues up and vhost_net_stop to tear them down:
int vhost_net_start(VirtIODevice *dev, NetClientState *ncs,
int total_queues)
{
BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(dev)));
VirtioBusState *vbus = VIRTIO_BUS(qbus);
VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
int r, e, i;
if (!vhost_net_device_endian_ok(dev)) {
error_report("vhost-net does not support cross-endian");
r = -ENOSYS;
goto err;
}
if (!k->set_guest_notifiers) {
error_report("binding does not support guest notifiers");
r = -ENOSYS;
goto err;
}
for (i = 0; i < total_queues; i++) {
vhost_net_set_vq_index(get_vhost_net(ncs[i].peer), i * 2);
}
/* call virtio_pci_set_guest_notifiers to set up the irqfd and related state; qemu takes this path even when vhost is not enabled */
r = k->set_guest_notifiers(qbus->parent, total_queues * 2, true);
if (r < 0) {
error_report("Error binding guest notifier: %d", -r);
goto err;
}
/* with a multiqueue tap there are several NetClientStates, one per tap queue, and each NetClientState has its own vhost_net */
for (i = 0; i < total_queues; i++) {
r = vhost_net_start_one(get_vhost_net(ncs[i].peer), dev); /* call vhost_net_start_one for each queue */
if (r < 0) {
goto err_start;
}
}
return 0;
err_start:
while (--i >= 0) {
vhost_net_stop_one(get_vhost_net(ncs[i].peer), dev);
}
e = k->set_guest_notifiers(qbus->parent, total_queues * 2, false);
if (e < 0) {
fprintf(stderr, "vhost guest notifier cleanup failed: %d\n", e);
fflush(stderr);
}
err:
return r;
}
static int vhost_net_start_one(struct vhost_net *net,
VirtIODevice *dev)
{
struct vhost_vring_file file = { };
int r;
if (net->dev.started) {
return 0;
}
net->dev.nvqs = 2; /* vqs holds one TX virtqueue and one RX virtqueue */
net->dev.vqs = net->vqs;
r = vhost_dev_enable_notifiers(&net->dev, dev); /* stop handling the guest's I/O notifications in qemu and let vhost take over (this enables the host-notifier ioeventfds) */
if (r < 0) {
goto fail_notifiers;
}
r = vhost_dev_start(&net->dev, dev);
if (r < 0) {
goto fail_start;
}
if (net->nc->info->poll) {
net->nc->info->poll(net->nc, false);
}
if (net->nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP) {
qemu_set_fd_handler(net->backend, NULL, NULL, NULL);
file.fd = net->backend;
for (file.index = 0; file.index < net->dev.nvqs; ++file.index) {
const VhostOps *vhost_ops = net->dev.vhost_ops;
r = vhost_ops->vhost_call(&net->dev, VHOST_NET_SET_BACKEND,
&file);
if (r < 0) {
r = -errno;
goto fail;
}
}
}
return 0;
fail:
file.fd = -1;
if (net->nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP) {
while (file.index-- > 0) {
const VhostOps *vhost_ops = net->dev.vhost_ops;
int r = vhost_ops->vhost_call(&net->dev, VHOST_NET_SET_BACKEND,
&file);
assert(r >= 0);
}
}
if (net->nc->info->poll) {
net->nc->info->poll(net->nc, true);
}
vhost_dev_stop(&net->dev, dev);
fail_start:
vhost_dev_disable_notifiers(&net->dev, dev);
fail_notifiers:
return r;
}
The relationships among the vhost-net data structures are shown in the figure below.
Now let's look at the kernel's definition of vhost_net, e.g.
static const struct file_operations vhost_net_fops = {
.owner = THIS_MODULE,
.release = vhost_net_release,
.unlocked_ioctl = vhost_net_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = vhost_net_compat_ioctl,
#endif
.open = vhost_net_open,
};
static struct miscdevice vhost_net_misc = {
MISC_DYNAMIC_MINOR,
"vhost-net",
&vhost_net_fops,
};
enum {
VHOST_NET_VQ_RX = 0,
VHOST_NET_VQ_TX = 1,
VHOST_NET_VQ_MAX = 2,
};
enum vhost_net_poll_state {
VHOST_NET_POLL_DISABLED = 0,
VHOST_NET_POLL_STARTED = 1,
VHOST_NET_POLL_STOPPED = 2,
};
struct vhost_net {
struct vhost_dev dev;
struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; /* vhost's virtqueue wrappers; their handle_kick callbacks are woken via the ioeventfd */
struct vhost_poll poll[VHOST_NET_VQ_MAX]; /* two vhost_poll structures for the socket I/O of the NetClientState */
/* Tells us whether we are polling a socket for TX.
* We only do this when socket buffer fills up.
* Protected by tx vq lock. */
enum vhost_net_poll_state tx_poll_state;
};
static int vhost_net_open(struct inode *inode, struct file *f)
{
struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
struct vhost_dev *dev;
int r;
if (!n)
return -ENOMEM;
dev = &n->dev;
n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick; /* kick callback of the TX virtqueue */
n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick; /* kick callback of the RX virtqueue */
r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
if (r < 0) {
kfree(n);
return r;
}
vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev); /* initialize vhost_net's TX vhost_poll */
vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev); /* initialize vhost_net's RX vhost_poll */
n->tx_poll_state = VHOST_NET_POLL_DISABLED;
f->private_data = n;
return 0;
}
handle_tx_kick/handle_rx_kick are implemented exactly like handle_tx_net/handle_rx_net, so why two pairs of functions? The code analysis below gives the full answer, but here is the short version: handle_tx_kick/handle_rx_kick are the callbacks that block on the TX/RX queue kick fds, while handle_tx_net/handle_rx_net are the callbacks that block on the vhost_net TX/RX polls. For both TX and RX, a packet takes a two-stage path, e.g.
TX first kicks the virtqueue's fd, then passes the vring buffers along, and finally transmits through the NetClientState's socket fd; but the socket may run short of buffer space, or the quota for this round may be used up, in which case vhost must poll and block on the socket fd. RX is symmetrical: one stage blocks on the socket fd, the other on the virtqueue kick fd.
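The glue between the two stages is struct vhost_poll. Roughly (from drivers/vhost/vhost.c of this era), vhost_poll_init registers vhost_poll_wakeup as a wait-queue callback, and the wakeup simply queues the work for the worker thread:
static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
                             void *key)
{
    struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);
    if (!((unsigned long)key & poll->mask)) /* e.g. POLLIN / POLLOUT */
        return 0;
    vhost_poll_queue(poll); /* enqueue poll->work and wake vhost_worker */
    return 0;
}
void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
                     unsigned long mask, struct vhost_dev *dev)
{
    init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
    init_poll_funcptr(&poll->table, vhost_poll_func);
    poll->mask = mask;
    poll->dev = dev;
    vhost_work_init(&poll->work, fn); /* fn: handle_*_kick or handle_*_net */
}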
When the guest transmits a packet, the ioeventfd fires the vhost_virtqueue's kick fd; the resulting POLLIN event invokes vhost_poll_wakeup, which wakes the vhost worker thread, and the worker calls the registered handle_kick function, here handle_tx_kick:
static void handle_tx_kick(struct vhost_work *work)
{
struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
poll.work);
struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
handle_tx(net);
}
static void handle_tx(struct vhost_net *net)
{
struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
unsigned out, in, s;
int head;
struct msghdr msg = {
.msg_name = NULL,
.msg_namelen = 0,
.msg_control = NULL,
.msg_controllen = 0,
.msg_iov = vq->iov,
.msg_flags = MSG_DONTWAIT,
};
size_t len, total_len = 0;
int err, wmem;
size_t hdr_size;
struct vhost_ubuf_ref *uninitialized_var(ubufs);
bool zcopy;
struct socket *sock = rcu_dereference(vq->private_data); /* the NetClientState's socket is stashed in the vhost_virtqueue as private_data */
if (!sock)
return;
wmem = atomic_read(&sock->sk->sk_wmem_alloc);
if (wmem >= sock->sk->sk_sndbuf) { /* the write memory already committed to the socket exceeds its send buffer */
mutex_lock(&vq->mutex);
tx_poll_start(net, sock); /* cannot send right now; block and wait on the sock */
mutex_unlock(&vq->mutex);
return;
}
mutex_lock(&vq->mutex);
vhost_disable_notify(&net->dev, vq); /* disable virtqueue notifications via the VRING_USED_F_NO_NOTIFY flag */
if (wmem < sock->sk->sk_sndbuf / 2)
tx_poll_stop(net);
hdr_size = vq->vhost_hlen;
zcopy = vq->ubufs;
for (;;) {
/* Release DMAs done buffers first */
if (zcopy)
vhost_zerocopy_signal_used(vq);
head = vhost_get_vq_desc(&net->dev, vq, vq->iov, /* copy avail descriptors starting at last_avail_idx */
ARRAY_SIZE(vq->iov),
&out, &in,
NULL, NULL);
/* On error, stop handling until the next kick. */
if (unlikely(head < 0))
break;
/* Nothing new? Wait for eventfd to tell us they refilled. */
if (head == vq->num) { /* vq->avail_idx == vq->last_avail_idx: no new buffers from the frontend */
int num_pends;
wmem = atomic_read(&sock->sk->sk_wmem_alloc);
if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
tx_poll_start(net, sock);
set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
break;
}
/* If more outstanding DMAs, queue the work.
* Handle upend_idx wrap around
*/
num_pends = likely(vq->upend_idx >= vq->done_idx) ?
(vq->upend_idx - vq->done_idx) :
(vq->upend_idx + UIO_MAXIOV - vq->done_idx);
if (unlikely(num_pends > VHOST_MAX_PEND)) {
tx_poll_start(net, sock);
set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
break;
}
if (unlikely(vhost_enable_notify(&net->dev, vq))) { /* re-enable the event-notify flag via vhost_enable_notify */
vhost_disable_notify(&net->dev, vq); /* a true return means avail_idx changed meanwhile: disable again and continue */
continue;
}
break;
}
if (in) { /* TX descriptors should all be "out" */
vq_err(vq, "Unexpected descriptor format for TX: "
"out %d, int %d\n", out, in);
break;
}
/* Skip header. TODO: support TSO. */
s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out); /* hdr_size covers the VNET_HDR metadata, which carries no packet payload */
msg.msg_iovlen = out;
len = iov_length(vq->iov, out);
/* Sanity check */
if (!len) {
vq_err(vq, "Unexpected header len for TX: "
"%zd expected %zd\n",
iov_length(vq->hdr, s), hdr_size);
break;
}
/* use msg_control to pass vhost zerocopy ubuf info to skb */
if (zcopy) {
vq->heads[vq->upend_idx].id = head;
if (len < VHOST_GOODCOPY_LEN) {
/* copy don't need to wait for DMA done */
vq->heads[vq->upend_idx].len =
VHOST_DMA_DONE_LEN;
msg.msg_control = NULL;
msg.msg_controllen = 0;
ubufs = NULL;
} else {
struct ubuf_info *ubuf = &vq->ubuf_info[head];
vq->heads[vq->upend_idx].len = len;
ubuf->callback = vhost_zerocopy_callback;
ubuf->arg = vq->ubufs;
ubuf->desc = vq->upend_idx;
msg.msg_control = ubuf;
msg.msg_controllen = sizeof(ubuf);
ubufs = vq->ubufs;
kref_get(&ubufs->kref);
}
vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
}
/* TODO: Check specific error and bomb out unless ENOBUFS? */
err = sock->ops->sendmsg(NULL, sock, &msg, len);
if (unlikely(err < 0)) {
if (zcopy) {
if (ubufs)
vhost_ubuf_put(ubufs);
vq->upend_idx = ((unsigned)vq->upend_idx - 1) %
UIO_MAXIOV;
}
vhost_discard_vq_desc(vq, 1); /* send failed: roll back last_avail_idx */
if (err == -EAGAIN || err == -ENOBUFS)
tx_poll_start(net, sock); /* block on vhost_net->poll, then retry the send */
break;
}
if (err != len)
pr_debug("Truncated TX packet: "
" len %d != %zd\n", err, len);
if (!zcopy)
vhost_add_used_and_signal(&net->dev, vq, head, 0); /* update the used ring of the virtqueue, e.g. used_elem and last_used_idx */
else
vhost_zerocopy_signal_used(vq);
total_len += len;
if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
vhost_poll_queue(&vq->poll); /* quota exceeded: requeue and wait to be rescheduled */
break;
}
}
mutex_unlock(&vq->mutex);
}
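The tx_poll_start/tx_poll_stop helpers used above just arm and disarm the TX vhost_poll on the socket (lightly trimmed from drivers/vhost/net.c of the same era):
/* Caller must have TX VQ lock */
static void tx_poll_stop(struct vhost_net *net)
{
    if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED))
        return;
    vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
    net->tx_poll_state = VHOST_NET_POLL_STOPPED;
}
/* Caller must have TX VQ lock */
static void tx_poll_start(struct vhost_net *net, struct socket *sock)
{
    if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED))
        return;
    vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
    net->tx_poll_state = VHOST_NET_POLL_STARTED;
}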
The receive path begins with vhost blocked on the NetClientState's socket, e.g.
vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev)
static void handle_rx_net(struct vhost_work *work)
{
struct vhost_net *net = container_of(work, struct vhost_net,
poll[VHOST_NET_VQ_RX].work);
handle_rx(net);
}
static void handle_rx(struct vhost_net *net)
{
struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
unsigned uninitialized_var(in), log;
struct vhost_log *vq_log;
struct msghdr msg = {
.msg_name = NULL,
.msg_namelen = 0,
.msg_control = NULL, /* FIXME: get and handle RX aux data. */
.msg_controllen = 0,
.msg_iov = vq->iov,
.msg_flags = MSG_DONTWAIT,
};
struct virtio_net_hdr_mrg_rxbuf hdr = {
.hdr.flags = 0,
.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
};
size_t total_len = 0;
int err, headcount, mergeable;
size_t vhost_hlen, sock_hlen;
size_t vhost_len, sock_len;
struct socket *sock = rcu_dereference(vq->private_data);
if (!sock)
return;
mutex_lock(&vq->mutex);
vhost_disable_notify(&net->dev, vq); /* disable the virtqueue's event-notify mechanism */
vhost_hlen = vq->vhost_hlen;
sock_hlen = vq->sock_hlen;
vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
vq->log : NULL;
mergeable = vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF);
while ((sock_len = peek_head_len(sock->sk))) { /* length of the next packet */
sock_len += sock_hlen;
vhost_len = sock_len + vhost_hlen;
headcount = get_rx_bufs(vq, vq->heads, vhost_len, /* get_rx_bufs grabs avail descriptors from the virtqueue */
&in, vq_log, &log, /* until their iovs together can hold a packet of this length; */
likely(mergeable) ? UIO_MAXIOV : 1); /* equivalent to several calls to vhost_get_vq_desc */
/* On error, stop handling until the next kick. */
if (unlikely(headcount < 0))
break;
/* OK, now we need to know about added descriptors. */
if (!headcount) {
if (unlikely(vhost_enable_notify(&net->dev, vq))) {
/* They have slipped one in as we were
* doing that: check again. */
vhost_disable_notify(&net->dev, vq);
continue;
}
/* Nothing new? Wait for eventfd to tell us
* they refilled. */
break;
}
/* We don't need to be notified again. */
if (unlikely((vhost_hlen)))
/* Skip header. TODO: support TSO. */
move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
else
/* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF:
* needed because sendmsg can modify msg_iov. */
copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in);
msg.msg_iovlen = in;
err = sock->ops->recvmsg(NULL, sock, &msg,
sock_len, MSG_DONTWAIT | MSG_TRUNC); /* the packet is received into the virtqueue's iov */
/* Userspace might have consumed the packet meanwhile:
* it's not supposed to do this usually, but might be hard
* to prevent. Discard data we got (if any) and keep going. */
if (unlikely(err != sock_len)) {
pr_debug("Discarded rx packet: "
" len %d, expected %zd\n", err, sock_len);
vhost_discard_vq_desc(vq, headcount); /* roll back the avail descriptors just consumed */
continue;
}
if (unlikely(vhost_hlen) &&
memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
vhost_hlen)) {
vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
vq->iov->iov_base);
break;
}
/* TODO: Should check and handle checksum. */
if (likely(mergeable) &&
memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
offsetof(typeof(hdr), num_buffers),
sizeof hdr.num_buffers)) {
vq_err(vq, "Failed num_buffers write");
vhost_discard_vq_desc(vq, headcount);
break;
}
vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
headcount); /* add several vring_used_elems and notify the frontend */
if (unlikely(vq_log))
vhost_log_write(vq, vq_log, log, vhost_len);
total_len += vhost_len;
if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
vhost_poll_queue(&vq->poll); /* quota exceeded: requeue and wait; note this queues the vq's poll, so the next round runs handle_rx_kick */
break;
}
}
mutex_unlock(&vq->mutex);
}