Kernel 3.10内核源码分析--KVM相关--虚拟机运行

最新推荐文章于 2024-04-11 21:16:12 发布

winceos

最新推荐文章于 2024-04-11 21:16:12 发布

阅读量1.9k

点赞数

分类专栏： Kernel 虚拟化

虚拟化同时被 2 个专栏收录

9 篇文章

订阅专栏

Kernel

7 篇文章

订阅专栏

本文详细解析了KVM虚拟机运行的基本原理、流程，包括上下文切换、硬件指令执行、敏感指令处理等关键步骤，以及在不同内核版本中的具体实现细节。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

1、基本原理
KVM虚拟机通过字符设备/dev/kvm的ioctl接口创建和运行，相关原理见之前的文章说明。
虚拟机的运行通过/dev/kvm设备ioctl VCPU接口的KVM_RUN指令实现，在VM和VCPU创建好并完成初始化后，就可以调度该虚拟机运行了，通常，一个VCPU对应于一个线程，虚拟机运行的本质为调度该虚拟机相关的VCPU所在线程运行。虚拟机(VCPU)的运行主要任务是要进行上下文切换，上下文主要包括相关寄存器、APIC状态、TLB等，通常上下文切换的过程如下：
1、    保存当前的上下文。
2、    使用kvm_vcpu结构体中的上下文信息，加载到物理CPU中。
3、    执行kvm_x86_ops中的run函数(Intel平台上为vmx_vcpu_run)，调用硬件相关的指令(如VMLAUNCH)，进入虚拟机运行环境中。
虚拟机运行于qemu-kvm的进程上下文中。从硬件的角度看，虚拟机的运行过程实质上就是相关指令的执行过程：虚拟机中的代码编译后同样是相应的CPU指令序列，其执行过程与Host机上的指令并没有太多差别。最关键的差别在于“敏感指令”(通常为IO、内存等关键操作)的执行，这也是虚拟化实现的本质所在：当在虚拟机中(Guest模式)执行“敏感指令”时，会由硬件触发VM-exit，使当前CPU从Guest模式(non-root模式)切换到root模式，当前CPU的控制权随之转交给VMM(Hypervisor，KVM中即Host)，由VMM进行相应的处理；处理完成后再次通过相应的硬件指令(如VMLAUNCH/VMRESUME)重新进入Guest模式，从而回到虚拟机运行环境中继续运行。
本文简单解释及分析在3.10版本内核代码中的相关流程，用户态qemu-kvm部分暂不包括。

2、大致流程：
Qemu-kvm可以通过ioctl(KVM_RUN…)使虚拟机运行，最终进入内核态，由KVM相关内核流程处理，在内核态执行的大致过程如下：
kvm_vcpu_ioctl -->
    kvm_arch_vcpu_ioctl_run
具体由内核函数kvm_arch_vcpu_ioctl_run完成相关工作。主要流程如下：

1、 Sigprocmask()屏蔽信号，防止在此过程中受到信号的干扰。

2、检查当前VCPU状态：若为KVM_MP_STATE_UNINITIALIZED(尚未初始化)，则阻塞该VCPU并返回-EAGAIN，而不是继续运行

3、配置APIC和mmio相关信息

4、将VCPU中保存的上下文信息写入指定位置

5、然后的工作交由__vcpu_run完成

6、 __vcpu_run最终调用vcpu_enter_guest，该函数实现了进入Guest，并执行Guest OS具体指令的操作。

7、 vcpu_enter_guest最终调用kvm_x86_ops中的run函数运行。对应于Intel平台，该函数为vmx_vcpu_run(设置Guest CR3和其他寄存器、EPT/影子页表相关设置、汇编代码VMLAUNCH切换到非根模式，执行Guest目标代码)。

8、 Guest代码执行到敏感指令或因其他原因(比如中断/异常)，VM-Exit退出非根模式，返回到vcpu_enter_guest函数继续执行。

9、 vcpu_enter_guest函数中会判断VM-Exit原因，并进行相应处理。

10、处理完成后VM-Entry到Guest重新执行Guest代码，或重新等待下次调度。

3、代码分析
kvm_vcpu_ioctl():

点击(此处)折叠或打开

/*

  * kvm
 ioctl VCPU指令的入口，传入的fd为KVM_CREATE_VCPU中返回的fd。

  * 主要针对具体的VCPU进行参数设置。如：相关寄存器的读

  * 写、中断控制等

  */

static long kvm_vcpu_ioctl(struct file *filp,

             unsigned int ioctl, unsigned
 long arg)

{

    struct kvm_vcpu *vcpu = filp->private_data;

    void __user *argp = (void
 __user *)arg;

    int r;

    struct kvm_fpu *fpu = NULL;

    struct kvm_sregs *kvm_sregs = NULL;

    if (vcpu->kvm->mm != current->mm)

        return -EIO;

#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)

    /*

     * Special
 cases: vcpu ioctls that are asynchronous to vcpu execution,

     * so
 vcpu_load() would
 break it.

     */

    if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)

        return kvm_arch_vcpu_ioctl(filp, ioctl, arg);

#endif

    // KVM虚拟机VCPU数据结构载入物理CPU

    r = vcpu_load(vcpu);

    if (r)

        return r;

    switch (ioctl) {

    /* 

     * 运行虚拟机，最终通过执行VMLAUNCH指令进入non
 root模式，

     * 进入虚拟机运行。当虚拟机内部执行敏感指令时，由硬

     * 件触发VM-exit，返回到root模式

     */

    case KVM_RUN:

        r = -EINVAL;

        // 不能带参数。

        if (arg)

            goto out;

        // 运行VCPU(即运行虚拟机)的入口函数

        r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);

        trace_kvm_userspace_exit(vcpu->run->exit_reason, r);

        break;

...

kvm_vcpu_ioctl()-->kvm_arch_vcpu_ioctl_run()-->__vcpu_run():

点击(此处)折叠或打开

/*
 * __vcpu_run - outer run loop for one vCPU.
 *
 * Takes the kvm->srcu read lock, calls vapic_enter(), and then loops while
 * r > 0: a runnable vCPU is entered through vcpu_enter_guest(); any other
 * vCPU is blocked through kvm_vcpu_block() and its mp_state re-evaluated
 * when KVM_REQ_UNHALT is pending.  After each iteration pending timers,
 * userspace interrupt-injection requests, async page faults, signals and
 * rescheduling are checked.  The SRCU read lock is dropped around
 * kvm_vcpu_block() and kvm_resched() and re-taken afterwards.
 *
 * Returns the non-zero result of vapic_enter() if that fails; otherwise the
 * final value of r (<= 0), which is either the result of vcpu_enter_guest()
 * or -EINTR.
 */
static int __vcpu_run(struct
 kvm_vcpu *vcpu)

{

    int r;

    struct kvm *kvm = vcpu->kvm;

    vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);

    /* Set up vcpu->arch.apic->vapic_page (per the original author's note). */

    r = vapic_enter(vcpu);

    if (r) {

        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);

        return r;

    }

    r = 1;

    while (r > 0) {

        /* Only a RUNNABLE vCPU that is not halted for an async page fault may enter the guest. */

        if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&

         !vcpu->arch.apf.halted)

         /* Enter guest mode; ultimately done with VMLAUNCH/VMRESUME in vmx_vcpu_run(). */

            r = vcpu_enter_guest(vcpu);

        else {/* Taken when the vCPU is not RUNNABLE or apf.halted is set. */

            srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);

            /*
             * Block the vCPU.  Per the original author's analysis this
             * amounts to schedule()-ing away, unless e.g. a timer or a
             * signal is pending, in which case it returns without
             * scheduling -- NOTE(review): kvm_vcpu_block() is not shown
             * here; confirm against its definition.
             */

            kvm_vcpu_block(vcpu);

            vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);

            if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {

                kvm_apic_accept_events(vcpu);

                switch(vcpu->arch.mp_state) {

                case KVM_MP_STATE_HALTED:

                    vcpu->arch.pv.pv_unhalted = false;

                    vcpu->arch.mp_state =

                        KVM_MP_STATE_RUNNABLE;

                    /* fall through: a freshly un-halted vCPU also clears apf.halted */

                case KVM_MP_STATE_RUNNABLE:

                    vcpu->arch.apf.halted = false;

                    break;

                case KVM_MP_STATE_INIT_RECEIVED:

                    break;

                default:

                    /* Any other state ends the loop with -EINTR. */

                    r = -EINTR;

                    break;

                }

            }

        }

        if (r <= 0)

            break;

        clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);

        if (kvm_cpu_has_pending_timer(vcpu))

            kvm_inject_pending_timer_irqs(vcpu);

        /* Userspace asked for an interrupt-injection opportunity: exit with KVM_EXIT_INTR. */

        if (dm_request_for_irq_injection(vcpu)) {

            r = -EINTR;

            vcpu->run->exit_reason = KVM_EXIT_INTR;

            ++vcpu->stat.request_irq_exits;

        }

        kvm_check_async_pf_completion(vcpu);

        /* A signal is pending for the current task: exit with KVM_EXIT_INTR. */

        if (signal_pending(current)) {

            r = -EINTR;

            vcpu->run->exit_reason = KVM_EXIT_INTR;

            ++vcpu->stat.signal_exits;

        }

        /* Scheduling point inside KVM: the SRCU read lock is dropped around kvm_resched(). */

        if (need_resched()) {

            srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);

            kvm_resched(vcpu);

            vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);

        }

    }

    srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);

    vapic_exit(vcpu);

    return r;

}

kvm_vcpu_ioctl()-->kvm_arch_vcpu_ioctl_run()-->__vcpu_run()-->vcpu_enter_guest():

点击(此处)折叠或打开

/* 进入Guest模式，最终通过VMLAUNCH指令实现*/

static int vcpu_enter_guest(struct
 kvm_vcpu *vcpu)

{

    int r;

    bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&

        vcpu->run->request_interrupt_window;

    bool req_immediate_exit = false;

    /*进入Guest模式前先处理相关挂起的请求*/

    if (vcpu->requests) {

        /*卸载MMU*/

        if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))

            kvm_mmu_unload(vcpu);

        /*定时器迁移*/

        if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))

            __kvm_migrate_timers(vcpu);

        /*主时钟更新*/

        if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))

            kvm_gen_update_masterclock(vcpu->kvm);

        /*全局时钟更新*/

        if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))

            kvm_gen_kvmclock_update(vcpu);

        /*虚拟机时钟更新*/

        if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {

            r = kvm_guest_time_update(vcpu);

            if (unlikely(r))

                goto out;

        }

        /*更新mmu*/

        if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))

            kvm_mmu_sync_roots(vcpu);

        /*刷新TLB*/

        if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))

            kvm_x86_ops->tlb_flush(vcpu);

        if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {

            vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;

            r = 0;

            goto out;

        }

        if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {

            vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;

            r = 0;

            goto out;

        }

        if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {

            vcpu->fpu_active = 0;

            kvm_x86_ops->fpu_deactivate(vcpu);

        }

        if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {

            /* Page is swapped
 out. Do synthetic
 halt */

            vcpu->arch.apf.halted = true;

            r = 1;

            goto out;

        }

        if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))

            record_steal_time(vcpu);

        if (kvm_check_request(KVM_REQ_NMI, vcpu))

            process_nmi(vcpu);

        if (kvm_check_request(KVM_REQ_PMU, vcpu))

            kvm_handle_pmu_event(vcpu);

        if (kvm_check_request(KVM_REQ_PMI, vcpu))

            kvm_deliver_pmi(vcpu);

        if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))

            vcpu_scan_ioapic(vcpu);

    }

    // 检查是否有事件请求

    if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {

        kvm_apic_accept_events(vcpu);

        if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {

            r = 1;

            goto out;

        }

        // 注入阻塞的事件，中断，异常和nmi等

        inject_pending_event(vcpu);

        /* enable
 NMI/IRQ window open
 exits if needed */

        /*

         * 使能NMI/IRQ
 window，参见Intel64 System Programming Guide 25.3节

         * 当使能了interrupt-window exiting或NMI-window exiting(由VMCS中相关字段控制)，

         * 表示在刚进入虚拟机后，就会立刻因为有pending或注入的中断导致VM-exit

         */

        if (vcpu->arch.nmi_pending)

            req_immediate_exit =

                kvm_x86_ops->enable_nmi_window(vcpu) != 0;

        else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)

            req_immediate_exit =

                kvm_x86_ops->enable_irq_window(vcpu) != 0;

        if (kvm_lapic_enabled(vcpu)) {

            /*

             * Update
 architecture specific hints for APIC

             * virtual
 interrupt delivery.

             */

            if (kvm_x86_ops->hwapic_irr_update)

                kvm_x86_ops->hwapic_irr_update(vcpu,

                    kvm_lapic_find_highest_irr(vcpu));

            update_cr8_intercept(vcpu);

            kvm_lapic_sync_to_vapic(vcpu);

        }

    }

    // 装载MMU，待深入分析

    r = kvm_mmu_reload(vcpu);

    if (unlikely(r)) {

        goto cancel_injection;

    }

    preempt_disable();

    // 进入Guest前期准备，架构相关

    kvm_x86_ops->prepare_guest_switch(vcpu);

    if (vcpu->fpu_active)

        kvm_load_guest_fpu(vcpu);

    kvm_load_guest_xcr0(vcpu);

    vcpu->mode = IN_GUEST_MODE;

    /* We
 should set ->mode
 before check ->requests,

     * see
 the comment in make_all_cpus_request.

     */

    smp_mb();

    local_irq_disable();

    /* 

     * 如果VCPU处于EXITING_GUEST_MODE或者vcpu->requests(?)或者需要调度或者

     * 有挂起的信号，则放弃

     */

    if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests

     || need_resched() || signal_pending(current)) {

        vcpu->mode = OUTSIDE_GUEST_MODE;

        smp_wmb();

        local_irq_enable();

        preempt_enable();

        r = 1;

        goto cancel_injection;

    }

    srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);

    // req_immediate_exit在前面使能NMI/IRQ
 window失败时设置，此时需要立即退出，触发重新调度

    if (req_immediate_exit)

        smp_send_reschedule(vcpu->cpu);

    // 计算虚拟机的enter时间

    kvm_guest_enter();

    // 调试相关

    if (unlikely(vcpu->arch.switch_db_regs)) {

        set_debugreg(0, 7);

        set_debugreg(vcpu->arch.eff_db[0], 0);

        set_debugreg(vcpu->arch.eff_db[1], 1);

        set_debugreg(vcpu->arch.eff_db[2], 2);

        set_debugreg(vcpu->arch.eff_db[3], 3);

    }

    trace_kvm_entry(vcpu->vcpu_id);

    // 调用架构相关的run接口(vmx_vcpu_run)，进入Guest模式

    kvm_x86_ops->run(vcpu);

    // 此处开始，说明已经发生了VM-exit，退出了Guest模式

    /*

     * If the
 guest has used debug registers, at least dr7

     * will
 be disabled while returning to the
 host.

     * If we
 don't have active breakpoints in the
 host, we don't

     * care
 about the messed up debug address registers. But if

     * we
 have some of them active, restore the old state.

     */

    if (hw_breakpoint_active())

        hw_breakpoint_restore();

    /*记录Guest退出前的TSC时钟*/

    vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,

                             native_read_tsc());

    // 设置模式

    vcpu->mode = OUTSIDE_GUEST_MODE;

    smp_wmb();

    /* Interrupt is enabled
 by handle_external_intr() */

    kvm_x86_ops->handle_external_intr(vcpu);

    ++vcpu->stat.exits;

    /*

     * We
 must have an instruction between local_irq_enable() and

     * kvm_guest_exit(), so
 the timer interrupt isn't delayed by

     * the
 interrupt shadow. The stat.exits
 increment will do nicely.

     * But
 we need to prevent reordering, hence
 this barrier():

     */

    barrier();

    // 计算虚拟机的退出时间，其中还开中断了?

    kvm_guest_exit();

    preempt_enable();

    vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);

    /*

     * Profile
 KVM exit RIPs:

     */

    // Profile(采样计数，用于性能分析和调优)相关

    if (unlikely(prof_on == KVM_PROFILING)) {

        unsigned long rip = kvm_rip_read(vcpu);

        profile_hit(KVM_PROFILING, (void *)rip);

    }

    if (unlikely(vcpu->arch.tsc_always_catchup))

        kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);

    if (vcpu->arch.apic_attention)

        kvm_lapic_sync_from_vapic(vcpu);

    /* 

     * 调用vmx_handle_exit()处理虚拟机异常，异常原因及其它关键信息

     * 已经在之前获取。

     */

    r = kvm_x86_ops->handle_exit(vcpu);

    return r;

cancel_injection:

    kvm_x86_ops->cancel_injection(vcpu);

    if (unlikely(vcpu->arch.apic_attention))

        kvm_lapic_sync_from_vapic(vcpu);

out:

    return r;

}

kvm_vcpu_ioctl()-->kvm_arch_vcpu_ioctl_run()-->__vcpu_run()-->vcpu_enter_guest()-->vmx_vcpu_run():

点击(此处)折叠或打开

/*
 * vmx_vcpu_run - enter the guest (VMX non-root mode) and return on VM-exit.
 *
 * Writes dirty RSP/RIP values into the VMCS, saves the host registers the
 * inline assembly needs, loads the guest general-purpose registers from
 * vcpu->arch.regs[], and executes VMLAUNCH (first entry on this VMCS) or
 * VMRESUME (later entries).  When control comes back, the guest registers
 * and CR2 are stored into the vcpu structure, host registers are restored,
 * and the exit reason and IDT-vectoring information are read from the VMCS
 * for the exit handler.
 *
 * Returns early, without entering the guest, when vmx->emulation_required
 * is set.
 */
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
    struct vcpu_vmx *vmx = to_vmx(vcpu);
    unsigned long debugctlmsr;

    /* Record the guest's net vcpu time for enforced NMI injections. */
    if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
        vmx->entry_time = ktime_get();

    /* Don't enter VMX if guest state is invalid, let the exit handler
       start emulation until we arrive back to a valid state */
    if (vmx->emulation_required)
        return;

    if (vmx->nested.sync_shadow_vmcs) {
        copy_vmcs12_to_shadow(vmx);
        vmx->nested.sync_shadow_vmcs = false;
    }

    /* Write the guest RSP into the VMCS if the cached copy is dirty. */
    if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
        vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);

    /* Write the guest RIP into the VMCS if the cached copy is dirty. */
    if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
        vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);

    /* When single-stepping over STI and MOV SS, we must clear the
     * corresponding interruptibility bits in the guest state. Otherwise
     * vmentry fails as it then expects bit 14 (BS) in pending debug
     * exceptions being set, but that's not correct for the guest debugging
     * case. */
    if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
        vmx_set_interrupt_shadow(vcpu, 0);

    atomic_switch_perf_msrs(vmx);
    debugctlmsr = get_debugctlmsr();

    /* Snapshot whether this VMCS was already launched; the asm tests it. */
    vmx->__launched = vmx->loaded_vmcs->launched;

    /* Enter guest mode with VMLAUNCH/VMRESUME; the guest starts running. */
    asm(
        /* Store host registers */
        "push %%" _ASM_DX "; push %%" _ASM_BP ";"
        /* CX is pushed twice: one slot is a placeholder for the guest rcx. */
        "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
        "push %%" _ASM_CX " \n\t"
        /*
         * %c prints a constant operand without the '$' immediate prefix,
         * which is what an address displacement needs; writing the
         * operand with '$' there would be a syntax error.
         * [host_rsp] is a named input operand declared below, and %0 is
         * the first operand, i.e. vmx (passed in cx).  So
         * %c[host_rsp](%0) addresses vmx->host_rsp.
         *
         * If the current stack pointer already equals vmx->host_rsp,
         * nothing needs updating; otherwise store it in vmx->host_rsp
         * and write it to the VMCS as well.
         */
        "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
        "je 1f \n\t"
        "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
        /*
         * VMWRITE of RSP into the VMCS field selected by RDX.  RDX holds
         * HOST_RSP through the "d" input constraint below, so this
         * stores the stack pointer into the HOST_RSP field.
         * NOTE(review): __ex() is defined elsewhere; the original author
         * describes it as handling a fault on the instruction -- confirm.
         */
        __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
        "1: \n\t"
        /* Reload cr2 if changed */
        /* Compare the hardware CR2 with vcpu.arch.cr2; load the latter if they differ. */
        "mov %c[cr2](%0), %%" _ASM_AX " \n\t"
        "mov %%cr2, %%" _ASM_DX " \n\t"
        "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
        "je 2f \n\t"
        "mov %%" _ASM_AX", %%cr2 \n\t"
        "2: \n\t"
        /* Check if vmlaunch of vmresume is needed */
        /* Tests vmx->__launched; the flags are consumed by "jne 1f" below. */
        "cmpl $0, %c[launched](%0) \n\t"
        /* Load guest registers.  Don't clobber flags. */
        "mov %c[rax](%0), %%" _ASM_AX " \n\t"
        "mov %c[rbx](%0), %%" _ASM_BX " \n\t"
        "mov %c[rdx](%0), %%" _ASM_DX " \n\t"
        "mov %c[rsi](%0), %%" _ASM_SI " \n\t"
        "mov %c[rdi](%0), %%" _ASM_DI " \n\t"
        "mov %c[rbp](%0), %%" _ASM_BP " \n\t"
#ifdef CONFIG_X86_64
        "mov %c[r8](%0), %%r8 \n\t"
        "mov %c[r9](%0), %%r9 \n\t"
        "mov %c[r10](%0), %%r10 \n\t"
        "mov %c[r11](%0), %%r11 \n\t"
        "mov %c[r12](%0), %%r12 \n\t"
        "mov %c[r13](%0), %%r13 \n\t"
        "mov %c[r14](%0), %%r14 \n\t"
        "mov %c[r15](%0), %%r15 \n\t"
#endif
        "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */

        /* Enter guest mode */
        "jne 1f \n\t"
        /* __launched == 0: first entry on this VMCS, use VMLAUNCH. */
        __ex(ASM_VMX_VMLAUNCH) "\n\t"
        "jmp 2f \n\t"
        /* __launched != 0: re-enter the guest with VMRESUME. */
        "1: " __ex(ASM_VMX_VMRESUME) "\n\t"
        /*
         * Label 2 is where execution continues after the entry
         * instruction; its address is published as vmx_return below
         * (presumably programmed as the host RIP elsewhere -- not shown
         * here).
         */
        "2: "
        /* Save guest registers, load host registers, keep flags */
        /* Park the guest rcx in the placeholder slot, then pop vmx back into %0. */
        "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
        "pop %0 \n\t"
        "mov %%" _ASM_AX ", %c[rax](%0) \n\t"
        "mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
        /* Pop the parked guest rcx straight into vcpu.arch.regs[RCX]. */
        __ASM_SIZE(pop) " %c[rcx](%0) \n\t"
        "mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
        "mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
        "mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
        "mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
#ifdef CONFIG_X86_64
        "mov %%r8, %c[r8](%0) \n\t"
        "mov %%r9, %c[r9](%0) \n\t"
        "mov %%r10, %c[r10](%0) \n\t"
        "mov %%r11, %c[r11](%0) \n\t"
        "mov %%r12, %c[r12](%0) \n\t"
        "mov %%r13, %c[r13](%0) \n\t"
        "mov %%r14, %c[r14](%0) \n\t"
        "mov %%r15, %c[r15](%0) \n\t"
#endif
        "mov %%cr2, %%" _ASM_AX " \n\t"
        "mov %%" _ASM_AX ", %c[cr2](%0) \n\t"

        "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
        /*
         * setbe stores 1 when CF or ZF is set.  The flags were kept
         * intact since the entry instruction, so vmx->fail records a
         * failed VMLAUNCH/VMRESUME (see the Intel SDM VMfail conventions).
         */
        "setbe %c[fail](%0) \n\t"
        ".pushsection .rodata \n\t"
        ".global vmx_return \n\t"
        "vmx_return: " _ASM_PTR " 2b \n\t"
        ".popsection"
     : : "c"(vmx), "d"((unsigned long)HOST_RSP),
        [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
        [fail]"i"(offsetof(struct vcpu_vmx, fail)),
        /* [host_rsp] is a symbolic operand name, used above as %c[host_rsp]. */
        [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
        [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
        [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
        [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
        [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
        [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
        [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
        [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
#ifdef CONFIG_X86_64
        [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
        [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
        [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
        [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
        [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
        [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
        [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
        [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
#endif
        [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
        [wordsize]"i"(sizeof(ulong))
     /*
      * Clobber list: "cc" = the condition-code flags are modified,
      * "memory" = memory is modified behind the compiler's back.
      */
     : "cc", "memory"
#ifdef CONFIG_X86_64
        , "rax", "rbx", "rdi", "rsi"
        , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
#else
        , "eax", "ebx", "edi", "esi"
#endif
     );

    /* Reaching this point means a VM-exit happened and we are back in root mode. */

    /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
    if (debugctlmsr)
        update_debugctlmsr(debugctlmsr);

#ifndef CONFIG_X86_64
    /*
     * The sysexit path does not restore ds/es, so we must set them to
     * a reasonable value ourselves.
     *
     * We can't defer this to vmx_load_host_state() since that function
     * may be executed in interrupt context, which saves and restore segments
     * around it, nullifying its effect.
     */
    loadsegment(ds, __USER_DS);
    loadsegment(es, __USER_DS);
#endif

    /*
     * Mark RIP, RSP and the listed extended registers as not available in
     * the cache (their bits are cleared); nothing is dirty any more.
     */
    vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
                 | (1 << VCPU_EXREG_RFLAGS)
                 | (1 << VCPU_EXREG_CPL)
                 | (1 << VCPU_EXREG_PDPTR)
                 | (1 << VCPU_EXREG_SEGMENTS)
                 | (1 << VCPU_EXREG_CR3));
    vcpu->arch.regs_dirty = 0;

    /* Read the IDT-vectoring information field from the VMCS. */
    vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

    /* The next entry on this VMCS must use VMRESUME. */
    vmx->loaded_vmcs->launched = 1;

    /* Read the VM-exit reason, which the hardware stored in the VMCS on exit. */
    vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
    trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);

    /*
     * Post-exit processing.  Per the original author's note this covers
     * machine-check exceptions and NMIs -- NOTE(review): the helpers are
     * not shown here; confirm against their definitions.
     */
    vmx_complete_atomic_exit(vmx);
    vmx_recover_nmi_blocking(vmx);
    vmx_complete_interrupts(vmx);
}