记录一次在阅读kvm内核源码时遇到的疑惑。
arch/x86/include/uapi/asm/vmx.h (exit reason)
/*
 * VMX exit reasons. Per the accompanying notes, KVM reads one of these
 * values from the VMCS after a VM exit and uses it to select the matching
 * exit handler.
 */
#define EXIT_REASON_EXCEPTION_NMI 0
#define EXIT_REASON_EXTERNAL_INTERRUPT 1
#define EXIT_REASON_TRIPLE_FAULT 2
#define EXIT_REASON_INIT_SIGNAL 3
#define EXIT_REASON_SIPI_SIGNAL 4
#define EXIT_REASON_INTERRUPT_WINDOW 7
#define EXIT_REASON_NMI_WINDOW 8
#define EXIT_REASON_TASK_SWITCH 9
#define EXIT_REASON_CPUID 10
#define EXIT_REASON_HLT 12
#define EXIT_REASON_INVD 13
#define EXIT_REASON_INVLPG 14
#define EXIT_REASON_RDPMC 15
#define EXIT_REASON_RDTSC 16
#define EXIT_REASON_VMCALL 18
#define EXIT_REASON_VMCLEAR 19
#define EXIT_REASON_VMLAUNCH 20
#define EXIT_REASON_VMPTRLD 21
#define EXIT_REASON_VMPTRST 22
#define EXIT_REASON_VMREAD 23
#define EXIT_REASON_VMRESUME 24
#define EXIT_REASON_VMWRITE 25
#define EXIT_REASON_VMOFF 26
#define EXIT_REASON_VMON 27
#define EXIT_REASON_CR_ACCESS 28
#define EXIT_REASON_DR_ACCESS 29
#define EXIT_REASON_IO_INSTRUCTION 30
#define EXIT_REASON_MSR_READ 31
#define EXIT_REASON_MSR_WRITE 32
#define EXIT_REASON_INVALID_STATE 33
#define EXIT_REASON_MSR_LOAD_FAIL 34
#define EXIT_REASON_MWAIT_INSTRUCTION 36
#define EXIT_REASON_MONITOR_TRAP_FLAG 37
#define EXIT_REASON_MONITOR_INSTRUCTION 39
#define EXIT_REASON_PAUSE_INSTRUCTION 40
#define EXIT_REASON_MCE_DURING_VMENTRY 41
#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
#define EXIT_REASON_APIC_ACCESS 44
#define EXIT_REASON_EOI_INDUCED 45
#define EXIT_REASON_GDTR_IDTR 46
#define EXIT_REASON_LDTR_TR 47
#define EXIT_REASON_EPT_VIOLATION 48
#define EXIT_REASON_EPT_MISCONFIG 49
#define EXIT_REASON_INVEPT 50
#define EXIT_REASON_RDTSCP 51
#define EXIT_REASON_PREEMPTION_TIMER 52
#define EXIT_REASON_INVVPID 53
#define EXIT_REASON_WBINVD 54
#define EXIT_REASON_XSETBV 55
#define EXIT_REASON_APIC_WRITE 56
#define EXIT_REASON_RDRAND 57
#define EXIT_REASON_INVPCID 58
#define EXIT_REASON_VMFUNC 59
#define EXIT_REASON_ENCLS 60
#define EXIT_REASON_RDSEED 61
#define EXIT_REASON_PML_FULL 62 // the PML buffer is full
#define EXIT_REASON_XSAVES 63
#define EXIT_REASON_XRSTORS 64
#define EXIT_REASON_UMWAIT 67
#define EXIT_REASON_TPAUSE 68
#define EXIT_REASON_BUS_LOCK 74
include/uapi/linux/kvm.h (userspace_exit)
/*
 * Exit reasons that KVM reports to user space. Per the accompanying notes,
 * KVM stores one of these in vcpu->run->exit_reason before returning to the
 * user-space VMM (e.g. QEMU), which then handles the exit.
 */
#define KVM_EXIT_UNKNOWN 0
#define KVM_EXIT_EXCEPTION 1
#define KVM_EXIT_IO 2
#define KVM_EXIT_HYPERCALL 3
#define KVM_EXIT_DEBUG 4
#define KVM_EXIT_HLT 5
#define KVM_EXIT_MMIO 6
#define KVM_EXIT_IRQ_WINDOW_OPEN 7
#define KVM_EXIT_SHUTDOWN 8
#define KVM_EXIT_FAIL_ENTRY 9
#define KVM_EXIT_INTR 10
#define KVM_EXIT_SET_TPR 11
#define KVM_EXIT_TPR_ACCESS 12
#define KVM_EXIT_S390_SIEIC 13
#define KVM_EXIT_S390_RESET 14
#define KVM_EXIT_DCR 15 /* deprecated */
#define KVM_EXIT_NMI 16
#define KVM_EXIT_INTERNAL_ERROR 17
#define KVM_EXIT_OSI 18
#define KVM_EXIT_PAPR_HCALL 19
#define KVM_EXIT_S390_UCONTROL 20
#define KVM_EXIT_WATCHDOG 21
#define KVM_EXIT_S390_TSCH 22
#define KVM_EXIT_EPR 23
#define KVM_EXIT_SYSTEM_EVENT 24
#define KVM_EXIT_S390_STSI 25
#define KVM_EXIT_IOAPIC_EOI 26
#define KVM_EXIT_HYPERV 27
#define KVM_EXIT_ARM_NISV 28
#define KVM_EXIT_X86_RDMSR 29
#define KVM_EXIT_X86_WRMSR 30
#define KVM_EXIT_DIRTY_RING_FULL 31
#define KVM_EXIT_AP_RESET_HOLD 32
#define KVM_EXIT_X86_BUS_LOCK 33
#define KVM_EXIT_XEN 34
之前在阅读kvm内核源码时,看到源码中有两部分exit reason,便有了疑问:上述两部分exit reason有什么区别?通过询问一些大佬了解到:
前面的是x86的,exit时从vmcs取出来的,然后去找对应的exit handler。后者是kvm的,和架构关系不大,有些会exit到用户态给qemu处理。
x86 cpu运行guest的时候,需要从vmx-root切换为vmx-nonroot模式,然后执行guest中的代码。在guest内部遇到敏感指令后,cpu会设置vmcs,然后从vmx-nonroot切换为vmx-root。
这个时候,cpu上的执行的代码流程会从guest中回到kvm,kvm从vmcs中取出EXIT_REASON_XXX,然后执行对应的handler。
如果kvm这个时候因为某些原因,需要退出到用户态的hypervisor(比如qemu),kvm就要设置后面那组KVM_EXIT_XXX中对应的值,然后从之前hypervisor发起的ioctl系统调用中返回,这样用户态的hypervisor就能拿到这个reason,做进一步的处理。如果kvm不需要退出到用户态,它就会在执行handler后,继续从vmx-root切换为vmx-nonroot,然后执行guest中的代码。所以这个EXIT_REASON_XXX是cpu设置的,给kvm用。KVM_EXIT_XXX是kvm设置的,给hypervisor用。
例子:比如x86下常见的调试场景,guest触发一个int3(0xcc)断点,cpu以EXIT_REASON_EXCEPTION_NMI退出到kvm,kvm调用该exit reason对应的handler函数。这个函数发现是3号异常(#BP,断点异常),内核处理不了,会设置一个KVM_EXIT_DEBUG返回到qemu。qemu再把这个消息发给gdb server做决策。
这里还有一个例子:
/* Excerpt of the exit-handler table, indexed by EXIT_REASON_* value; other entries are elided. */
static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
...
[EXIT_REASON_CPUID] = kvm_emulate_cpuid,
[EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, // guest read of an MSR
[EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, // guest write of an MSR
...
guest内部执行读msr寄存器的指令(rdmsr)时,cpu会把exit reason设置为EXIT_REASON_MSR_READ并退出到kvm,kvm执行对应的handler函数kvm_emulate_rdmsr:
/*
 * kvm_emulate_rdmsr - emulate a guest RDMSR instruction.
 *
 * @vcpu: the virtual CPU on whose behalf the instruction is emulated.
 *
 * The MSR index is taken from the guest's RCX. When the filtered read
 * succeeds, the 64-bit value is split into its low and high 32-bit halves,
 * which are written to the guest's RAX and RDX. When it fails,
 * kvm_msr_user_space() is asked whether the access should be forwarded to
 * user space as a KVM_EXIT_X86_RDMSR exit.
 *
 * Return: 0 when the access was handed to user space; otherwise the value
 * returned by the kvm_x86_complete_emulated_msr static call for the read
 * status.
 * NOTE(review): a 0 return presumably means "exit to user space" rather
 * than "success" - confirm against the exit-handler calling convention.
 */
int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
{
	u32 msr_index = kvm_rcx_read(vcpu);
	u64 value;
	int err = kvm_get_msr_with_filter(vcpu, msr_index, &value);

	if (err) {
		/* The read failed: let user space take over if it asked to. */
		if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
				       complete_fast_rdmsr, err))
			return 0;

		/* Nobody claimed the failure; trace it as a faulting read. */
		trace_kvm_msr_read_ex(msr_index);
	} else {
		trace_kvm_msr_read(msr_index, value);

		/* Low 32 bits go to RAX, high 32 bits go to RDX. */
		kvm_rax_write(vcpu, value & -1u);
		kvm_rdx_write(vcpu, (value >> 32) & -1u);
	}

	/* Hand the read status to the vendor-specific completion hook. */
	return static_call(kvm_x86_complete_emulated_msr)(vcpu, err);
}
如果kvm没有成功读取该msr寄存器(kvm_get_msr_with_filter返回非0),则会调用kvm_msr_user_space函数检查是否需要退出到用户态qemu进行处理:
/*
 * kvm_msr_user_space - offer a failed MSR access to user space.
 *
 * @vcpu:        the virtual CPU that performed the access.
 * @index:       index of the MSR that was accessed.
 * @exit_reason: value to store in vcpu->run->exit_reason.
 * @data:        value to store in vcpu->run->msr.data.
 * @completion:  callback stored in vcpu->arch.complete_userspace_io.
 * @r:           failure code of the access, translated by kvm_msr_reason().
 *
 * If the translated reason is not selected in the VM's
 * user_space_msr_mask, nothing is modified. Otherwise the msr section of
 * the run structure is filled in (error and padding cleared) and the
 * completion callback is recorded.
 *
 * Return: 1 if the exit information was prepared for user space, 0 if user
 * space did not ask to be told about this kind of failure.
 */
static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
			      u32 exit_reason, u64 data,
			      int (*completion)(struct kvm_vcpu *vcpu),
			      int r)
{
	u64 reason = kvm_msr_reason(r);

	if (vcpu->kvm->arch.user_space_msr_mask & reason) {
		/* User space subscribed to this reason: describe the access. */
		vcpu->run->exit_reason = exit_reason;
		vcpu->run->msr.error = 0;
		memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
		vcpu->run->msr.reason = reason;
		vcpu->run->msr.index = index;
		vcpu->run->msr.data = data;
		vcpu->arch.complete_userspace_io = completion;

		return 1;
	}

	return 0;
}
如果对应的位掩码被设置,则设置vcpu->run->exit_reason为KVM_EXIT_X86_RDMSR,设置msr寄存器的编号index,msr错误的原因等,然后退出到用户态qemu进行处理,处理完成后调用回调函数complete_fast_rdmsr:
/*
 * Completion callback that kvm_emulate_rdmsr() passes to
 * kvm_msr_user_space(): runs complete_userspace_rdmsr() first, then returns
 * the result of complete_fast_msr_access().
 */
static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
{
	int ret;

	complete_userspace_rdmsr(vcpu);
	ret = complete_fast_msr_access(vcpu);

	return ret;
}