Message ID | 20141205225707.GA6689@gnote (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
[Ccing a few others who might be interested] Hi Eugene, Did you look at the other patch that was posted for this functionality by Wincy Van ? https://lkml.org/lkml/2014/11/21/749 Eugene Korenevsky <ekorenevsky@gmail.com> writes: > BitVisor hypervisor fails to start a nested guest due to lack of MSR > load/store support in KVM. > > This patch fixes this problem according to Intel SDM. It would be helpful for the commit message to be a little more descriptive. > > Signed-off-by: Eugene Korenevsky <ekorenevsky@gmail.com> > --- > arch/x86/include/asm/vmx.h | 6 ++ > arch/x86/include/uapi/asm/msr-index.h | 3 + > arch/x86/include/uapi/asm/vmx.h | 1 + > arch/x86/kvm/vmx.c | 191 +++++++++++++++++++++++++++++++++- > virt/kvm/kvm_main.c | 1 + > 5 files changed, 197 insertions(+), 5 deletions(-) > > diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h > index bcbfade..e07414c 100644 > --- a/arch/x86/include/asm/vmx.h > +++ b/arch/x86/include/asm/vmx.h > @@ -454,6 +454,12 @@ struct vmx_msr_entry { > #define ENTRY_FAIL_VMCS_LINK_PTR 4 > > /* > + * VMX Abort codes > + */ > +#define VMX_ABORT_MSR_STORE 1 > +#define VMX_ABORT_MSR_LOAD 4 > + > +/* > * VM-instruction error numbers > */ > enum vm_instruction_error_number { > diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h > index e21331c..3c9c601 100644 > --- a/arch/x86/include/uapi/asm/msr-index.h > +++ b/arch/x86/include/uapi/asm/msr-index.h > @@ -316,6 +316,9 @@ > #define MSR_IA32_UCODE_WRITE 0x00000079 > #define MSR_IA32_UCODE_REV 0x0000008b > > +#define MSR_IA32_SMM_MONITOR_CTL 0x0000009b > +#define MSR_IA32_SMBASE 0x0000009e > + Extra tab here ? 
> #define MSR_IA32_PERF_STATUS 0x00000198 > #define MSR_IA32_PERF_CTL 0x00000199 > #define MSR_AMD_PSTATE_DEF_BASE 0xc0010064 > diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h > index 990a2fe..f5f8a62 100644 > --- a/arch/x86/include/uapi/asm/vmx.h > +++ b/arch/x86/include/uapi/asm/vmx.h > @@ -56,6 +56,7 @@ > #define EXIT_REASON_MSR_READ 31 > #define EXIT_REASON_MSR_WRITE 32 > #define EXIT_REASON_INVALID_STATE 33 > +#define EXIT_REASON_MSR_LOAD_FAILURE 34 > #define EXIT_REASON_MWAIT_INSTRUCTION 36 > #define EXIT_REASON_MONITOR_INSTRUCTION 39 > #define EXIT_REASON_PAUSE_INSTRUCTION 40 > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index 6a951d8..deebc96 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -8498,6 +8498,163 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) > kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); > } > > +struct msr_entry { > + u32 index; > + u32 reserved; > + u64 data; > +} __packed; > + We already have struct vmx_msr_entry in vmx.h > +static bool vmx_msr_switch_area_verify(u32 count, u64 addr, int maxphyaddr) > +{ > + if (count == 0) > + return true; > + if ((addr & 0xf) != 0) > + return false; > + if ((addr >> maxphyaddr) != 0) > + return false; > + if (((addr + count * sizeof(struct msr_entry) - 1) >> maxphyaddr) != 0) > + return false; > + return true; > +} > + Shouldn't we also check if any of bits 63:32 is set ? 
> +static bool nested_vmx_msr_switch_verify(struct kvm_vcpu *vcpu, > + struct vmcs12 *vmcs12) > +{ > + int maxphyaddr = cpuid_maxphyaddr(vcpu); > + > +#define VMCS12_MSR_SWITCH_VERIFY(name) { \ > + if (!vmx_msr_switch_area_verify(vmcs12->name##_count, \ > + vmcs12->name##_addr, maxphyaddr)) { \ > + pr_warn_ratelimited( \ > + "%s: invalid MSR_{LOAD,STORE} parameters", \ > + #name); \ > + return false; \ > + } \ > + } Just a personal opinion: replacing the macro with a switch statement is probably a little less clumsy > + VMCS12_MSR_SWITCH_VERIFY(vm_exit_msr_load); > + VMCS12_MSR_SWITCH_VERIFY(vm_exit_msr_store); > + VMCS12_MSR_SWITCH_VERIFY(vm_entry_msr_load); > + return true; > +} > + > +static bool vmx_load_msr_entry_verify(struct kvm_vcpu *vcpu, > + struct msr_entry *e) > +{ > + if (e->index == MSR_FS_BASE || e->index == MSR_GS_BASE) > + return false; > + /* SMM is not supported */ > + if (e->index == MSR_IA32_SMM_MONITOR_CTL) > + return false; > + /* x2APIC MSR accesses are not allowed */ > + if (apic_x2apic_mode(vcpu->arch.apic) && (e->index >> 24) == 0x800) > + return false; > + if (e->reserved != 0) > + return false; > + return true; > +} > + > +static bool vmx_msr_switch_is_protected_msr(u32 msr_index) > +{ > + /* Intel SDM: a processor MAY prevent writing to these MSRs when > + * loading guest states on VM entries or saving guest states on VM > + * exits. 
> + * Assume our emulated processor DOES prevent writing */ > + return msr_index == MSR_IA32_UCODE_WRITE || > + msr_index == MSR_IA32_UCODE_REV; > +} > + > +static unsigned int nested_vmx_load_msrs(struct kvm_vcpu *vcpu, > + u32 count, u64 addr) > +{ > + unsigned int i; > + struct msr_entry msr_entry; > + struct msr_data msr; > + > + for (i = 1; i <= count; i++, addr += sizeof(msr_entry)) { > + if (kvm_read_guest(vcpu->kvm, addr, > + &msr_entry, sizeof(msr_entry))) { > + pr_warn_ratelimited( > + "Load MSR %d: cannot read GPA: 0x%llx\n", > + i, addr); > + return i; > + } > + if (!vmx_load_msr_entry_verify(vcpu, &msr_entry)) { > + pr_warn_ratelimited( > + "Load MSR %d: invalid MSR entry 0x%x 0x%x\n", > + i, msr_entry.index, msr_entry.reserved); > + return i; > + } > + msr.host_initiated = 0; > + msr.index = msr_entry.index; > + msr.data = msr_entry.data; > + if (vmx_msr_switch_is_protected_msr(msr.index)) { > + pr_warn_ratelimited( > + "Load MSR %d: prevent writing to MSR 0x%x\n", > + i, msr.index); > + return i; > + } > + if (kvm_set_msr(vcpu, &msr)) { > + pr_warn_ratelimited( > + "Load MSR %d: cannot write 0x%llx to MSR 0x%x\n", > + i, msr.data, msr.index); > + return i; > + } > + } > + return 0; > +} > + To ease debugging, these warning messages should also specify that they are Nested VMX specific. Also I think that the interfaces/functions you have introduced can be split up into an introductory patch and a second patch can then use these interfaces to provide the load/store functionality. 
> +static bool vmx_store_msr_entry_verify(struct kvm_vcpu *vcpu, > + struct msr_entry *e) > +{ > + /* x2APIC MSR accesses are not allowed */ > + if (apic_x2apic_mode(vcpu->arch.apic) && (e->index >> 24) == 0x800) > + return false; > + /* SMM is not supported */ > + if (e->index == MSR_IA32_SMBASE) > + return false; > + if (e->reserved != 0) > + return false; > + return true; > +} > + > +static unsigned int nested_vmx_store_msrs(struct kvm_vcpu *vcpu, > + u32 count, u64 addr) > +{ > + unsigned int i; > + struct msr_entry msr_entry; > + > + for (i = 1; i <= count; i++, addr += sizeof(msr_entry)) { > + if (kvm_read_guest(vcpu->kvm, addr, > + &msr_entry, 2 * sizeof(u32))) { > + pr_warn_ratelimited( > + "Store MSR %d: cannot read GPA: 0x%llx\n", > + i, addr); > + return i; > + } > + if (!vmx_store_msr_entry_verify(vcpu, &msr_entry)) { > + pr_warn_ratelimited( > + "Store MSR %d: invalid MSR entry 0x%x 0x%x\n", > + i, msr_entry.index, msr_entry.reserved); > + return i; > + } > + if (vmx_get_msr(vcpu, msr_entry.index, &msr_entry.data)) { > + pr_warn_ratelimited( > + "Store MSR %d: cannot read MSR 0x%x\n", > + i, msr_entry.index); > + return i; > + } > + if (kvm_write_guest(vcpu->kvm, > + addr + offsetof(struct msr_entry, data), > + &msr_entry.data, sizeof(msr_entry.data))) { > + pr_warn_ratelimited( > + "Store MSR %d: cannot write to GPA: 0x%llx\n", > + i, addr); > + return i; > + } > + } > + return 0; > +} > + > /* > * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 > * for running an L2 nested guest. 
> @@ -8507,6 +8664,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) > struct vmcs12 *vmcs12; > struct vcpu_vmx *vmx = to_vmx(vcpu); > int cpu; > + unsigned int msr; > struct loaded_vmcs *vmcs02; > bool ia32e; > > @@ -8556,11 +8714,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) > return 1; > } > > - if (vmcs12->vm_entry_msr_load_count > 0 || > - vmcs12->vm_exit_msr_load_count > 0 || > - vmcs12->vm_exit_msr_store_count > 0) { > - pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n", > - __func__); > + if (!nested_vmx_msr_switch_verify(vcpu, vmcs12)) { > nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); > return 1; > } > @@ -8670,6 +8824,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) > > prepare_vmcs02(vcpu, vmcs12); > > + msr = nested_vmx_load_msrs(vcpu, vmcs12->vm_entry_msr_load_count, > + vmcs12->vm_entry_msr_load_addr); > + if (msr) > + nested_vmx_entry_failure(vcpu, vmcs12, > + EXIT_REASON_MSR_LOAD_FAILURE, msr); > + > if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) > return kvm_emulate_halt(vcpu); > > @@ -9099,6 +9259,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, > vmcs_write64(GUEST_IA32_DEBUGCTL, 0); > } > > +static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 abort_code) > +{ > + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); > + > + vmcs12->abort = abort_code; > + pr_warn("Nested VMX abort %u\n", abort_code); > + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); > +} > + > /* > * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 > * and modify vmcs12 to make it see what it would expect to see there if > @@ -9118,8 +9287,20 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, > prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, > exit_qualification); > > + if (exit_reason != EXIT_REASON_INVALID_STATE && > + exit_reason != EXIT_REASON_MSR_LOAD_FAILURE) { > + if (nested_vmx_store_msrs(vcpu, > + 
vmcs12->vm_exit_msr_store_count, > + vmcs12->vm_exit_msr_store_addr)) > + nested_vmx_abort(vcpu, VMX_ABORT_MSR_STORE); > + } > + > vmx_load_vmcs01(vcpu); > > + if (nested_vmx_load_msrs(vcpu, vmcs12->vm_exit_msr_load_count, > + vmcs12->vm_exit_msr_load_addr)) > + nested_vmx_abort(vcpu, VMX_ABORT_MSR_LOAD); > + > if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) > && nested_exit_intr_ack_set(vcpu)) { > int irq = kvm_cpu_get_interrupt(vcpu); > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 5b45330..c9d1c4a 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -1582,6 +1582,7 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, > } > return 0; > } > +EXPORT_SYMBOL_GPL(kvm_write_guest); > > int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, > gpa_t gpa, unsigned long len) -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2014-12-07 10:46, Bandan Das wrote: > [Ccing a few others who might be interested] > > Hi Eugene, > > Did you look at the other patch that was posted for this functionality > by Wincy Van ? > > https://lkml.org/lkml/2014/11/21/749 It's good to see more momentum here, but, yes, we should definitely try to join forces on this topic. It may have subtle corners that could undermine the integrity of the host. And there are apparently a lot of states to be validated for full compliance. Thus it would be good to have kvm-unit-tests for all what is checked here. > > Eugene Korenevsky <ekorenevsky@gmail.com> writes: > >> BitVisor hypervisor fails to start a nested guest due to lack of MSR >> load/store support in KVM. >> >> This patch fixes this problem according to Intel SDM. > > It would be helpful for the commit message to be a little more > descriptive. >> >> Signed-off-by: Eugene Korenevsky <ekorenevsky@gmail.com> >> --- >> arch/x86/include/asm/vmx.h | 6 ++ >> arch/x86/include/uapi/asm/msr-index.h | 3 + >> arch/x86/include/uapi/asm/vmx.h | 1 + >> arch/x86/kvm/vmx.c | 191 +++++++++++++++++++++++++++++++++- >> virt/kvm/kvm_main.c | 1 + >> 5 files changed, 197 insertions(+), 5 deletions(-) >> >> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h >> index bcbfade..e07414c 100644 >> --- a/arch/x86/include/asm/vmx.h >> +++ b/arch/x86/include/asm/vmx.h >> @@ -454,6 +454,12 @@ struct vmx_msr_entry { >> #define ENTRY_FAIL_VMCS_LINK_PTR 4 >> >> /* >> + * VMX Abort codes >> + */ >> +#define VMX_ABORT_MSR_STORE 1 >> +#define VMX_ABORT_MSR_LOAD 4 >> + >> +/* >> * VM-instruction error numbers >> */ >> enum vm_instruction_error_number { >> diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h >> index e21331c..3c9c601 100644 >> --- a/arch/x86/include/uapi/asm/msr-index.h >> +++ b/arch/x86/include/uapi/asm/msr-index.h >> @@ -316,6 +316,9 @@ >> #define MSR_IA32_UCODE_WRITE 0x00000079 >> #define MSR_IA32_UCODE_REV 0x0000008b >> 
>> +#define MSR_IA32_SMM_MONITOR_CTL 0x0000009b >> +#define MSR_IA32_SMBASE 0x0000009e >> + > Extra tab here ? > >> #define MSR_IA32_PERF_STATUS 0x00000198 >> #define MSR_IA32_PERF_CTL 0x00000199 >> #define MSR_AMD_PSTATE_DEF_BASE 0xc0010064 >> diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h >> index 990a2fe..f5f8a62 100644 >> --- a/arch/x86/include/uapi/asm/vmx.h >> +++ b/arch/x86/include/uapi/asm/vmx.h >> @@ -56,6 +56,7 @@ >> #define EXIT_REASON_MSR_READ 31 >> #define EXIT_REASON_MSR_WRITE 32 >> #define EXIT_REASON_INVALID_STATE 33 >> +#define EXIT_REASON_MSR_LOAD_FAILURE 34 >> #define EXIT_REASON_MWAIT_INSTRUCTION 36 >> #define EXIT_REASON_MONITOR_INSTRUCTION 39 >> #define EXIT_REASON_PAUSE_INSTRUCTION 40 >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c >> index 6a951d8..deebc96 100644 >> --- a/arch/x86/kvm/vmx.c >> +++ b/arch/x86/kvm/vmx.c >> @@ -8498,6 +8498,163 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) >> kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); >> } >> >> +struct msr_entry { >> + u32 index; >> + u32 reserved; >> + u64 data; >> +} __packed; >> + > > We already have struct vmx_msr_entry in vmx.h > >> +static bool vmx_msr_switch_area_verify(u32 count, u64 addr, int maxphyaddr) >> +{ >> + if (count == 0) >> + return true; >> + if ((addr & 0xf) != 0) Bits 11:0 have to be zero, in fact (page alignment required). >> + return false; >> + if ((addr >> maxphyaddr) != 0) >> + return false; >> + if (((addr + count * sizeof(struct msr_entry) - 1) >> maxphyaddr) != 0) >> + return false; >> + return true; >> +} >> + > Shouldn't we also check if any of bits 63:32 is set ? Why? The physical address just needs to be in the guest's address space - but I wonder if that isn't already checked when actually reading the area later on. 
> >> +static bool nested_vmx_msr_switch_verify(struct kvm_vcpu *vcpu, >> + struct vmcs12 *vmcs12) >> +{ >> + int maxphyaddr = cpuid_maxphyaddr(vcpu); >> + >> +#define VMCS12_MSR_SWITCH_VERIFY(name) { \ >> + if (!vmx_msr_switch_area_verify(vmcs12->name##_count, \ >> + vmcs12->name##_addr, maxphyaddr)) { \ >> + pr_warn_ratelimited( \ >> + "%s: invalid MSR_{LOAD,STORE} parameters", \ >> + #name); \ >> + return false; \ >> + } \ >> + } > Just a personal opinion: replacing the macro with a switch statement is > probably a little less clumsy Indeed. Better have function than accepts the field index and that has some translation table to derive the name for printing the message. > >> + VMCS12_MSR_SWITCH_VERIFY(vm_exit_msr_load); >> + VMCS12_MSR_SWITCH_VERIFY(vm_exit_msr_store); >> + VMCS12_MSR_SWITCH_VERIFY(vm_entry_msr_load); >> + return true; >> +} >> + >> +static bool vmx_load_msr_entry_verify(struct kvm_vcpu *vcpu, >> + struct msr_entry *e) >> +{ >> + if (e->index == MSR_FS_BASE || e->index == MSR_GS_BASE) >> + return false; >> + /* SMM is not supported */ >> + if (e->index == MSR_IA32_SMM_MONITOR_CTL) >> + return false; >> + /* x2APIC MSR accesses are not allowed */ >> + if (apic_x2apic_mode(vcpu->arch.apic) && (e->index >> 24) == 0x800) >> + return false; >> + if (e->reserved != 0) >> + return false; >> + return true; >> +} Is white-listing a safe approach for this? Wouldn't it be better to save/restore only a known set of MSRs, possibly adding an option to ignore others (with a warning) instead of returning an error (similar to what we do when L1 tries to write to an unknown MSR)? >> + >> +static bool vmx_msr_switch_is_protected_msr(u32 msr_index) >> +{ >> + /* Intel SDM: a processor MAY prevent writing to these MSRs when >> + * loading guest states on VM entries or saving guest states on VM >> + * exits. 
>> + * Assume our emulated processor DOES prevent writing */ >> + return msr_index == MSR_IA32_UCODE_WRITE || >> + msr_index == MSR_IA32_UCODE_REV; >> +} >> + >> +static unsigned int nested_vmx_load_msrs(struct kvm_vcpu *vcpu, >> + u32 count, u64 addr) >> +{ >> + unsigned int i; >> + struct msr_entry msr_entry; >> + struct msr_data msr; >> + >> + for (i = 1; i <= count; i++, addr += sizeof(msr_entry)) { >> + if (kvm_read_guest(vcpu->kvm, addr, >> + &msr_entry, sizeof(msr_entry))) { >> + pr_warn_ratelimited( >> + "Load MSR %d: cannot read GPA: 0x%llx\n", >> + i, addr); >> + return i; >> + } >> + if (!vmx_load_msr_entry_verify(vcpu, &msr_entry)) { >> + pr_warn_ratelimited( >> + "Load MSR %d: invalid MSR entry 0x%x 0x%x\n", >> + i, msr_entry.index, msr_entry.reserved); >> + return i; >> + } >> + msr.host_initiated = 0; >> + msr.index = msr_entry.index; >> + msr.data = msr_entry.data; >> + if (vmx_msr_switch_is_protected_msr(msr.index)) { >> + pr_warn_ratelimited( >> + "Load MSR %d: prevent writing to MSR 0x%x\n", >> + i, msr.index); >> + return i; >> + } >> + if (kvm_set_msr(vcpu, &msr)) { >> + pr_warn_ratelimited( >> + "Load MSR %d: cannot write 0x%llx to MSR 0x%x\n", >> + i, msr.data, msr.index); >> + return i; >> + } >> + } >> + return 0; >> +} >> + > To ease debugging, these warning messages should also specify that they are > Nested VMX specific. Also I think that the interfaces/functions you have introduced > can be split up into an introductory patch and a second patch can then use these > interfaces to provide the load/store functionality. > >> +static bool vmx_store_msr_entry_verify(struct kvm_vcpu *vcpu, >> + struct msr_entry *e) >> +{ >> + /* x2APIC MSR accesses are not allowed */ >> + if (apic_x2apic_mode(vcpu->arch.apic) && (e->index >> 24) == 0x800) >> + return false; >> + /* SMM is not supported */ >> + if (e->index == MSR_IA32_SMBASE) >> + return false; >> + if (e->reserved != 0) >> + return false; Same comment as above. 
>> + return true; >> +} >> + >> +static unsigned int nested_vmx_store_msrs(struct kvm_vcpu *vcpu, >> + u32 count, u64 addr) >> +{ >> + unsigned int i; >> + struct msr_entry msr_entry; >> + >> + for (i = 1; i <= count; i++, addr += sizeof(msr_entry)) { >> + if (kvm_read_guest(vcpu->kvm, addr, >> + &msr_entry, 2 * sizeof(u32))) { >> + pr_warn_ratelimited( >> + "Store MSR %d: cannot read GPA: 0x%llx\n", >> + i, addr); >> + return i; >> + } >> + if (!vmx_store_msr_entry_verify(vcpu, &msr_entry)) { >> + pr_warn_ratelimited( >> + "Store MSR %d: invalid MSR entry 0x%x 0x%x\n", >> + i, msr_entry.index, msr_entry.reserved); >> + return i; >> + } >> + if (vmx_get_msr(vcpu, msr_entry.index, &msr_entry.data)) { >> + pr_warn_ratelimited( >> + "Store MSR %d: cannot read MSR 0x%x\n", >> + i, msr_entry.index); >> + return i; >> + } >> + if (kvm_write_guest(vcpu->kvm, >> + addr + offsetof(struct msr_entry, data), >> + &msr_entry.data, sizeof(msr_entry.data))) { >> + pr_warn_ratelimited( >> + "Store MSR %d: cannot write to GPA: 0x%llx\n", >> + i, addr); >> + return i; >> + } >> + } >> + return 0; >> +} >> + >> /* >> * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 >> * for running an L2 nested guest. 
>> @@ -8507,6 +8664,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) >> struct vmcs12 *vmcs12; >> struct vcpu_vmx *vmx = to_vmx(vcpu); >> int cpu; >> + unsigned int msr; >> struct loaded_vmcs *vmcs02; >> bool ia32e; >> >> @@ -8556,11 +8714,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) >> return 1; >> } >> >> - if (vmcs12->vm_entry_msr_load_count > 0 || >> - vmcs12->vm_exit_msr_load_count > 0 || >> - vmcs12->vm_exit_msr_store_count > 0) { >> - pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n", >> - __func__); >> + if (!nested_vmx_msr_switch_verify(vcpu, vmcs12)) { >> nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); >> return 1; >> } >> @@ -8670,6 +8824,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) >> >> prepare_vmcs02(vcpu, vmcs12); >> >> + msr = nested_vmx_load_msrs(vcpu, vmcs12->vm_entry_msr_load_count, >> + vmcs12->vm_entry_msr_load_addr); >> + if (msr) >> + nested_vmx_entry_failure(vcpu, vmcs12, >> + EXIT_REASON_MSR_LOAD_FAILURE, msr); Don't you have to terminate the nested run attempt here as well -> return 1? 
>> + >> if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) >> return kvm_emulate_halt(vcpu); >> >> @@ -9099,6 +9259,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, >> vmcs_write64(GUEST_IA32_DEBUGCTL, 0); >> } >> >> +static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 abort_code) >> +{ >> + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); >> + >> + vmcs12->abort = abort_code; >> + pr_warn("Nested VMX abort %u\n", abort_code); >> + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); >> +} >> + >> /* >> * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 >> * and modify vmcs12 to make it see what it would expect to see there if >> @@ -9118,8 +9287,20 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, >> prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, >> exit_qualification); >> >> + if (exit_reason != EXIT_REASON_INVALID_STATE && >> + exit_reason != EXIT_REASON_MSR_LOAD_FAILURE) { >> + if (nested_vmx_store_msrs(vcpu, >> + vmcs12->vm_exit_msr_store_count, >> + vmcs12->vm_exit_msr_store_addr)) >> + nested_vmx_abort(vcpu, VMX_ABORT_MSR_STORE); >> + } >> + >> vmx_load_vmcs01(vcpu); >> >> + if (nested_vmx_load_msrs(vcpu, vmcs12->vm_exit_msr_load_count, >> + vmcs12->vm_exit_msr_load_addr)) >> + nested_vmx_abort(vcpu, VMX_ABORT_MSR_LOAD); >> + >> if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) >> && nested_exit_intr_ack_set(vcpu)) { >> int irq = kvm_cpu_get_interrupt(vcpu); >> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c >> index 5b45330..c9d1c4a 100644 >> --- a/virt/kvm/kvm_main.c >> +++ b/virt/kvm/kvm_main.c >> @@ -1582,6 +1582,7 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, >> } >> return 0; >> } >> +EXPORT_SYMBOL_GPL(kvm_write_guest); >> >> int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, >> gpa_t gpa, unsigned long len)
Sorry, please ignore the previous mail; it was rejected due to HTML format. 2014-12-07 18:21 GMT+08:00 Jan Kiszka <jan.kiszka@web.de>: > > On 2014-12-07 10:46, Bandan Das wrote: > > [Ccing a few others who might be interested] > > > > Hi Eugene, > > > > Did you look at the other patch that was posted for this functionality > > by Wincy Van ? > > > > https://lkml.org/lkml/2014/11/21/749 This patch is misformatted; I have already sent out a corrected one: https://lkml.org/lkml/2014/11/22/61 > > > It's good to see more momentum here, but, yes, we should definitely try > to join forces on this topic. It may have subtle corners that could > undermine the integrity of the host. And there are apparently a lot of > states to be validated for full compliance. Thus it would be good to > have kvm-unit-tests for all what is checked here. > > > > > Eugene Korenevsky <ekorenevsky@gmail.com> writes: > > > >> BitVisor hypervisor fails to start a nested guest due to lack of MSR > >> load/store support in KVM. > >> > >> This patch fixes this problem according to Intel SDM. > > > > It would be helpful for the commit message to be a little more > > descriptive. 
> >> > >> Signed-off-by: Eugene Korenevsky <ekorenevsky@gmail.com> > >> --- > >> arch/x86/include/asm/vmx.h | 6 ++ > >> arch/x86/include/uapi/asm/msr-index.h | 3 + > >> arch/x86/include/uapi/asm/vmx.h | 1 + > >> arch/x86/kvm/vmx.c | 191 +++++++++++++++++++++++++++++++++- > >> virt/kvm/kvm_main.c | 1 + > >> 5 files changed, 197 insertions(+), 5 deletions(-) > >> > >> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h > >> index bcbfade..e07414c 100644 > >> --- a/arch/x86/include/asm/vmx.h > >> +++ b/arch/x86/include/asm/vmx.h > >> @@ -454,6 +454,12 @@ struct vmx_msr_entry { > >> #define ENTRY_FAIL_VMCS_LINK_PTR 4 > >> > >> /* > >> + * VMX Abort codes > >> + */ > >> +#define VMX_ABORT_MSR_STORE 1 > >> +#define VMX_ABORT_MSR_LOAD 4 > >> + > >> +/* > >> * VM-instruction error numbers > >> */ > >> enum vm_instruction_error_number { > >> diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h > >> index e21331c..3c9c601 100644 > >> --- a/arch/x86/include/uapi/asm/msr-index.h > >> +++ b/arch/x86/include/uapi/asm/msr-index.h > >> @@ -316,6 +316,9 @@ > >> #define MSR_IA32_UCODE_WRITE 0x00000079 > >> #define MSR_IA32_UCODE_REV 0x0000008b > >> > >> +#define MSR_IA32_SMM_MONITOR_CTL 0x0000009b > >> +#define MSR_IA32_SMBASE 0x0000009e > >> + > > Extra tab here ? 
> > > >> #define MSR_IA32_PERF_STATUS 0x00000198 > >> #define MSR_IA32_PERF_CTL 0x00000199 > >> #define MSR_AMD_PSTATE_DEF_BASE 0xc0010064 > >> diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h > >> index 990a2fe..f5f8a62 100644 > >> --- a/arch/x86/include/uapi/asm/vmx.h > >> +++ b/arch/x86/include/uapi/asm/vmx.h > >> @@ -56,6 +56,7 @@ > >> #define EXIT_REASON_MSR_READ 31 > >> #define EXIT_REASON_MSR_WRITE 32 > >> #define EXIT_REASON_INVALID_STATE 33 > >> +#define EXIT_REASON_MSR_LOAD_FAILURE 34 > >> #define EXIT_REASON_MWAIT_INSTRUCTION 36 > >> #define EXIT_REASON_MONITOR_INSTRUCTION 39 > >> #define EXIT_REASON_PAUSE_INSTRUCTION 40 > >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > >> index 6a951d8..deebc96 100644 > >> --- a/arch/x86/kvm/vmx.c > >> +++ b/arch/x86/kvm/vmx.c > >> @@ -8498,6 +8498,163 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) > >> kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); > >> } > >> > >> +struct msr_entry { > >> + u32 index; > >> + u32 reserved; > >> + u64 data; > >> +} __packed; > >> + > > > > We already have struct vmx_msr_entry in vmx.h > > > >> +static bool vmx_msr_switch_area_verify(u32 count, u64 addr, int maxphyaddr) > >> +{ > >> + if (count == 0) > >> + return true; > >> + if ((addr & 0xf) != 0) > > Bits 11:0 have to be zero, in fact (page alignment required). Why? According to SDM(24.8.2), "If the VM-entry MSR-load count is not zero, the address must be 16-byte aligned." I think the macro IS_ALIGNED would be better. > > > >> + return false; > >> + if ((addr >> maxphyaddr) != 0) > >> + return false; > >> + if (((addr + count * sizeof(struct msr_entry) - 1) >> maxphyaddr) != 0) > >> + return false; > >> + return true; > >> +} > >> + > > Shouldn't we also check if any of bits 63:32 is set ? > > Why? The physical address just needs to be in the guest's address space > - but I wonder if that isn't already checked when actually reading the > area later on. 
> > > >> +static bool nested_vmx_msr_switch_verify(struct kvm_vcpu *vcpu, > >> + struct vmcs12 *vmcs12) > >> +{ > >> + int maxphyaddr = cpuid_maxphyaddr(vcpu); > >> + > >> +#define VMCS12_MSR_SWITCH_VERIFY(name) { \ > >> + if (!vmx_msr_switch_area_verify(vmcs12->name##_count, \ > >> + vmcs12->name##_addr, maxphyaddr)) { \ > >> + pr_warn_ratelimited( \ > >> + "%s: invalid MSR_{LOAD,STORE} parameters", \ > >> + #name); \ > >> + return false; \ > >> + } \ > >> + } > > Just a personal opinion: replacing the macro with a switch statement is > > probably a little less clumsy > > Indeed. Better have function than accepts the field index and that has > some translation table to derive the name for printing the message. > > > > >> + VMCS12_MSR_SWITCH_VERIFY(vm_exit_msr_load); > >> + VMCS12_MSR_SWITCH_VERIFY(vm_exit_msr_store); > >> + VMCS12_MSR_SWITCH_VERIFY(vm_entry_msr_load); > >> + return true; > >> +} > >> + > >> +static bool vmx_load_msr_entry_verify(struct kvm_vcpu *vcpu, > >> + struct msr_entry *e) > >> +{ > >> + if (e->index == MSR_FS_BASE || e->index == MSR_GS_BASE) > >> + return false; > >> + /* SMM is not supported */ > >> + if (e->index == MSR_IA32_SMM_MONITOR_CTL) > >> + return false; > >> + /* x2APIC MSR accesses are not allowed */ > >> + if (apic_x2apic_mode(vcpu->arch.apic) && (e->index >> 24) == 0x800) > >> + return false; > >> + if (e->reserved != 0) > >> + return false; > >> + return true; > >> +} > > Is white-listing a safe approach for this? Wouldn't it be better to > save/restore only a known set of MSRs, possibly adding an option to > ignore others (with a warning) instead of returning an error (similar to > what we do when L1 tries to write to an unknown MSR)? Agreed. According to SDM(26.4 LOADING MSRs), we need the checks above, and if any check fails, VM entry should fail. But for other MSRs, I think a white list is necessary to ensure the integrity of the host. 
> > >> + > >> +static bool vmx_msr_switch_is_protected_msr(u32 msr_index) > >> +{ > >> + /* Intel SDM: a processor MAY prevent writing to these MSRs when > >> + * loading guest states on VM entries or saving guest states on VM > >> + * exits. > >> + * Assume our emulated processor DOES prevent writing */ > >> + return msr_index == MSR_IA32_UCODE_WRITE || > >> + msr_index == MSR_IA32_UCODE_REV; > >> +} > >> + > >> +static unsigned int nested_vmx_load_msrs(struct kvm_vcpu *vcpu, > >> + u32 count, u64 addr) > >> +{ > >> + unsigned int i; > >> + struct msr_entry msr_entry; > >> + struct msr_data msr; > >> + > >> + for (i = 1; i <= count; i++, addr += sizeof(msr_entry)) { > >> + if (kvm_read_guest(vcpu->kvm, addr, > >> + &msr_entry, sizeof(msr_entry))) { > >> + pr_warn_ratelimited( > >> + "Load MSR %d: cannot read GPA: 0x%llx\n", > >> + i, addr); > >> + return i; > >> + } > >> + if (!vmx_load_msr_entry_verify(vcpu, &msr_entry)) { > >> + pr_warn_ratelimited( > >> + "Load MSR %d: invalid MSR entry 0x%x 0x%x\n", > >> + i, msr_entry.index, msr_entry.reserved); > >> + return i; > >> + } > >> + msr.host_initiated = 0; > >> + msr.index = msr_entry.index; > >> + msr.data = msr_entry.data; > >> + if (vmx_msr_switch_is_protected_msr(msr.index)) { > >> + pr_warn_ratelimited( > >> + "Load MSR %d: prevent writing to MSR 0x%x\n", > >> + i, msr.index); > >> + return i; > >> + } > >> + if (kvm_set_msr(vcpu, &msr)) { > >> + pr_warn_ratelimited( > >> + "Load MSR %d: cannot write 0x%llx to MSR 0x%x\n", > >> + i, msr.data, msr.index); > >> + return i; > >> + } > >> + } > >> + return 0; > >> +} > >> + > > To ease debugging, these warning messages should also specify that they are > > Nested VMX specific. Also I think that the interfaces/functions you have introduced > > can be split up into an introductory patch and a second patch can then use these > > interfaces to provide the load/store functionality. 
> > > >> +static bool vmx_store_msr_entry_verify(struct kvm_vcpu *vcpu, > >> + struct msr_entry *e) > >> +{ > >> + /* x2APIC MSR accesses are not allowed */ > >> + if (apic_x2apic_mode(vcpu->arch.apic) && (e->index >> 24) == 0x800) > >> + return false; > >> + /* SMM is not supported */ > >> + if (e->index == MSR_IA32_SMBASE) > >> + return false; > >> + if (e->reserved != 0) > >> + return false; > > Same comment as above. > > >> + return true; > >> +} > >> + > >> +static unsigned int nested_vmx_store_msrs(struct kvm_vcpu *vcpu, > >> + u32 count, u64 addr) > >> +{ > >> + unsigned int i; > >> + struct msr_entry msr_entry; > >> + > >> + for (i = 1; i <= count; i++, addr += sizeof(msr_entry)) { > >> + if (kvm_read_guest(vcpu->kvm, addr, > >> + &msr_entry, 2 * sizeof(u32))) { > >> + pr_warn_ratelimited( > >> + "Store MSR %d: cannot read GPA: 0x%llx\n", > >> + i, addr); > >> + return i; > >> + } > >> + if (!vmx_store_msr_entry_verify(vcpu, &msr_entry)) { > >> + pr_warn_ratelimited( > >> + "Store MSR %d: invalid MSR entry 0x%x 0x%x\n", > >> + i, msr_entry.index, msr_entry.reserved); > >> + return i; > >> + } > >> + if (vmx_get_msr(vcpu, msr_entry.index, &msr_entry.data)) { > >> + pr_warn_ratelimited( > >> + "Store MSR %d: cannot read MSR 0x%x\n", > >> + i, msr_entry.index); > >> + return i; > >> + } > >> + if (kvm_write_guest(vcpu->kvm, > >> + addr + offsetof(struct msr_entry, data), > >> + &msr_entry.data, sizeof(msr_entry.data))) { > >> + pr_warn_ratelimited( > >> + "Store MSR %d: cannot write to GPA: 0x%llx\n", > >> + i, addr); > >> + return i; > >> + } > >> + } > >> + return 0; > >> +} > >> + > >> /* > >> * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 > >> * for running an L2 nested guest. 
> >> @@ -8507,6 +8664,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) > >> struct vmcs12 *vmcs12; > >> struct vcpu_vmx *vmx = to_vmx(vcpu); > >> int cpu; > >> + unsigned int msr; > >> struct loaded_vmcs *vmcs02; > >> bool ia32e; > >> > >> @@ -8556,11 +8714,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) > >> return 1; > >> } > >> > >> - if (vmcs12->vm_entry_msr_load_count > 0 || > >> - vmcs12->vm_exit_msr_load_count > 0 || > >> - vmcs12->vm_exit_msr_store_count > 0) { > >> - pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n", > >> - __func__); > >> + if (!nested_vmx_msr_switch_verify(vcpu, vmcs12)) { > >> nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); > >> return 1; > >> } > >> @@ -8670,6 +8824,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) > >> > >> prepare_vmcs02(vcpu, vmcs12); > >> > >> + msr = nested_vmx_load_msrs(vcpu, vmcs12->vm_entry_msr_load_count, > >> + vmcs12->vm_entry_msr_load_addr); > >> + if (msr) > >> + nested_vmx_entry_failure(vcpu, vmcs12, > >> + EXIT_REASON_MSR_LOAD_FAILURE, msr); > > Don't you have to terminate the nested run attempt here as well -> return 1? Indeed. According to SDM(26.4) "The VM entry fails if processing fails for any entry. The logical processor responds to such failures by loading state from the host-state area, as it would for a VM exit. See Section 26.7." In addition, we should load L1's state here. I have done it in my patch.
> > > >> + > >> if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) > >> return kvm_emulate_halt(vcpu); > >> > >> @@ -9099,6 +9259,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, > >> vmcs_write64(GUEST_IA32_DEBUGCTL, 0); > >> } > >> > >> +static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 abort_code) > >> +{ > >> + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); > >> + > >> + vmcs12->abort = abort_code; > >> + pr_warn("Nested VMX abort %u\n", abort_code); > >> + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); > >> +} > >> + > >> /* > >> * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 > >> * and modify vmcs12 to make it see what it would expect to see there if > >> @@ -9118,8 +9287,20 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, > >> prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, > >> exit_qualification); > >> > >> + if (exit_reason != EXIT_REASON_INVALID_STATE && > >> + exit_reason != EXIT_REASON_MSR_LOAD_FAILURE) { > >> + if (nested_vmx_store_msrs(vcpu, > >> + vmcs12->vm_exit_msr_store_count, > >> + vmcs12->vm_exit_msr_store_addr)) > >> + nested_vmx_abort(vcpu, VMX_ABORT_MSR_STORE); > >> + } > >> + > >> vmx_load_vmcs01(vcpu); > >> > >> + if (nested_vmx_load_msrs(vcpu, vmcs12->vm_exit_msr_load_count, > >> + vmcs12->vm_exit_msr_load_addr)) > >> + nested_vmx_abort(vcpu, VMX_ABORT_MSR_LOAD); > >> + We should load L1's MSRs in load_vmcs12_host_state, not here. If we fail to load an MSR during nested entry, we should terminate the nested entry and load L1's state. Both nested_vmx_vmexit and nested_vmx_entry_failure call load_vmcs12_host_state, so it is better to load L1's MSRs there.
> > >> if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) > >> && nested_exit_intr_ack_set(vcpu)) { > >> int irq = kvm_cpu_get_interrupt(vcpu); > >> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > >> index 5b45330..c9d1c4a 100644 > >> --- a/virt/kvm/kvm_main.c > >> +++ b/virt/kvm/kvm_main.c > >> @@ -1582,6 +1582,7 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, > >> } > >> return 0; > >> } > >> +EXPORT_SYMBOL_GPL(kvm_write_guest); > >> > >> int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, > >> gpa_t gpa, unsigned long len) > > Thanks, Wincy -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2014-12-08 04:49, Wincy Van wrote: >>>> +static bool vmx_msr_switch_area_verify(u32 count, u64 addr, int maxphyaddr) >>>> +{ >>>> + if (count == 0) >>>> + return true; >>>> + if ((addr & 0xf) != 0) >> >> Bits 11:0 have to be zero, in fact (page alignment required). > > > Why? According to SDM(24.8.2), "If the VM-entry MSR-load count is not zero, > the address must be 16-byte aligned." True, sorry, confused with the MSR usage bitmap. > > I think the macro IS_ALIGNED would be better. Yes. Jan
> Thus it would be good to > have kvm-unit-tests for all what is checked here. I'll try to implement unit tests a bit later. The fixed patch, based on the comments and criticism from this thread, follows. > Indeed. Better have function than accepts the field index and that has > some translation table to derive the name for printing the message. Is having a translation table for such a simple case overkill? Maybe printing VMCS field indexes would be enough? > Is white-listing a safe approach for this? Wouldn't it be better to > save/restore only a known set of MSRs, possibly adding an option to > ignore others (with a warning) instead of returning an error (similar to > what we do when L1 tries to write to an unknown MSR)? MSRs are written via the kvm_set_msr() path and restriction of the MSR set is done there. If a write to an ignored or unhandled MSR is attempted, the whole MSR loading transaction fails. This check (vmx_msr_switch_is_protected_msr) adds a stricter check on u-code-related MSRs (these model-specific checks are specified in SDM 26.4, 27.4 and 35). kvm_set_msr() simply ignores write attempts to these MSRs (MSR_IA32_UCODE_WRITE, MSR_IA32_UCODE_REV) and returns 0 (success). Instead, we should fail here according to the SDM. > Same comment as above. kvm_{get/set}_msr() are safe, aren't they? >>> + msr = nested_vmx_load_msrs(vcpu, vmcs12->vm_entry_msr_load_count, >>> + vmcs12->vm_entry_msr_load_addr); >>> + if (msr) >>> + nested_vmx_entry_failure(vcpu, vmcs12, >>> + EXIT_REASON_MSR_LOAD_FAILURE, msr); > > Don't you have to terminate the nested run attempt here as well -> return 1? Fixed.
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index bcbfade..e07414c 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -454,6 +454,12 @@ struct vmx_msr_entry { #define ENTRY_FAIL_VMCS_LINK_PTR 4 /* + * VMX Abort codes + */ +#define VMX_ABORT_MSR_STORE 1 +#define VMX_ABORT_MSR_LOAD 4 + +/* * VM-instruction error numbers */ enum vm_instruction_error_number { diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index e21331c..3c9c601 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -316,6 +316,9 @@ #define MSR_IA32_UCODE_WRITE 0x00000079 #define MSR_IA32_UCODE_REV 0x0000008b +#define MSR_IA32_SMM_MONITOR_CTL 0x0000009b +#define MSR_IA32_SMBASE 0x0000009e + #define MSR_IA32_PERF_STATUS 0x00000198 #define MSR_IA32_PERF_CTL 0x00000199 #define MSR_AMD_PSTATE_DEF_BASE 0xc0010064 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 990a2fe..f5f8a62 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -56,6 +56,7 @@ #define EXIT_REASON_MSR_READ 31 #define EXIT_REASON_MSR_WRITE 32 #define EXIT_REASON_INVALID_STATE 33 +#define EXIT_REASON_MSR_LOAD_FAILURE 34 #define EXIT_REASON_MWAIT_INSTRUCTION 36 #define EXIT_REASON_MONITOR_INSTRUCTION 39 #define EXIT_REASON_PAUSE_INSTRUCTION 40 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6a951d8..deebc96 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8498,6 +8498,163 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); } +struct msr_entry { + u32 index; + u32 reserved; + u64 data; +} __packed; + +static bool vmx_msr_switch_area_verify(u32 count, u64 addr, int maxphyaddr) +{ + if (count == 0) + return true; + if ((addr & 0xf) != 0) + return false; + if ((addr >> maxphyaddr) != 0) + return false; + if (((addr + count * sizeof(struct 
msr_entry) - 1) >> maxphyaddr) != 0) + return false; + return true; +} + +static bool nested_vmx_msr_switch_verify(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int maxphyaddr = cpuid_maxphyaddr(vcpu); + +#define VMCS12_MSR_SWITCH_VERIFY(name) { \ + if (!vmx_msr_switch_area_verify(vmcs12->name##_count, \ + vmcs12->name##_addr, maxphyaddr)) { \ + pr_warn_ratelimited( \ + "%s: invalid MSR_{LOAD,STORE} parameters", \ + #name); \ + return false; \ + } \ + } + VMCS12_MSR_SWITCH_VERIFY(vm_exit_msr_load); + VMCS12_MSR_SWITCH_VERIFY(vm_exit_msr_store); + VMCS12_MSR_SWITCH_VERIFY(vm_entry_msr_load); + return true; +} + +static bool vmx_load_msr_entry_verify(struct kvm_vcpu *vcpu, + struct msr_entry *e) +{ + if (e->index == MSR_FS_BASE || e->index == MSR_GS_BASE) + return false; + /* SMM is not supported */ + if (e->index == MSR_IA32_SMM_MONITOR_CTL) + return false; + /* x2APIC MSR accesses are not allowed */ + if (apic_x2apic_mode(vcpu->arch.apic) && (e->index >> 24) == 0x800) + return false; + if (e->reserved != 0) + return false; + return true; +} + +static bool vmx_msr_switch_is_protected_msr(u32 msr_index) +{ + /* Intel SDM: a processor MAY prevent writing to these MSRs when + * loading guest states on VM entries or saving guest states on VM + * exits. 
+ * Assume our emulated processor DOES prevent writing */ + return msr_index == MSR_IA32_UCODE_WRITE || + msr_index == MSR_IA32_UCODE_REV; +} + +static unsigned int nested_vmx_load_msrs(struct kvm_vcpu *vcpu, + u32 count, u64 addr) +{ + unsigned int i; + struct msr_entry msr_entry; + struct msr_data msr; + + for (i = 1; i <= count; i++, addr += sizeof(msr_entry)) { + if (kvm_read_guest(vcpu->kvm, addr, + &msr_entry, sizeof(msr_entry))) { + pr_warn_ratelimited( + "Load MSR %d: cannot read GPA: 0x%llx\n", + i, addr); + return i; + } + if (!vmx_load_msr_entry_verify(vcpu, &msr_entry)) { + pr_warn_ratelimited( + "Load MSR %d: invalid MSR entry 0x%x 0x%x\n", + i, msr_entry.index, msr_entry.reserved); + return i; + } + msr.host_initiated = 0; + msr.index = msr_entry.index; + msr.data = msr_entry.data; + if (vmx_msr_switch_is_protected_msr(msr.index)) { + pr_warn_ratelimited( + "Load MSR %d: prevent writing to MSR 0x%x\n", + i, msr.index); + return i; + } + if (kvm_set_msr(vcpu, &msr)) { + pr_warn_ratelimited( + "Load MSR %d: cannot write 0x%llx to MSR 0x%x\n", + i, msr.data, msr.index); + return i; + } + } + return 0; +} + +static bool vmx_store_msr_entry_verify(struct kvm_vcpu *vcpu, + struct msr_entry *e) +{ + /* x2APIC MSR accesses are not allowed */ + if (apic_x2apic_mode(vcpu->arch.apic) && (e->index >> 24) == 0x800) + return false; + /* SMM is not supported */ + if (e->index == MSR_IA32_SMBASE) + return false; + if (e->reserved != 0) + return false; + return true; +} + +static unsigned int nested_vmx_store_msrs(struct kvm_vcpu *vcpu, + u32 count, u64 addr) +{ + unsigned int i; + struct msr_entry msr_entry; + + for (i = 1; i <= count; i++, addr += sizeof(msr_entry)) { + if (kvm_read_guest(vcpu->kvm, addr, + &msr_entry, 2 * sizeof(u32))) { + pr_warn_ratelimited( + "Store MSR %d: cannot read GPA: 0x%llx\n", + i, addr); + return i; + } + if (!vmx_store_msr_entry_verify(vcpu, &msr_entry)) { + pr_warn_ratelimited( + "Store MSR %d: invalid MSR entry 0x%x 0x%x\n", + i, 
msr_entry.index, msr_entry.reserved); + return i; + } + if (vmx_get_msr(vcpu, msr_entry.index, &msr_entry.data)) { + pr_warn_ratelimited( + "Store MSR %d: cannot read MSR 0x%x\n", + i, msr_entry.index); + return i; + } + if (kvm_write_guest(vcpu->kvm, + addr + offsetof(struct msr_entry, data), + &msr_entry.data, sizeof(msr_entry.data))) { + pr_warn_ratelimited( + "Store MSR %d: cannot write to GPA: 0x%llx\n", + i, addr); + return i; + } + } + return 0; +} + /* * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 * for running an L2 nested guest. @@ -8507,6 +8664,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) struct vmcs12 *vmcs12; struct vcpu_vmx *vmx = to_vmx(vcpu); int cpu; + unsigned int msr; struct loaded_vmcs *vmcs02; bool ia32e; @@ -8556,11 +8714,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if (vmcs12->vm_entry_msr_load_count > 0 || - vmcs12->vm_exit_msr_load_count > 0 || - vmcs12->vm_exit_msr_store_count > 0) { - pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n", - __func__); + if (!nested_vmx_msr_switch_verify(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } @@ -8670,6 +8824,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) prepare_vmcs02(vcpu, vmcs12); + msr = nested_vmx_load_msrs(vcpu, vmcs12->vm_entry_msr_load_count, + vmcs12->vm_entry_msr_load_addr); + if (msr) + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_MSR_LOAD_FAILURE, msr); + if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) return kvm_emulate_halt(vcpu); @@ -9099,6 +9259,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_write64(GUEST_IA32_DEBUGCTL, 0); } +static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 abort_code) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + vmcs12->abort = abort_code; + pr_warn("Nested VMX abort %u\n", abort_code); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); +} + /* * Emulate 
an exit from nested guest (L2) to L1, i.e., prepare to run L1 * and modify vmcs12 to make it see what it would expect to see there if @@ -9118,8 +9287,20 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, exit_qualification); + if (exit_reason != EXIT_REASON_INVALID_STATE && + exit_reason != EXIT_REASON_MSR_LOAD_FAILURE) { + if (nested_vmx_store_msrs(vcpu, + vmcs12->vm_exit_msr_store_count, + vmcs12->vm_exit_msr_store_addr)) + nested_vmx_abort(vcpu, VMX_ABORT_MSR_STORE); + } + vmx_load_vmcs01(vcpu); + if (nested_vmx_load_msrs(vcpu, vmcs12->vm_exit_msr_load_count, + vmcs12->vm_exit_msr_load_addr)) + nested_vmx_abort(vcpu, VMX_ABORT_MSR_LOAD); + if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) && nested_exit_intr_ack_set(vcpu)) { int irq = kvm_cpu_get_interrupt(vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5b45330..c9d1c4a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1582,6 +1582,7 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, } return 0; } +EXPORT_SYMBOL_GPL(kvm_write_guest); int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len)
The BitVisor hypervisor fails to start a nested guest due to the lack of VM-entry/VM-exit MSR load/store support in nested KVM. This patch implements the VM-entry MSR-load area and the VM-exit MSR-store and MSR-load areas for nested VMX, including validation of the MSR switch areas and VMX-abort handling on failure, as described in the Intel SDM (sections 24.7.2, 24.8.2, 26.4 and 27.4), fixing this problem. Signed-off-by: Eugene Korenevsky <ekorenevsky@gmail.com> --- arch/x86/include/asm/vmx.h | 6 ++ arch/x86/include/uapi/asm/msr-index.h | 3 + arch/x86/include/uapi/asm/vmx.h | 1 + arch/x86/kvm/vmx.c | 191 +++++++++++++++++++++++++++++++++- virt/kvm/kvm_main.c | 1 + 5 files changed, 197 insertions(+), 5 deletions(-)