
[v3,3/4] KVM: nVMX: relax canonical checks on some x86 registers in vmx host state

Message ID 20240815123349.729017-4-mlevitsk@redhat.com (mailing list archive)
State New, archived
Headers show
Series Relax canonical checks on some arch msrs

Commit Message

Maxim Levitsky Aug. 15, 2024, 12:33 p.m. UTC
Several of x86's architectural registers contain a linear address base, and
thus must contain a canonical address.

This includes segment and segment-like bases (FS/GS base, GDT, IDT, LDT, TR),
the addresses used by the SYSENTER and SYSCALL instructions, and probably more.

As it turns out, when the x86 architecture was updated to 5-level paging /
57-bit virtual addresses, these fields were allowed to contain a full
57-bit address regardless of the state of CR4.LA57.

The main reason behind this decision is that 5-level paging, and even
paging itself, can be temporarily disabled (e.g. by SMM entry), leaving
non-canonical values in these fields.
Another reason is that an OS might prepare these fields before it switches
to 5-level paging.
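
For reference, the canonicality check itself is just a sign-extension test; the
sketch below is illustrative only (the is_canonical() helper name is made up
here, mirroring the semantics expected by the __is_canonical_address() call
used in the patch), with N chosen from the CPU's support for LA57 rather than
from CR4.LA57 for the fields above:

#include <stdbool.h>
#include <stdint.h>

/* Canonical for N-bit linear addresses: bits 63:N-1 are a sign extension of
 * bit N-1 (N = 48 without LA57 support, 57 with it). */
static bool is_canonical(uint64_t la, unsigned int n)
{
	return (uint64_t)((int64_t)(la << (64 - n)) >> (64 - n)) == la;
}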

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

Comments

Maxim Levitsky Aug. 16, 2024, 10:40 a.m. UTC | #1
On Thu, 2024-08-15 at 15:33 +0300, Maxim Levitsky wrote:
> Several of x86's architectural registers contain a linear address base, and
> thus must contain a canonical address.
> 
> This includes segment and segment-like bases (FS/GS base, GDT, IDT, LDT, TR),
> the addresses used by the SYSENTER and SYSCALL instructions, and probably more.
> 
> As it turns out, when the x86 architecture was updated to 5-level paging /
> 57-bit virtual addresses, these fields were allowed to contain a full
> 57-bit address regardless of the state of CR4.LA57.
> 
> The main reason behind this decision is that 5-level paging, and even
> paging itself, can be temporarily disabled (e.g. by SMM entry), leaving
> non-canonical values in these fields.
> Another reason is that an OS might prepare these fields before it switches
> to 5-level paging.

Hi,

Note that I haven't included a fix for HOST_RIP. I did a bare metal check today,
and indeed the microcode does check CR4.LA57 (the one stored in the VMCS), as
you suspected.

I will add a patch with this mostly theoretical fix to this series when I send a new revision.

Second, I kept the canonical check on vmcs12->guest_bndcfgs because Intel has
deprecated this feature and none of the CPUs which support 5-level paging support MPX.

Also, since this is a guest state field, I think it might be possible to just
remove the check, because the value of this field is copied to vmcs02 and the
CPU's microcode should do the same check that KVM does.

Best regards,
	Maxim Levitsky


> 
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> ---
>  arch/x86/kvm/vmx/nested.c | 30 +++++++++++++++++++++++-------
>  1 file changed, 23 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 2392a7ef254d..3f18edff80ac 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -2969,6 +2969,22 @@ static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
>         return 0;
>  }
>  
> +static bool is_l1_noncanonical_address_static(u64 la, struct kvm_vcpu *vcpu)
> +{
> +       u8 max_guest_address_bits = guest_can_use(vcpu, X86_FEATURE_LA57) ? 57 : 48;
> +       /*
> +        * Most x86 arch registers that contain a linear address, such as
> +        * segment bases and the addresses used by instructions
> +        * (e.g. SYSENTER), have a static canonicality check whose width
> +        * depends only on the CPU's support for 5-level paging, not on
> +        * the state of CR4.LA57.
> +        *
> +        * In other words, the check depends only on the CPU model, not
> +        * on runtime state.
> +        */
> +       return !__is_canonical_address(la, max_guest_address_bits);
> +}
> +
>  static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
>                                        struct vmcs12 *vmcs12)
>  {
> @@ -2979,8 +2995,8 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
>             CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3)))
>                 return -EINVAL;
>  
> -       if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
> -           CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
> +       if (CC(is_l1_noncanonical_address_static(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
> +           CC(is_l1_noncanonical_address_static(vmcs12->host_ia32_sysenter_eip, vcpu)))
>                 return -EINVAL;
>  
>         if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
> @@ -3014,11 +3030,11 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
>             CC(vmcs12->host_ss_selector == 0 && !ia32e))
>                 return -EINVAL;
>  
> -       if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
> -           CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
> -           CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
> -           CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
> -           CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
> +       if (CC(is_l1_noncanonical_address_static(vmcs12->host_fs_base, vcpu)) ||
> +           CC(is_l1_noncanonical_address_static(vmcs12->host_gs_base, vcpu)) ||
> +           CC(is_l1_noncanonical_address_static(vmcs12->host_gdtr_base, vcpu)) ||
> +           CC(is_l1_noncanonical_address_static(vmcs12->host_idtr_base, vcpu)) ||
> +           CC(is_l1_noncanonical_address_static(vmcs12->host_tr_base, vcpu)) ||
>             CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
>                 return -EINVAL;
>
Sean Christopherson Aug. 16, 2024, 10:03 p.m. UTC | #2
On Fri, Aug 16, 2024, mlevitsk@redhat.com wrote:
> > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> > ---
> >  arch/x86/kvm/vmx/nested.c | 30 +++++++++++++++++++++++-------
> >  1 file changed, 23 insertions(+), 7 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> > index 2392a7ef254d..3f18edff80ac 100644
> > --- a/arch/x86/kvm/vmx/nested.c
> > +++ b/arch/x86/kvm/vmx/nested.c
> > @@ -2969,6 +2969,22 @@ static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
> >         return 0;
> >  }
> >  
> > +static bool is_l1_noncanonical_address_static(u64 la, struct kvm_vcpu *vcpu)
> > +{
> > +       u8 max_guest_address_bits = guest_can_use(vcpu, X86_FEATURE_LA57) ? 57 : 48;

I don't see any reason to use LA57 support from guest CPUID for the VMCS checks.
The virtualization hole exists and can't be safely plugged for all cases, so why
bother trying to plug it only for some cases?

It'd be very odd that an L1 could set a "bad" value via WRMSR, but then couldn't
load that same value on VM-Exit, e.g. if L1 gets the VMCS value by doing RDMSR.

> > +       /*
> > +        * Most x86 arch registers that contain a linear address, such as
> > +        * segment bases and the addresses used by instructions
> > +        * (e.g. SYSENTER), have a static canonicality check whose width
> > +        * depends only on the CPU's support for 5-level paging, not on
> > +        * the state of CR4.LA57.
> > +        *
> > +        * In other words, the check depends only on the CPU model, not
> > +        * on runtime state.
> > +        */
> > +       return !__is_canonical_address(la, max_guest_address_bits);
> > +}
> > +
> >  static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
> >                                        struct vmcs12 *vmcs12)
> >  {
> > @@ -2979,8 +2995,8 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
> >             CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3)))
> >                 return -EINVAL;
> >  
> > -       if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
> > -           CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
> > +       if (CC(is_l1_noncanonical_address_static(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
> > +           CC(is_l1_noncanonical_address_static(vmcs12->host_ia32_sysenter_eip, vcpu)))
> >                 return -EINVAL;
> >  
> >         if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
> > @@ -3014,11 +3030,11 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
> >             CC(vmcs12->host_ss_selector == 0 && !ia32e))
> >                 return -EINVAL;
> >  
> > -       if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
> > -           CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
> > -           CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
> > -           CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
> > -           CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
> > +       if (CC(is_l1_noncanonical_address_static(vmcs12->host_fs_base, vcpu)) ||
> > +           CC(is_l1_noncanonical_address_static(vmcs12->host_gs_base, vcpu)) ||
> > +           CC(is_l1_noncanonical_address_static(vmcs12->host_gdtr_base, vcpu)) ||
> > +           CC(is_l1_noncanonical_address_static(vmcs12->host_idtr_base, vcpu)) ||
> > +           CC(is_l1_noncanonical_address_static(vmcs12->host_tr_base, vcpu)) ||

If loads via LTR, LLDT, and LGDT are indeed exempt, then we need to update
emul_is_noncanonical_address() too.

The best idea I have is to have a separate flow for system registers (not a great
name, but I can't think of anything better), and the

E.g. s/is_host_noncanonical_msr_value/is_non_canonical_system_reg, and then
wire that up to the emulator.
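
A minimal sketch of that shape, reusing only the helpers this series already
uses (the name comes from the substitution above; treat the body as
illustrative rather than the final wiring):

static bool is_non_canonical_system_reg(u64 la, struct kvm_vcpu *vcpu)
{
	/*
	 * "System" registers (segment/descriptor-table bases, SYSENTER
	 * MSRs, ...) get the static check: the width depends only on
	 * whether the vCPU can use LA57, never on CR4.LA57.
	 */
	u8 max_bits = guest_can_use(vcpu, X86_FEATURE_LA57) ? 57 : 48;

	return !__is_canonical_address(la, max_bits);
}

emul_is_noncanonical_address() and the VMCS host-state checks above could then
call this helper instead of the CR4.LA57-sensitive is_noncanonical_address().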

> >             CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
> >                 return -EINVAL;
> >  
>
Maxim Levitsky Aug. 20, 2024, 12:19 p.m. UTC | #3
On Fri, 2024-08-16 at 15:03 -0700, Sean Christopherson wrote:
> On Fri, Aug 16, 2024, mlevitsk@redhat.com wrote:
> > > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> > > ---
> > >  arch/x86/kvm/vmx/nested.c | 30 +++++++++++++++++++++++-------
> > >  1 file changed, 23 insertions(+), 7 deletions(-)
> > > 
> > > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> > > index 2392a7ef254d..3f18edff80ac 100644
> > > --- a/arch/x86/kvm/vmx/nested.c
> > > +++ b/arch/x86/kvm/vmx/nested.c
> > > @@ -2969,6 +2969,22 @@ static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
> > >         return 0;
> > >  }
> > >  
> > > +static bool is_l1_noncanonical_address_static(u64 la, struct kvm_vcpu *vcpu)
> > > +{
> > > +       u8 max_guest_address_bits = guest_can_use(vcpu, X86_FEATURE_LA57) ? 57 : 48;
> 
> I don't see any reason to use LA57 support from guest CPUID for the VMCS checks.
> The virtualization hole exists and can't be safely plugged for all cases, so why
> bother trying to plug it only for some cases?

I also thought that way, but there is another argument:

My idea was that the guest really ought not to put non-canonical values into these
fields if its CPUID doesn't support 5-level paging. There is absolutely no reason for doing so.

If the guest does this via WRMSR though, most of the time the MSR is not intercepted, thus
it makes sense to allow this in the emulation patch as well, to be consistent, as we discussed.

But when the VMRESUME/VMLAUNCH instruction, which is *always* emulated, writes those MSRs
on VM exit, I don't see a reason to allow a virtualization hole.

But since it turns out (I didn't expect that) that instructions like LGDT also don't check
CR4.LA57, and these are also passed through, I guess singling out the VMX instructions no
longer makes sense.

> 
> It'd be very odd that an L1 could set a "bad" value via WRMSR, but then couldn't
> load that same value on VM-Exit, e.g. if L1 gets the VMCS value by doing RDMSR.
> 
> > > +       /*
> > > +        * Most x86 arch registers that contain a linear address, such as
> > > +        * segment bases and the addresses used by instructions
> > > +        * (e.g. SYSENTER), have a static canonicality check whose width
> > > +        * depends only on the CPU's support for 5-level paging, not on
> > > +        * the state of CR4.LA57.
> > > +        *
> > > +        * In other words, the check depends only on the CPU model, not
> > > +        * on runtime state.
> > > +        */
> > > +       return !__is_canonical_address(la, max_guest_address_bits);
> > > +}
> > > +
> > >  static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
> > >                                        struct vmcs12 *vmcs12)
> > >  {
> > > @@ -2979,8 +2995,8 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
> > >             CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3)))
> > >                 return -EINVAL;
> > >  
> > > -       if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
> > > -           CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
> > > +       if (CC(is_l1_noncanonical_address_static(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
> > > +           CC(is_l1_noncanonical_address_static(vmcs12->host_ia32_sysenter_eip, vcpu)))
> > >                 return -EINVAL;
> > >  
> > >         if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
> > > @@ -3014,11 +3030,11 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
> > >             CC(vmcs12->host_ss_selector == 0 && !ia32e))
> > >                 return -EINVAL;
> > >  
> > > -       if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
> > > -           CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
> > > -           CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
> > > -           CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
> > > -           CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
> > > +       if (CC(is_l1_noncanonical_address_static(vmcs12->host_fs_base, vcpu)) ||
> > > +           CC(is_l1_noncanonical_address_static(vmcs12->host_gs_base, vcpu)) ||
> > > +           CC(is_l1_noncanonical_address_static(vmcs12->host_gdtr_base, vcpu)) ||
> > > +           CC(is_l1_noncanonical_address_static(vmcs12->host_idtr_base, vcpu)) ||
> > > +           CC(is_l1_noncanonical_address_static(vmcs12->host_tr_base, vcpu)) ||
> 
> If loads via LTR, LLDT, and LGDT are indeed exempt, then we need to update
> emul_is_noncanonical_address() too.

Sadly the answer to this is yes, at least on Intel. I will test on AMD as soon as I grab
a Zen4 machine again.

And since these instructions are also all unintercepted, it makes sense to use host CPUID
for them as well.

I attached two kvm unit tests, which I will hopefully polish for publishing soon; they pass
with flying colors with this patch series and, unless I made a mistake, prove most of my
research.
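
For illustration (this is not the attached tests, just the kind of value they
would exercise), a linear address that is canonical when checked against 57
bits but non-canonical when checked against 48 bits:

#include <assert.h>
#include <stdint.h>

static int is_canonical(uint64_t la, int n)
{
	return (uint64_t)((int64_t)(la << (64 - n)) >> (64 - n)) == la;
}

int main(void)
{
	/* Bits 55:47 set, bits 63:56 clear: sign-extends correctly from
	 * bit 56 (57-bit check) but not from bit 47 (48-bit check). */
	uint64_t la = 0x00ff800000000000ULL;

	assert(is_canonical(la, 57));
	assert(!is_canonical(la, 48));
	return 0;
}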

I checked the HOST_RIP field separately by patching the L0 kernel and observing it
either hang/crash or fail VM entry of the first guest.

Best regards,
	Maxim Levitsky

> 
> The best idea I have is to have a separate flow for system registers (not a great
> name, but I can't think of anything better), and the
> 
> E.g. s/is_host_noncanonical_msr_value/is_non_canonical_system_reg, and then
> wire that up to the emulator.
> 
> > >             CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
> > >                 return -EINVAL;
> > >  



>

Patch

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 2392a7ef254d..3f18edff80ac 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2969,6 +2969,22 @@  static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+static bool is_l1_noncanonical_address_static(u64 la, struct kvm_vcpu *vcpu)
+{
+	u8 max_guest_address_bits = guest_can_use(vcpu, X86_FEATURE_LA57) ? 57 : 48;
+	/*
+	 * Most x86 arch registers that contain a linear address, such as
+	 * segment bases and the addresses used by instructions
+	 * (e.g. SYSENTER), have a static canonicality check whose width
+	 * depends only on the CPU's support for 5-level paging, not on
+	 * the state of CR4.LA57.
+	 *
+	 * In other words, the check depends only on the CPU model, not
+	 * on runtime state.
+	 */
+	return !__is_canonical_address(la, max_guest_address_bits);
+}
+
 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
 				       struct vmcs12 *vmcs12)
 {
@@ -2979,8 +2995,8 @@  static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
 	    CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3)))
 		return -EINVAL;
 
-	if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
-	    CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
+	if (CC(is_l1_noncanonical_address_static(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
+	    CC(is_l1_noncanonical_address_static(vmcs12->host_ia32_sysenter_eip, vcpu)))
 		return -EINVAL;
 
 	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
@@ -3014,11 +3030,11 @@  static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
 	    CC(vmcs12->host_ss_selector == 0 && !ia32e))
 		return -EINVAL;
 
-	if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
-	    CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
-	    CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
-	    CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
-	    CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
+	if (CC(is_l1_noncanonical_address_static(vmcs12->host_fs_base, vcpu)) ||
+	    CC(is_l1_noncanonical_address_static(vmcs12->host_gs_base, vcpu)) ||
+	    CC(is_l1_noncanonical_address_static(vmcs12->host_gdtr_base, vcpu)) ||
+	    CC(is_l1_noncanonical_address_static(vmcs12->host_idtr_base, vcpu)) ||
+	    CC(is_l1_noncanonical_address_static(vmcs12->host_tr_base, vcpu)) ||
 	    CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
 		return -EINVAL;