[1/2] KVM: nVMX: fix msr bitmaps to prevent L2 from accessing L0 x2APIC

Message ID 20160808181623.12132-2-rkrcmar@redhat.com (mailing list archive)
State New, archived

Commit Message

Radim Krčmář Aug. 8, 2016, 6:16 p.m. UTC
msr bitmap can be used to avoid a VM exit (interception) on guest MSR
accesses.  In some configurations of VMX controls, the guest can even
directly access host's x2APIC MSRs.  See SDM 29.5 VIRTUALIZING MSR-BASED
APIC ACCESSES.

L2 could read all L0's x2APIC MSRs and write TPR, EOI, and SELF_IPI.
To do so, L1 would first trick KVM to disable all possible interceptions
by enabling APICv features and then would turn those features off;
nested_vmx_merge_msr_bitmap() only disabled interceptions, so VMX would
not intercept previously enabled MSRs even though they were not safe
with the new configuration.

Correctly re-enabling interceptions is not enough as a second bug would
still allow L1+L2 to access host's MSRs: msr bitmap was shared for all
VMCSs, so L1 could trigger a race to get the desired combination of msr
bitmap and VMX controls.

This fix allocates a msr bitmap for every L1 VCPU, allows only safe
x2APIC MSRs from L1's msr bitmap, and disables msr bitmaps if they would
have to intercept everything anyway.

Fixes: 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap")
Reported-by: Jim Mattson <jmattson@google.com>
Suggested-by: Wincy Van <fanwenyi0529@gmail.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 107 ++++++++++++++++++++++-------------------------------
 1 file changed, 44 insertions(+), 63 deletions(-)

Comments

Yang Zhang Aug. 9, 2016, 9:32 a.m. UTC | #1
On 2016/8/9 2:16, Radim Krčmář wrote:
> msr bitmap can be used to avoid a VM exit (interception) on guest MSR
> accesses.  In some configurations of VMX controls, the guest can even
> directly access host's x2APIC MSRs.  See SDM 29.5 VIRTUALIZING MSR-BASED
> APIC ACCESSES.
>
> L2 could read all L0's x2APIC MSRs and write TPR, EOI, and SELF_IPI.
> To do so, L1 would first trick KVM to disable all possible interceptions
> by enabling APICv features and then would turn those features off;
> nested_vmx_merge_msr_bitmap() only disabled interceptions, so VMX would
> not intercept previously enabled MSRs even though they were not safe
> with the new configuration.
>
> Correctly re-enabling interceptions is not enough as a second bug would
> still allow L1+L2 to access host's MSRs: msr bitmap was shared for all
> VMCSs, so L1 could trigger a race to get the desired combination of msr
> bitmap and VMX controls.
>
> This fix allocates a msr bitmap for every L1 VCPU, allows only safe
> x2APIC MSRs from L1's msr bitmap, and disables msr bitmaps if they would
> have to intercept everything anyway.
>
> Fixes: 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap")
> Reported-by: Jim Mattson <jmattson@google.com>
> Suggested-by: Wincy Van <fanwenyi0529@gmail.com>
> Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
> ---
>  arch/x86/kvm/vmx.c | 107 ++++++++++++++++++++++-------------------------------
>  1 file changed, 44 insertions(+), 63 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index a45d8580f91e..c66ac2c70d22 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -435,6 +435,8 @@ struct nested_vmx {
>  	bool pi_pending;
>  	u16 posted_intr_nv;
>
> +	unsigned long *msr_bitmap;
> +
>  	struct hrtimer preemption_timer;
>  	bool preemption_timer_expired;
>
> @@ -924,7 +926,6 @@ static unsigned long *vmx_msr_bitmap_legacy;
>  static unsigned long *vmx_msr_bitmap_longmode;
>  static unsigned long *vmx_msr_bitmap_legacy_x2apic;
>  static unsigned long *vmx_msr_bitmap_longmode_x2apic;
> -static unsigned long *vmx_msr_bitmap_nested;
>  static unsigned long *vmx_vmread_bitmap;
>  static unsigned long *vmx_vmwrite_bitmap;
>
> @@ -2508,7 +2509,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
>  	unsigned long *msr_bitmap;
>
>  	if (is_guest_mode(vcpu))
> -		msr_bitmap = vmx_msr_bitmap_nested;
> +		msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
>  	else if (cpu_has_secondary_exec_ctrls() &&
>  		 (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
>  		  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
> @@ -6363,13 +6364,6 @@ static __init int hardware_setup(void)
>  	if (!vmx_msr_bitmap_longmode_x2apic)
>  		goto out4;
>
> -	if (nested) {
> -		vmx_msr_bitmap_nested =
> -			(unsigned long *)__get_free_page(GFP_KERNEL);
> -		if (!vmx_msr_bitmap_nested)
> -			goto out5;
> -	}
> -
>  	vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
>  	if (!vmx_vmread_bitmap)
>  		goto out6;
> @@ -6392,8 +6386,6 @@ static __init int hardware_setup(void)
>
>  	memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
>  	memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
> -	if (nested)
> -		memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
>
>  	if (setup_vmcs_config(&vmcs_config) < 0) {
>  		r = -EIO;
> @@ -6529,9 +6521,6 @@ out8:
>  out7:
>  	free_page((unsigned long)vmx_vmread_bitmap);
>  out6:
> -	if (nested)
> -		free_page((unsigned long)vmx_msr_bitmap_nested);
> -out5:
>  	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
>  out4:
>  	free_page((unsigned long)vmx_msr_bitmap_longmode);
> @@ -6557,8 +6546,6 @@ static __exit void hardware_unsetup(void)
>  	free_page((unsigned long)vmx_io_bitmap_a);
>  	free_page((unsigned long)vmx_vmwrite_bitmap);
>  	free_page((unsigned long)vmx_vmread_bitmap);
> -	if (nested)
> -		free_page((unsigned long)vmx_msr_bitmap_nested);
>
>  	free_kvm_area();
>  }
> @@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>  		return 1;
>  	}
>
> +	if (cpu_has_vmx_msr_bitmap()) {
> +		vmx->nested.msr_bitmap =
> +				(unsigned long *)__get_free_page(GFP_KERNEL);
> +		if (!vmx->nested.msr_bitmap)
> +			goto out_msr_bitmap;
> +	}
> +

We export msr_bitmap to the guest even if it is not supported by hardware, so
we need to allocate msr_bitmap for L1 unconditionally.

Wincy Van Aug. 9, 2016, 10:19 a.m. UTC | #2
On Tue, Aug 9, 2016 at 5:32 PM, Yang Zhang <yang.zhang.wz@gmail.com> wrote:
> On 2016/8/9 2:16, Radim Krčmář wrote:
>>
>> msr bitmap can be used to avoid a VM exit (interception) on guest MSR
>> accesses.  In some configurations of VMX controls, the guest can even
>> directly access host's x2APIC MSRs.  See SDM 29.5 VIRTUALIZING MSR-BASED
>> APIC ACCESSES.
>>
>> L2 could read all L0's x2APIC MSRs and write TPR, EOI, and SELF_IPI.
>> To do so, L1 would first trick KVM to disable all possible interceptions
>> by enabling APICv features and then would turn those features off;
>> nested_vmx_merge_msr_bitmap() only disabled interceptions, so VMX would
>> not intercept previously enabled MSRs even though they were not safe
>> with the new configuration.
>>
>> Correctly re-enabling interceptions is not enough as a second bug would
>> still allow L1+L2 to access host's MSRs: msr bitmap was shared for all
>> VMCSs, so L1 could trigger a race to get the desired combination of msr
>> bitmap and VMX controls.
>>
>> This fix allocates a msr bitmap for every L1 VCPU, allows only safe
>> x2APIC MSRs from L1's msr bitmap, and disables msr bitmaps if they would
>> have to intercept everything anyway.
>>
>> Fixes: 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap")
>> Reported-by: Jim Mattson <jmattson@google.com>
>> Suggested-by: Wincy Van <fanwenyi0529@gmail.com>
>> Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
>> ---
>>  arch/x86/kvm/vmx.c | 107
>> ++++++++++++++++++++++-------------------------------
>>  1 file changed, 44 insertions(+), 63 deletions(-)
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index a45d8580f91e..c66ac2c70d22 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -435,6 +435,8 @@ struct nested_vmx {
>>         bool pi_pending;
>>         u16 posted_intr_nv;
>>
>> +       unsigned long *msr_bitmap;
>> +
>>         struct hrtimer preemption_timer;
>>         bool preemption_timer_expired;
>>
>> @@ -924,7 +926,6 @@ static unsigned long *vmx_msr_bitmap_legacy;
>>  static unsigned long *vmx_msr_bitmap_longmode;
>>  static unsigned long *vmx_msr_bitmap_legacy_x2apic;
>>  static unsigned long *vmx_msr_bitmap_longmode_x2apic;
>> -static unsigned long *vmx_msr_bitmap_nested;
>>  static unsigned long *vmx_vmread_bitmap;
>>  static unsigned long *vmx_vmwrite_bitmap;
>>
>> @@ -2508,7 +2509,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu
>> *vcpu)
>>         unsigned long *msr_bitmap;
>>
>>         if (is_guest_mode(vcpu))
>> -               msr_bitmap = vmx_msr_bitmap_nested;
>> +               msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
>>         else if (cpu_has_secondary_exec_ctrls() &&
>>                  (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
>>                   SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
>> @@ -6363,13 +6364,6 @@ static __init int hardware_setup(void)
>>         if (!vmx_msr_bitmap_longmode_x2apic)
>>                 goto out4;
>>
>> -       if (nested) {
>> -               vmx_msr_bitmap_nested =
>> -                       (unsigned long *)__get_free_page(GFP_KERNEL);
>> -               if (!vmx_msr_bitmap_nested)
>> -                       goto out5;
>> -       }
>> -
>>         vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
>>         if (!vmx_vmread_bitmap)
>>                 goto out6;
>> @@ -6392,8 +6386,6 @@ static __init int hardware_setup(void)
>>
>>         memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
>>         memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
>> -       if (nested)
>> -               memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
>>
>>         if (setup_vmcs_config(&vmcs_config) < 0) {
>>                 r = -EIO;
>> @@ -6529,9 +6521,6 @@ out8:
>>  out7:
>>         free_page((unsigned long)vmx_vmread_bitmap);
>>  out6:
>> -       if (nested)
>> -               free_page((unsigned long)vmx_msr_bitmap_nested);
>> -out5:
>>         free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
>>  out4:
>>         free_page((unsigned long)vmx_msr_bitmap_longmode);
>> @@ -6557,8 +6546,6 @@ static __exit void hardware_unsetup(void)
>>         free_page((unsigned long)vmx_io_bitmap_a);
>>         free_page((unsigned long)vmx_vmwrite_bitmap);
>>         free_page((unsigned long)vmx_vmread_bitmap);
>> -       if (nested)
>> -               free_page((unsigned long)vmx_msr_bitmap_nested);
>>
>>         free_kvm_area();
>>  }
>> @@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>>                 return 1;
>>         }
>>
>> +       if (cpu_has_vmx_msr_bitmap()) {
>> +               vmx->nested.msr_bitmap =
>> +                               (unsigned long
>> *)__get_free_page(GFP_KERNEL);
>> +               if (!vmx->nested.msr_bitmap)
>> +                       goto out_msr_bitmap;
>> +       }
>> +
>
>
> We export msr_bitmap to the guest even if it is not supported by hardware, so we
> need to allocate msr_bitmap for L1 unconditionally.
>
>

Nice to see you, Yang :-)

I think vmx->nested.msr_bitmap is filled from L0's and L1's settings, and
it is the bitmap used by hardware.
L1's msr_bitmap is managed by L1 itself; L1 writes its bitmap address into the
MSR_BITMAP field, which KVM stores in vmcs12->msr_bitmap, so L0 can read L1's
settings when emulating MSR_BITMAP.

Am I missing something?

Thanks,
Wincy
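
A minimal sketch of the merge described above (simplified layout, and the
helper names here are assumptions, not the kernel code; in the patch the real
work is done by nested_vmx_merge_msr_bitmap() and
nested_vmx_disable_intercept_for_msr()): the per-vCPU L0 bitmap starts fully
set, i.e. intercepting everything, and a bit is cleared only where L1's own
bitmap also clears it, so L2 never gets more direct MSR access than L1
granted.

#include <stdint.h>
#include <string.h>

#define PAGE_SIZE 4096

/*
 * Clear the read-intercept bit for one low MSR (0x0..0x1fff) in the merged
 * L0 bitmap, but only if L1's bitmap also has it cleared.  In the VMX layout
 * the first 1024 bytes of the page hold the read bitmap for low MSRs; a set
 * bit means "cause a VM exit".
 */
static void merge_read_bit(uint8_t *l0, const uint8_t *l1, unsigned int msr)
{
	unsigned int byte = msr / 8, bit = msr % 8;

	if (!(l1[byte] & (1u << bit)))		/* L1 allows direct access... */
		l0[byte] &= ~(1u << bit);	/* ...so L0 may allow it too */
}

static void merge_x2apic_reads(uint8_t *l0, const uint8_t *l1)
{
	unsigned int msr;

	memset(l0, 0xff, PAGE_SIZE);	/* default: intercept every MSR */

	/* x2APIC MSRs sit at 0x800 + (MMIO offset >> 4); e.g. TPR at MMIO
	 * offset 0x80 is MSR 0x808. */
	for (msr = 0x800; msr <= 0x8ff; msr++)
		merge_read_bit(l0, l1, msr);
}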

Radim Krčmář Aug. 9, 2016, 12:23 p.m. UTC | #3
2016-08-09 17:32+0800, Yang Zhang:
> On 2016/8/9 2:16, Radim Krčmář wrote:
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> @@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>>  		return 1;
>>  	}
>> 
>> +	if (cpu_has_vmx_msr_bitmap()) {
>> +		vmx->nested.msr_bitmap =
>> +				(unsigned long *)__get_free_page(GFP_KERNEL);
>> +		if (!vmx->nested.msr_bitmap)
>> +			goto out_msr_bitmap;
>> +	}
>> +
> 
> We export msr_bitmap to the guest even if it is not supported by hardware, so we
> need to allocate msr_bitmap for L1 unconditionally.

We do emulate the feature, but the vmx->nested.msr_bitmap is used only
if VMX supports it to avoid some VM exits:

>> @@ -9957,10 +9938,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>  	}
>> 
>>  	if (cpu_has_vmx_msr_bitmap() &&
>> -	    exec_control & CPU_BASED_USE_MSR_BITMAPS) {
>> -		nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
>> -		/* MSR_BITMAP will be set by following vmx_set_efer. */
>> -	} else
>> +	    exec_control & CPU_BASED_USE_MSR_BITMAPS &&
>> +	    nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
>> +		; /* MSR_BITMAP will be set by following vmx_set_efer. */
>> +	else
>>  		exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;

The else branch is taken when !cpu_has_vmx_msr_bitmap() and it disables msr
bitmaps.  A similar check guards vmx_set_msr_bitmap(), so the NULL pointer
never even gets written to the VMCS.

KVM always uses L1's msr bitmaps when emulating the feature.
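
To make the two paths concrete, a minimal sketch (simplified flow and
invented parameter names, not the kernel implementation; the hardware-path
decision is the prepare_vmcs02() hunk quoted above, and the software path is
the nested MSR-exit handling that consults L1's bitmap):

#include <stdbool.h>
#include <stdint.h>

/* "Use MSR bitmaps" is bit 28 of the primary processor-based VM-execution
 * controls. */
#define CPU_BASED_USE_MSR_BITMAPS	(1u << 28)

/*
 * Hardware path: keep the control only when the CPU has MSR bitmaps, L1
 * asked for them, and the merged per-vCPU bitmap could be built; otherwise
 * clear it so every L2 MSR access exits to L0.
 */
static uint32_t pick_exec_control(uint32_t exec_control,
				  bool hw_has_msr_bitmap,
				  bool merge_succeeded)
{
	if (!(hw_has_msr_bitmap &&
	      (exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
	      merge_succeeded))
		exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
	return exec_control;
}

/*
 * Software path: with the control cleared, L0 sees every MSR exit and checks
 * L1's own bitmap (the page L1 pointed vmcs12->msr_bitmap at) to decide
 * whether the exit must be reflected to L1, so the feature stays emulated
 * even without hardware support.
 */
static bool reflect_msr_read_to_l1(const uint8_t *l1_bitmap, uint32_t msr)
{
	if (msr < 0x2000)	/* read bits for low MSRs: first 1024 bytes */
		return l1_bitmap[msr / 8] & (1u << (msr % 8));
	return true;		/* simplification: reflect everything else */
}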
Yang Zhang Aug. 10, 2016, 5:52 a.m. UTC | #4
On 2016/8/9 18:19, Wincy Van wrote:
> On Tue, Aug 9, 2016 at 5:32 PM, Yang Zhang <yang.zhang.wz@gmail.com> wrote:
>> On 2016/8/9 2:16, Radim Krčmář wrote:
>>>
>>> msr bitmap can be used to avoid a VM exit (interception) on guest MSR
>>> accesses.  In some configurations of VMX controls, the guest can even
>>> directly access host's x2APIC MSRs.  See SDM 29.5 VIRTUALIZING MSR-BASED
>>> APIC ACCESSES.
>>>
>>> L2 could read all L0's x2APIC MSRs and write TPR, EOI, and SELF_IPI.
>>> To do so, L1 would first trick KVM to disable all possible interceptions
>>> by enabling APICv features and then would turn those features off;
>>> nested_vmx_merge_msr_bitmap() only disabled interceptions, so VMX would
>>> not intercept previously enabled MSRs even though they were not safe
>>> with the new configuration.
>>>
>>> Correctly re-enabling interceptions is not enough as a second bug would
>>> still allow L1+L2 to access host's MSRs: msr bitmap was shared for all
>>> VMCSs, so L1 could trigger a race to get the desired combination of msr
>>> bitmap and VMX controls.
>>>
>>> This fix allocates a msr bitmap for every L1 VCPU, allows only safe
>>> x2APIC MSRs from L1's msr bitmap, and disables msr bitmaps if they would
>>> have to intercept everything anyway.
>>>
>>> Fixes: 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap")
>>> Reported-by: Jim Mattson <jmattson@google.com>
>>> Suggested-by: Wincy Van <fanwenyi0529@gmail.com>
>>> Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
>>> ---
>>>  arch/x86/kvm/vmx.c | 107
>>> ++++++++++++++++++++++-------------------------------
>>>  1 file changed, 44 insertions(+), 63 deletions(-)
>>>
>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>> index a45d8580f91e..c66ac2c70d22 100644
>>> --- a/arch/x86/kvm/vmx.c
>>> +++ b/arch/x86/kvm/vmx.c
>>> @@ -435,6 +435,8 @@ struct nested_vmx {
>>>         bool pi_pending;
>>>         u16 posted_intr_nv;
>>>
>>> +       unsigned long *msr_bitmap;
>>> +
>>>         struct hrtimer preemption_timer;
>>>         bool preemption_timer_expired;
>>>
>>> @@ -924,7 +926,6 @@ static unsigned long *vmx_msr_bitmap_legacy;
>>>  static unsigned long *vmx_msr_bitmap_longmode;
>>>  static unsigned long *vmx_msr_bitmap_legacy_x2apic;
>>>  static unsigned long *vmx_msr_bitmap_longmode_x2apic;
>>> -static unsigned long *vmx_msr_bitmap_nested;
>>>  static unsigned long *vmx_vmread_bitmap;
>>>  static unsigned long *vmx_vmwrite_bitmap;
>>>
>>> @@ -2508,7 +2509,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu
>>> *vcpu)
>>>         unsigned long *msr_bitmap;
>>>
>>>         if (is_guest_mode(vcpu))
>>> -               msr_bitmap = vmx_msr_bitmap_nested;
>>> +               msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
>>>         else if (cpu_has_secondary_exec_ctrls() &&
>>>                  (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
>>>                   SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
>>> @@ -6363,13 +6364,6 @@ static __init int hardware_setup(void)
>>>         if (!vmx_msr_bitmap_longmode_x2apic)
>>>                 goto out4;
>>>
>>> -       if (nested) {
>>> -               vmx_msr_bitmap_nested =
>>> -                       (unsigned long *)__get_free_page(GFP_KERNEL);
>>> -               if (!vmx_msr_bitmap_nested)
>>> -                       goto out5;
>>> -       }
>>> -
>>>         vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
>>>         if (!vmx_vmread_bitmap)
>>>                 goto out6;
>>> @@ -6392,8 +6386,6 @@ static __init int hardware_setup(void)
>>>
>>>         memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
>>>         memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
>>> -       if (nested)
>>> -               memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
>>>
>>>         if (setup_vmcs_config(&vmcs_config) < 0) {
>>>                 r = -EIO;
>>> @@ -6529,9 +6521,6 @@ out8:
>>>  out7:
>>>         free_page((unsigned long)vmx_vmread_bitmap);
>>>  out6:
>>> -       if (nested)
>>> -               free_page((unsigned long)vmx_msr_bitmap_nested);
>>> -out5:
>>>         free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
>>>  out4:
>>>         free_page((unsigned long)vmx_msr_bitmap_longmode);
>>> @@ -6557,8 +6546,6 @@ static __exit void hardware_unsetup(void)
>>>         free_page((unsigned long)vmx_io_bitmap_a);
>>>         free_page((unsigned long)vmx_vmwrite_bitmap);
>>>         free_page((unsigned long)vmx_vmread_bitmap);
>>> -       if (nested)
>>> -               free_page((unsigned long)vmx_msr_bitmap_nested);
>>>
>>>         free_kvm_area();
>>>  }
>>> @@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>>>                 return 1;
>>>         }
>>>
>>> +       if (cpu_has_vmx_msr_bitmap()) {
>>> +               vmx->nested.msr_bitmap =
>>> +                               (unsigned long
>>> *)__get_free_page(GFP_KERNEL);
>>> +               if (!vmx->nested.msr_bitmap)
>>> +                       goto out_msr_bitmap;
>>> +       }
>>> +
>>
>>
>> We export msr_bitmap to the guest even if it is not supported by hardware, so we
>> need to allocate msr_bitmap for L1 unconditionally.
>>
>>
>
> Nice to see you, Yang :-)

Nice to see you too.

>
> I think vmx->nested.msr_bitmap is filled from L0's and L1's settings, and
> it is the bitmap used by hardware.
> L1's msr_bitmap is managed by L1 itself; L1 writes its bitmap address into the
> MSR_BITMAP field, which KVM stores in vmcs12->msr_bitmap, so L0 can read L1's
> settings when emulating MSR_BITMAP.
>
> Am I missing something?

You are correct!

Yang Zhang Aug. 10, 2016, 5:55 a.m. UTC | #5
On 2016/8/9 20:23, Radim Krčmář wrote:
> 2016-08-09 17:32+0800, Yang Zhang:
>> On 2016/8/9 2:16, Radim Krčmář wrote:
>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>> @@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>>>  		return 1;
>>>  	}
>>>
>>> +	if (cpu_has_vmx_msr_bitmap()) {
>>> +		vmx->nested.msr_bitmap =
>>> +				(unsigned long *)__get_free_page(GFP_KERNEL);
>>> +		if (!vmx->nested.msr_bitmap)
>>> +			goto out_msr_bitmap;
>>> +	}
>>> +
>>
>> We export msr_bitmap to the guest even if it is not supported by hardware, so we
>> need to allocate msr_bitmap for L1 unconditionally.
>
> We do emulate the feature, but the vmx->nested.msr_bitmap is used only
> if VMX supports it to avoid some VM exits:
>
>>> @@ -9957,10 +9938,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>>  	}
>>>
>>>  	if (cpu_has_vmx_msr_bitmap() &&
>>> -	    exec_control & CPU_BASED_USE_MSR_BITMAPS) {
>>> -		nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
>>> -		/* MSR_BITMAP will be set by following vmx_set_efer. */
>>> -	} else
>>> +	    exec_control & CPU_BASED_USE_MSR_BITMAPS &&
>>> +	    nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
>>> +		; /* MSR_BITMAP will be set by following vmx_set_efer. */
>>> +	else
>>>  		exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
>
> The else branch is taken when !cpu_has_vmx_msr_bitmap() and it disables msr
> bitmaps.  A similar check guards vmx_set_msr_bitmap(), so the NULL pointer
> never even gets written to the VMCS.
>
> KVM always uses L1's msr bitmaps when emulating the feature.
>

Yes, you are right. I forgot that the bitmap is only used by the hardware. :(
Wanpeng Li Aug. 16, 2016, 2:53 a.m. UTC | #6
2016-08-09 2:16 GMT+08:00 Radim Krčmář <rkrcmar@redhat.com>:
> msr bitmap can be used to avoid a VM exit (interception) on guest MSR
> accesses.  In some configurations of VMX controls, the guest can even
> directly access host's x2APIC MSRs.  See SDM 29.5 VIRTUALIZING MSR-BASED
> APIC ACCESSES.
>
> L2 could read all L0's x2APIC MSRs and write TPR, EOI, and SELF_IPI.
> To do so, L1 would first trick KVM to disable all possible interceptions
> by enabling APICv features and then would turn those features off;
> nested_vmx_merge_msr_bitmap() only disabled interceptions, so VMX would
> not intercept previously enabled MSRs even though they were not safe
> with the new configuration.
>
> Correctly re-enabling interceptions is not enough as a second bug would
> still allow L1+L2 to access host's MSRs: msr bitmap was shared for all
> VMCSs, so L1 could trigger a race to get the desired combination of msr
> bitmap and VMX controls.
>
> This fix allocates a msr bitmap for every L1 VCPU, allows only safe
> x2APIC MSRs from L1's msr bitmap, and disables msr bitmaps if they would
> have to intercept everything anyway.
>
> Fixes: 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap")
> Reported-by: Jim Mattson <jmattson@google.com>
> Suggested-by: Wincy Van <fanwenyi0529@gmail.com>
> Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>

Reviewed-by: Wanpeng Li <wanpeng.li@hotmail.com>

Patch

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a45d8580f91e..c66ac2c70d22 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -435,6 +435,8 @@  struct nested_vmx {
 	bool pi_pending;
 	u16 posted_intr_nv;
 
+	unsigned long *msr_bitmap;
+
 	struct hrtimer preemption_timer;
 	bool preemption_timer_expired;
 
@@ -924,7 +926,6 @@  static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
 static unsigned long *vmx_msr_bitmap_legacy_x2apic;
 static unsigned long *vmx_msr_bitmap_longmode_x2apic;
-static unsigned long *vmx_msr_bitmap_nested;
 static unsigned long *vmx_vmread_bitmap;
 static unsigned long *vmx_vmwrite_bitmap;
 
@@ -2508,7 +2509,7 @@  static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
 	unsigned long *msr_bitmap;
 
 	if (is_guest_mode(vcpu))
-		msr_bitmap = vmx_msr_bitmap_nested;
+		msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
 	else if (cpu_has_secondary_exec_ctrls() &&
 		 (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
 		  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
@@ -6363,13 +6364,6 @@  static __init int hardware_setup(void)
 	if (!vmx_msr_bitmap_longmode_x2apic)
 		goto out4;
 
-	if (nested) {
-		vmx_msr_bitmap_nested =
-			(unsigned long *)__get_free_page(GFP_KERNEL);
-		if (!vmx_msr_bitmap_nested)
-			goto out5;
-	}
-
 	vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
 	if (!vmx_vmread_bitmap)
 		goto out6;
@@ -6392,8 +6386,6 @@  static __init int hardware_setup(void)
 
 	memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
 	memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
-	if (nested)
-		memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
 
 	if (setup_vmcs_config(&vmcs_config) < 0) {
 		r = -EIO;
@@ -6529,9 +6521,6 @@  out8:
 out7:
 	free_page((unsigned long)vmx_vmread_bitmap);
 out6:
-	if (nested)
-		free_page((unsigned long)vmx_msr_bitmap_nested);
-out5:
 	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
 out4:
 	free_page((unsigned long)vmx_msr_bitmap_longmode);
@@ -6557,8 +6546,6 @@  static __exit void hardware_unsetup(void)
 	free_page((unsigned long)vmx_io_bitmap_a);
 	free_page((unsigned long)vmx_vmwrite_bitmap);
 	free_page((unsigned long)vmx_vmread_bitmap);
-	if (nested)
-		free_page((unsigned long)vmx_msr_bitmap_nested);
 
 	free_kvm_area();
 }
@@ -6995,16 +6982,21 @@  static int handle_vmon(struct kvm_vcpu *vcpu)
 		return 1;
 	}
 
+	if (cpu_has_vmx_msr_bitmap()) {
+		vmx->nested.msr_bitmap =
+				(unsigned long *)__get_free_page(GFP_KERNEL);
+		if (!vmx->nested.msr_bitmap)
+			goto out_msr_bitmap;
+	}
+
 	vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
 	if (!vmx->nested.cached_vmcs12)
-		return -ENOMEM;
+		goto out_cached_vmcs12;
 
 	if (enable_shadow_vmcs) {
 		shadow_vmcs = alloc_vmcs();
-		if (!shadow_vmcs) {
-			kfree(vmx->nested.cached_vmcs12);
-			return -ENOMEM;
-		}
+		if (!shadow_vmcs)
+			goto out_shadow_vmcs;
 		/* mark vmcs as shadow */
 		shadow_vmcs->revision_id |= (1u << 31);
 		/* init shadow vmcs */
@@ -7024,6 +7016,15 @@  static int handle_vmon(struct kvm_vcpu *vcpu)
 	skip_emulated_instruction(vcpu);
 	nested_vmx_succeed(vcpu);
 	return 1;
+
+out_shadow_vmcs:
+	kfree(vmx->nested.cached_vmcs12);
+
+out_cached_vmcs12:
+	free_page((unsigned long)vmx->nested.msr_bitmap);
+
+out_msr_bitmap:
+	return -ENOMEM;
 }
 
 /*
@@ -7098,6 +7099,10 @@  static void free_nested(struct vcpu_vmx *vmx)
 	vmx->nested.vmxon = false;
 	free_vpid(vmx->nested.vpid02);
 	nested_release_vmcs12(vmx);
+	if (vmx->nested.msr_bitmap) {
+		free_page((unsigned long)vmx->nested.msr_bitmap);
+		vmx->nested.msr_bitmap = NULL;
+	}
 	if (enable_shadow_vmcs)
 		free_vmcs(vmx->nested.current_shadow_vmcs);
 	kfree(vmx->nested.cached_vmcs12);
@@ -9472,8 +9477,10 @@  static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
 {
 	int msr;
 	struct page *page;
-	unsigned long *msr_bitmap;
+	unsigned long *msr_bitmap_l1;
+	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
 
+	/* This shortcut is ok because we support only x2APIC MSRs so far. */
 	if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
 		return false;
 
@@ -9482,63 +9489,37 @@  static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
 		WARN_ON(1);
 		return false;
 	}
-	msr_bitmap = (unsigned long *)kmap(page);
-	if (!msr_bitmap) {
+	msr_bitmap_l1 = (unsigned long *)kmap(page);
+	if (!msr_bitmap_l1) {
 		nested_release_page_clean(page);
 		WARN_ON(1);
 		return false;
 	}
 
+	memset(msr_bitmap_l0, 0xff, PAGE_SIZE);
+
 	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
 		if (nested_cpu_has_apic_reg_virt(vmcs12))
 			for (msr = 0x800; msr <= 0x8ff; msr++)
 				nested_vmx_disable_intercept_for_msr(
-					msr_bitmap,
-					vmx_msr_bitmap_nested,
+					msr_bitmap_l1, msr_bitmap_l0,
 					msr, MSR_TYPE_R);
-		/* TPR is allowed */
-		nested_vmx_disable_intercept_for_msr(msr_bitmap,
-				vmx_msr_bitmap_nested,
+
+		nested_vmx_disable_intercept_for_msr(
+				msr_bitmap_l1, msr_bitmap_l0,
 				APIC_BASE_MSR + (APIC_TASKPRI >> 4),
 				MSR_TYPE_R | MSR_TYPE_W);
+
 		if (nested_cpu_has_vid(vmcs12)) {
-			/* EOI and self-IPI are allowed */
 			nested_vmx_disable_intercept_for_msr(
-				msr_bitmap,
-				vmx_msr_bitmap_nested,
+				msr_bitmap_l1, msr_bitmap_l0,
 				APIC_BASE_MSR + (APIC_EOI >> 4),
 				MSR_TYPE_W);
 			nested_vmx_disable_intercept_for_msr(
-				msr_bitmap,
-				vmx_msr_bitmap_nested,
+				msr_bitmap_l1, msr_bitmap_l0,
 				APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
 				MSR_TYPE_W);
 		}
-	} else {
-		/*
-		 * Enable reading intercept of all the x2apic
-		 * MSRs. We should not rely on vmcs12 to do any
-		 * optimizations here, it may have been modified
-		 * by L1.
-		 */
-		for (msr = 0x800; msr <= 0x8ff; msr++)
-			__vmx_enable_intercept_for_msr(
-				vmx_msr_bitmap_nested,
-				msr,
-				MSR_TYPE_R);
-
-		__vmx_enable_intercept_for_msr(
-				vmx_msr_bitmap_nested,
-				APIC_BASE_MSR + (APIC_TASKPRI >> 4),
-				MSR_TYPE_W);
-		__vmx_enable_intercept_for_msr(
-				vmx_msr_bitmap_nested,
-				APIC_BASE_MSR + (APIC_EOI >> 4),
-				MSR_TYPE_W);
-		__vmx_enable_intercept_for_msr(
-				vmx_msr_bitmap_nested,
-				APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
-				MSR_TYPE_W);
 	}
 	kunmap(page);
 	nested_release_page_clean(page);
@@ -9957,10 +9938,10 @@  static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	}
 
 	if (cpu_has_vmx_msr_bitmap() &&
-	    exec_control & CPU_BASED_USE_MSR_BITMAPS) {
-		nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
-		/* MSR_BITMAP will be set by following vmx_set_efer. */
-	} else
+	    exec_control & CPU_BASED_USE_MSR_BITMAPS &&
+	    nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
+		; /* MSR_BITMAP will be set by following vmx_set_efer. */
+	else
 		exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
 
 	/*