[v3,1/6] KVM: nVMX: Use hardware MSR bitmap

Message ID CACzj_yUu5omqFL7u8NE6vyqXGy_-dAeukRspJha+EiGXVBzmYg@mail.gmail.com (mailing list archive)
State New, archived

Commit Message

Wincy Van Jan. 24, 2015, 10:20 a.m. UTC
Currently, if L1 enables MSR_BITMAP, we will emulate this feature:
all of L2's MSR accesses are intercepted by L0. Since many features,
like virtualize x2apic mode, have complicated logic that is
difficult for us to emulate, we should use the hardware bitmap and
merge L0's and L1's bitmaps.

This patch introduces nested_vmx_merge_msr_bitmap for future use.

Signed-off-by: Wincy Van <fanwenyi0529@gmail.com>
---
 arch/x86/kvm/vmx.c |   71 +++++++++++++++++++++++++++++++++++++++++++--------
 1 files changed, 60 insertions(+), 11 deletions(-)

@@ -8637,11 +8679,17 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
        }

+       if (cpu_has_vmx_msr_bitmap() &&
+           exec_control & CPU_BASED_USE_MSR_BITMAPS &&
+           nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) {
+               vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested));
+       } else
+               exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
+
        /*
-        * Merging of IO and MSR bitmaps not currently supported.
+        * Merging of IO bitmap not currently supported.
         * Rather, exit every time.
         */
-       exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
        exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
        exec_control |= CPU_BASED_UNCOND_IO_EXITING;

@@ -8792,15 +8840,13 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
                return 1;
        }

-       if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
-                       !PAGE_ALIGNED(vmcs12->msr_bitmap)) {
+       if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
                /*TODO: Also verify bits beyond physical address width are 0*/
                nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
                return 1;
        }

-       if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
-               /*TODO: Also verify bits beyond physical address width are 0*/
+       if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) {
                nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
                return 1;
        }
@@ -9356,6 +9402,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
        kvm_set_dr(vcpu, 7, 0x400);
        vmcs_write64(GUEST_IA32_DEBUGCTL, 0);

+       if (cpu_has_vmx_msr_bitmap())
+               vmx_set_msr_bitmap(vcpu);
+
        if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
                                vmcs12->vm_exit_msr_load_count))
                nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
--
1.7.1

Comments

Zhang, Yang Z Jan. 28, 2015, 8 a.m. UTC | #1
Wincy Van wrote on 2015-01-24:
> Currently, if L1 enables MSR_BITMAP, we will emulate this feature:
> all of L2's MSR accesses are intercepted by L0. Since many features,
> like virtualize x2apic mode, have complicated logic that is
> difficult for us to emulate, we should use the hardware bitmap and
> merge L0's and L1's bitmaps.
> 
> This patch introduces nested_vmx_merge_msr_bitmap for future use.
> 
> Signed-off-by: Wincy Van <fanwenyi0529@gmail.com>
> ---
>  arch/x86/kvm/vmx.c |   71
> ++++++++++++++++++++++++++++++++++++++++++++--------
>  1 files changed, 60 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c987374..36d0724
> 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -798,6 +798,7 @@ static unsigned long *vmx_msr_bitmap_legacy;
> static unsigned long *vmx_msr_bitmap_longmode;  static unsigned long
> *vmx_msr_bitmap_legacy_x2apic;  static unsigned long
> *vmx_msr_bitmap_longmode_x2apic;
> +static unsigned long *vmx_msr_bitmap_nested;
>  static unsigned long *vmx_vmread_bitmap;  static unsigned long
> *vmx_vmwrite_bitmap;
> 
> @@ -5812,13 +5813,18 @@ static __init int hardware_setup(void)
>                                 (unsigned long
> *)__get_free_page(GFP_KERNEL);
>         if (!vmx_msr_bitmap_longmode_x2apic)
>                 goto out4;
> +
> +       vmx_msr_bitmap_nested = (unsigned long
> *)__get_free_page(GFP_KERNEL);
> +       if (!vmx_msr_bitmap_nested)
> +               goto out5;
> +

Since the nested virtualization is off by default, it's better to allocate the page
only when nested is true. Maybe adding the following check is better:

if (nested) {
        vmx_msr_bitmap_nested = (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_msr_bitmap_nested)
                goto out5;
}

...snip...

> +
> +/*
> + * Merge L0's and L1's MSR bitmap, return false to indicate that
> + * we do not use the hardware.
> + */
> +static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
> +                                              struct vmcs12
> *vmcs12) {
> +       return false;
> +}
> +

The following patches have nothing to do with the MSR control. Why leave the function empty here?

Best regards,
Yang
Zhang, Yang Z Jan. 28, 2015, 8:05 a.m. UTC | #2
Wincy Van wrote on 2015-01-24:
> When L2 is using x2apic, we can use virtualize x2apic mode to gain higher
> performance, especially in apicv case.
> 
> This patch also introduces nested_vmx_check_apicv_controls for the nested
> apicv patches.
> 
> Signed-off-by: Wincy Van <fanwenyi0529@gmail.com>
> ---

...snip...

>  static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)  {
>         if (!longmode_only)
> @@ -8344,7 +8394,68 @@ static int
> nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,  static
> inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>                                                struct vmcs12
> *vmcs12)  {
> -       return false;
> +       struct page *page;
> +       unsigned long *msr_bitmap;
> +
> +       if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
> +               return false;
> +
> +       page = nested_get_page(vcpu, vmcs12->msr_bitmap);
> +       if (!page) {
> +               WARN_ON(1);
> +               return false;
> +       }
> +       msr_bitmap = (unsigned long *)kmap(page);
> +       if (!msr_bitmap) {
> +               nested_release_page_clean(page);
> +               WARN_ON(1);
> +               return false;
> +       }
> +
> +       memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
> +
> +       if (nested_cpu_has_virt_x2apic_mode(vmcs12))
> +               /* TPR is allowed */
> +               nested_vmx_disable_intercept_for_msr(msr_bitmap,
> +                               vmx_msr_bitmap_nested,
> +                               APIC_BASE_MSR + (APIC_TASKPRI >> 4),
> +                               MSR_TYPE_R | MSR_TYPE_W);

I didn't understand what this function does. Per my understanding, you only need to set (vmx_msr_bitmap_nested = vmcs01->msr_bitmap | vmcs12->msr_bitmap) and inject the nested vmexit to L1 if the bit in vmcs12->msr_bitmap is set. Am I missing some patches?

Best regards,
Yang
Zhang, Yang Z Jan. 28, 2015, 8:06 a.m. UTC | #3
Zhang, Yang Z wrote on 2015-01-28:
> Wincy Van wrote on 2015-01-24:
>> When L2 is using x2apic, we can use virtualize x2apic mode to gain
>> higher performance, especially in apicv case.
>> 
>> This patch also introduces nested_vmx_check_apicv_controls for the
>> nested apicv patches.

Sorry, replied on the wrong thread. :(

>> 
>> Signed-off-by: Wincy Van <fanwenyi0529@gmail.com>
>> ---
> 
> ...snip...
> 
>>  static void vmx_disable_intercept_for_msr(u32 msr, bool
>> longmode_only)
> {
>>         if (!longmode_only)
>> @@ -8344,7 +8394,68 @@ static int
>> nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,  static
>> inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>>                                                struct vmcs12
>> *vmcs12)  {
>> -       return false;
>> +       struct page *page;
>> +       unsigned long *msr_bitmap;
>> +
>> +       if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
>> +               return false;
>> +
>> +       page = nested_get_page(vcpu, vmcs12->msr_bitmap);
>> +       if (!page) {
>> +               WARN_ON(1);
>> +               return false;
>> +       }
>> +       msr_bitmap = (unsigned long *)kmap(page);
>> +       if (!msr_bitmap) {
>> +               nested_release_page_clean(page);
>> +               WARN_ON(1);
>> +               return false;
>> +       }
>> +
>> +       memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
>> +
>> +       if (nested_cpu_has_virt_x2apic_mode(vmcs12))
>> +               /* TPR is allowed */
>> +               nested_vmx_disable_intercept_for_msr(msr_bitmap,
>> +                               vmx_msr_bitmap_nested,
>> +                               APIC_BASE_MSR + (APIC_TASKPRI >> 4),
>> +                               MSR_TYPE_R | MSR_TYPE_W);
> 
> I didn't understand what this function does. Per my understanding, you
> only need to set (vmx_msr_bitmap_nested = vmcs01->msr_bitmap |
> vmcs12->msr_bitmap) and inject the nested vmexit to L1 if the bit in
> vmcs12->msr_bitmap is set. Am I missing some patches?
> 
> Best regards,
> Yang
>

Best regards,
Yang
Wincy Van Jan. 28, 2015, 8:24 a.m. UTC | #4
On Wed, Jan 28, 2015 at 4:00 PM, Zhang, Yang Z <yang.z.zhang@intel.com> wrote:
>> @@ -5812,13 +5813,18 @@ static __init int hardware_setup(void)
>>                                 (unsigned long
>> *)__get_free_page(GFP_KERNEL);
>>         if (!vmx_msr_bitmap_longmode_x2apic)
>>                 goto out4;
>> +
>> +       vmx_msr_bitmap_nested = (unsigned long
>> *)__get_free_page(GFP_KERNEL);
>> +       if (!vmx_msr_bitmap_nested)
>> +               goto out5;
>> +
>
> Since the nested virtualization is off by default, it's better to allocate the page
> only when nested is true. Maybe adding the following check is better:
>
> if (nested) {
>         vmx_msr_bitmap_nested = (unsigned long *)__get_free_page(GFP_KERNEL);
>         if (!vmx_msr_bitmap_nested)
>                 goto out5;
> }

Agreed. Will do.

>
> ...snip...
>
>> +
>> +/*
>> + * Merge L0's and L1's MSR bitmap, return false to indicate that
>> + * we do not use the hardware.
>> + */
>> +static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>> +                                              struct vmcs12
>> *vmcs12) {
>> +       return false;
>> +}
>> +
>
> The following patches have nothing to do with the MSR control. Why leave the function empty here?
>

No. In the patch "Enable nested virtualize x2apic mode", we will return
false if L1 has disabled virt_x2apic_mode; the hardware MSR-bitmap
control is then disabled. This is faster than rebuilding
vmx_msr_bitmap_nested.
This function returns false here to indicate that we do not use the
hardware. It is not only virtualize x2apic mode that uses this; other
features may use it as well, so I think it isn't suitable to introduce
this function in the other patches.
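
For reference, the caller added by this patch gates the hardware control
on exactly that return value; condensed from the prepare_vmcs02() hunk
shown above:

        /* From this patch's prepare_vmcs02() hunk: while the stub
         * returns false, the else branch always runs, so every L2 MSR
         * access still exits to L0.  Once a later patch performs the
         * merge and returns true, the hardware consults
         * vmx_msr_bitmap_nested instead. */
        if (cpu_has_vmx_msr_bitmap() &&
            exec_control & CPU_BASED_USE_MSR_BITMAPS &&
            nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) {
                vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested));
        } else
                exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;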


> Best regards,
> Yang
>
>
Wincy Van Jan. 28, 2015, 8:57 a.m. UTC | #5
On Wed, Jan 28, 2015 at 4:05 PM, Zhang, Yang Z <yang.z.zhang@intel.com> wrote:
>> @@ -8344,7 +8394,68 @@ static int
>> nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,  static
>> inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>>                                                struct vmcs12
>> *vmcs12)  {
>> -       return false;
>> +       struct page *page;
>> +       unsigned long *msr_bitmap;
>> +
>> +       if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
>> +               return false;
>> +
>> +       page = nested_get_page(vcpu, vmcs12->msr_bitmap);
>> +       if (!page) {
>> +               WARN_ON(1);
>> +               return false;
>> +       }
>> +       msr_bitmap = (unsigned long *)kmap(page);
>> +       if (!msr_bitmap) {
>> +               nested_release_page_clean(page);
>> +               WARN_ON(1);
>> +               return false;
>> +       }
>> +
>> +       memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
>> +
>> +       if (nested_cpu_has_virt_x2apic_mode(vmcs12))
>> +               /* TPR is allowed */
>> +               nested_vmx_disable_intercept_for_msr(msr_bitmap,
>> +                               vmx_msr_bitmap_nested,
>> +                               APIC_BASE_MSR + (APIC_TASKPRI >>
>> 4),
>> +                               MSR_TYPE_R | MSR_TYPE_W);
>
> I didn't understand what this function does. Per my understanding, you only need to set (vmx_msr_bitmap_nested = vmcs01->msr_bitmap | vmcs12->msr_bitmap) and inject the nested vmexit to L1 if the bit in vmcs12->msr_bitmap is set. Am I missing some patches?

In the beginning, I wanted to do "vmcs01->msr_bitmap |
vmcs12->msr_bitmap", but I remembered that there isn't an instruction to
OR two whole pages together efficiently, so I do the bit-OR operation
inside nested_vmx_disable_intercept_for_msr. Since the hardware does not
support this, I think it is faster if we deal with the bits on demand.
nested_vmx_merge_msr_bitmap is used to merge L0's and L1's bitmaps; any
feature can put its logic there.
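
For context, merging two whole bitmap pages is an ordinary word-by-word
loop rather than a single instruction; a minimal sketch of such a
full-page merge (hypothetical helper, not part of this series):

        /* Hypothetical: OR two 4 KiB MSR bitmaps into a third.  A set
         * bit means "intercept", so the result traps an MSR whenever
         * either L0's or L1's bitmap wants it trapped. */
        static void or_msr_bitmaps(unsigned long *dst,
                                   const unsigned long *l0,
                                   const unsigned long *l1)
        {
                int i;

                for (i = 0; i < PAGE_SIZE / sizeof(unsigned long); i++)
                        dst[i] = l0[i] | l1[i];
        }

The point being made here is that touching all 4 KiB on every nested
entry is likely slower than flipping only the few bits a feature
actually cares about, which is what nested_vmx_disable_intercept_for_msr
does.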

If there is a faster way, please teach me how to do it : )

Thanks,
Wincy


>
> Best regards,
> Yang
>
>
Zhang, Yang Z Jan. 28, 2015, 11:09 a.m. UTC | #6
Wincy Van wrote on 2015-01-28:
> On Wed, Jan 28, 2015 at 4:00 PM, Zhang, Yang Z <yang.z.zhang@intel.com>
> wrote:
>>> @@ -5812,13 +5813,18 @@ static __init int hardware_setup(void)
>>>                                 (unsigned long
>>> *)__get_free_page(GFP_KERNEL);
>>>         if (!vmx_msr_bitmap_longmode_x2apic)
>>>                 goto out4;
>>> +
>>> +       vmx_msr_bitmap_nested = (unsigned long
>>> *)__get_free_page(GFP_KERNEL);
>>> +       if (!vmx_msr_bitmap_nested)
>>> +               goto out5;
>>> +
>> 
>> Since the nested virtualization is off by default, it's better to
>> allocate the page only when nested is true. Maybe adding the following
>> check is better:
>> 
>> if (nested) {
>>         vmx_msr_bitmap_nested = (unsigned long
>>         *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_nested)
>>                 goto out5;
>> }
> 
> Agreed. Will do.
> 
>> 
>> ...snip...
>> 
>>> +
>>> +/*
>>> + * Merge L0's and L1's MSR bitmap, return false to indicate that
>>> + * we do not use the hardware.
>>> + */
>>> +static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>>> +                                              struct vmcs12
>>> *vmcs12) {
>>> +       return false;
>>> +}
>>> +
>> 
>> The following patches have nothing to do with the MSR control. Why
>> leave the function empty here?
>> 
> 
> No. In the patch "Enable nested virtualize x2apic mode", we will return
> false if L1 has disabled virt_x2apic_mode; the hardware MSR-bitmap
> control is then disabled. This is faster than rebuilding
> vmx_msr_bitmap_nested.
> This function returns false here to indicate that we do not use the
> hardware. It is not only virtualize x2apic mode that uses this; other
> features may use it as well, so I think it isn't suitable to introduce
> this function in the other patches.

Yes, rebuilding is more costly. But your current implementation cannot leverage the APICv feature correctly. I replied in another thread.

> 
> 
>> Best regards,
>> Yang
>> 
>>

Best regards,
Yang
Zhang, Yang Z Jan. 28, 2015, 11:25 a.m. UTC | #7
Wincy Van wrote on 2015-01-28:
> On Wed, Jan 28, 2015 at 4:05 PM, Zhang, Yang Z <yang.z.zhang@intel.com>
> wrote:
>>> @@ -8344,7 +8394,68 @@ static int
>>> nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,  static
>>> inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>>>                                                struct vmcs12
>>> *vmcs12)  { -       return false; +       struct page *page; +      
>>> unsigned long *msr_bitmap; + +       if
>>> (!nested_cpu_has_virt_x2apic_mode(vmcs12)) +               return
>>> false; + +       page = nested_get_page(vcpu, vmcs12->msr_bitmap); +  
>>>     if (!page) { +               WARN_ON(1); +               return
>>> false; +       } +       msr_bitmap = (unsigned long *)kmap(page); +  
>>>     if (!msr_bitmap) { +              
>>> nested_release_page_clean(page); +               WARN_ON(1); +        
>>>       return false; +       } + +       memset(vmx_msr_bitmap_nested,
>>> 0xff, PAGE_SIZE); + +       if
>>> (nested_cpu_has_virt_x2apic_mode(vmcs12)) +               /* TPR is
>>> allowed */ +              
>>> nested_vmx_disable_intercept_for_msr(msr_bitmap, +                    
>>>           vmx_msr_bitmap_nested, +                              
>>> APIC_BASE_MSR + (APIC_TASKPRI >> 4), +                              
>>> MSR_TYPE_R | MSR_TYPE_W);
>> 
>> I didn't understand what this function does. Per my understanding,
>> you only
> need to set (vmx_msr_bitmap_nested = vmcs01->msr_bitmap |
> vmcs12->msr_bitmap) and inject the nested vmexit to L1 if the bit in
> vmcs12->msr_bitmap is set. Am I missing some patches?
> 
> In the beginning, I wanted to do "vmcs01->msr_bitmap |
> vmcs12->msr_bitmap", but I remembered that there isn't an instruction to
> OR two whole pages together efficiently, so I do the bit-OR operation
> inside nested_vmx_disable_intercept_for_msr. Since the hardware does
> not support this, I think it is faster if we deal with the bits on demand.
> nested_vmx_merge_msr_bitmap is used to merge L0's and L1's bitmaps;
> any feature can put its logic there.

You construct the nested_msr_bitmap based on vmcs12->msr_bitmap; what happens if vmcs01->msr_bitmap wants to trap this msr?

> 
> If there is a faster way, please teach me how to do it : )

You are right. Interception should be much faster.

> 
> Thanks,
> Wincy
> 
>> Best regards,
>> Yang
>> 
>>

Best regards,
Yang
Wincy Van Jan. 28, 2015, 11:43 a.m. UTC | #8
On Wed, Jan 28, 2015 at 7:25 PM, Zhang, Yang Z <yang.z.zhang@intel.com> wrote:
> Wincy Van wrote on 2015-01-28:
>> On Wed, Jan 28, 2015 at 4:05 PM, Zhang, Yang Z <yang.z.zhang@intel.com>
>> wrote:
>>>> @@ -8344,7 +8394,68 @@ static int
>>>> nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,  static
>>>> inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>>>>                                                struct vmcs12
>>>> *vmcs12)  { -       return false; +       struct page *page; +
>>>> unsigned long *msr_bitmap; + +       if
>>>> (!nested_cpu_has_virt_x2apic_mode(vmcs12)) +               return
>>>> false; + +       page = nested_get_page(vcpu, vmcs12->msr_bitmap); +
>>>>     if (!page) { +               WARN_ON(1); +               return
>>>> false; +       } +       msr_bitmap = (unsigned long *)kmap(page); +
>>>>     if (!msr_bitmap) { +
>>>> nested_release_page_clean(page); +               WARN_ON(1); +
>>>>       return false; +       } + +       memset(vmx_msr_bitmap_nested,
>>>> 0xff, PAGE_SIZE); + +       if
>>>> (nested_cpu_has_virt_x2apic_mode(vmcs12)) +               /* TPR is
>>>> allowed */ +
>>>> nested_vmx_disable_intercept_for_msr(msr_bitmap, +
>>>>           vmx_msr_bitmap_nested, +
>>>> APIC_BASE_MSR + (APIC_TASKPRI >> 4), +
>>>> MSR_TYPE_R | MSR_TYPE_W);
>>>
>>> I didn't understand what this function does. Per my understanding,
>>> you only
>> need to set (vmx_msr_bitmap_nested = vmcs01->msr_bitmap |
>> vmcs12->msr_bitmap) and inject the nested vmexit to L1 if the bit in
>> vmcs12->msr_bitmap is set. Am I missing some patches?
>>
>> In the beginning, I wanted to do "vmcs01->msr_bitmap |
>> vmcs12->msr_bitmap", but I remembered that there isn't an instruction to
>> OR two whole pages together efficiently, so I do the bit-OR operation
>> inside nested_vmx_disable_intercept_for_msr. Since the hardware does
>> not support this, I think it is faster if we deal with the bits on demand.
>> nested_vmx_merge_msr_bitmap is used to merge L0's and L1's bitmaps;
>> any feature can put its logic there.
>
> You construct the nested_msr_bitmap based on vmcs12->msr_bitmap; what happens if vmcs01->msr_bitmap wants to trap this msr?
>

If L0 wants to intercept a msr, we should set
vmx_msr_bitmap_legacy(_x2apic) and vmx_msr_bitmap_longmode(_x2apic),
and those bitmaps should only be loaded on non-nested entry.
Currently we only clear the corresponding bits if L1 enables
virtualize x2apic mode; all the other bits are set. On nested
entry, we load nested_msr_bitmap; on nested vmexit, we will restore
L0's bitmap.
On nested entry, L0 only cares about L2's msr accesses, not L1's. I
think that on nested entry, nested_msr_bitmap is the "vmcs01->msr_bitmap"
you mentioned above.
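
The load/restore pairing described here is visible in the patch itself;
condensed from the prepare_vmcs02() and load_vmcs12_host_state() hunks:

        /* Nested entry (prepare_vmcs02): point the hardware at the
         * merged bitmap while L2 runs. */
        vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested));

        /* Nested vmexit (load_vmcs12_host_state): switch back to the
         * bitmap L0 uses while running L1. */
        if (cpu_has_vmx_msr_bitmap())
                vmx_set_msr_bitmap(vcpu);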

Thanks,
Wincy
Zhang, Yang Z Jan. 28, 2015, 11:52 a.m. UTC | #9
Wincy Van wrote on 2015-01-28:
> On Wed, Jan 28, 2015 at 7:25 PM, Zhang, Yang Z <yang.z.zhang@intel.com>
> wrote:
>> Wincy Van wrote on 2015-01-28:
>>> On Wed, Jan 28, 2015 at 4:05 PM, Zhang, Yang Z
>>> <yang.z.zhang@intel.com>
>>> wrote:
>>>>> @@ -8344,7 +8394,68 @@ static int
>>>>> nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,  static
>>>>> inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>>>>>                                                struct vmcs12
>>>>> *vmcs12)  { -       return false; +       struct page *page; +
>>>>> unsigned long *msr_bitmap; + +       if
>>>>> (!nested_cpu_has_virt_x2apic_mode(vmcs12)) +               return
>>>>> false; + +       page = nested_get_page(vcpu, vmcs12->msr_bitmap); +
>>>>>     if (!page) { +               WARN_ON(1); +               return
>>>>> false; +       } +       msr_bitmap = (unsigned long *)kmap(page); +
>>>>>     if (!msr_bitmap) { +
>>>>> nested_release_page_clean(page); +               WARN_ON(1); +
>>>>>       return false; +       } + +       memset(vmx_msr_bitmap_nested,
>>>>> 0xff, PAGE_SIZE); + +       if
>>>>> (nested_cpu_has_virt_x2apic_mode(vmcs12)) +               /* TPR is
>>>>> allowed */ + nested_vmx_disable_intercept_for_msr(msr_bitmap, +
>>>>>           vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_TASKPRI >> 4),
>>>>> + MSR_TYPE_R | MSR_TYPE_W);
>>>>
>>>> I didn't understand what this function does. Per my understanding,
>>>> you only
>>> need to set (vmx_msr_bitmap_nested = vmcs01->msr_bitmap |
>>> vmcs12->msr_bitmap) and inject the nested vmexit to L1 if the bit in
>>> vmcs12->msr_bitmap is set. Am I missing some patches?
>>>
>>> In the beginning, I wanted to do "vmcs01->msr_bitmap |
>>> vmcs12->msr_bitmap", but I remembered that there isn't an instruction to
>>> OR two whole pages together efficiently, so I do the bit-OR operation
>>> inside nested_vmx_disable_intercept_for_msr. Since the hardware does
>>> not support this, I think it is faster if we deal with the bits on demand.
>>> nested_vmx_merge_msr_bitmap is used to merge L0's and L1's bitmaps;
>>> any feature can put its logic there.
>>
>> You construct the nested_msr_bitmap based on vmcs12->msr_bitmap; what
>> happens if vmcs01->msr_bitmap wants to trap this msr?
>>
>
> If L0 wants to intercept a msr, we should set
> vmx_msr_bitmap_legacy(_x2apic) and vmx_msr_bitmap_longmode(_x2apic),
> and those bitmaps should only be loaded on non-nested entry.
> Currently we only clear the corresponding bits if L1 enables
> virtualize x2apic mode; all the other bits are set. On nested
> entry, we load nested_msr_bitmap; on nested vmexit, we will restore L0's bitmap.
> On nested entry, L0 only cares about L2's msr accesses, not L1's. I
> think that on nested entry, nested_msr_bitmap is the "vmcs01->msr_bitmap"
> you mentioned above.

Mmm... I think if the bit is set in vmcs01->msr_bitmap, it means that whenever the VCPU (no matter whether in nested guest mode or not) accesses this msr, it should cause a vmexit, not only when L1 runs. That's why we need to construct the vmcs02->msr_bitmap based on (vmcs01->msr_bitmap ORed with vmcs12->msr_bitmap).

> Thanks,
> Wincy

Best regards,
Yang
Wincy Van Jan. 28, 2015, 12:06 p.m. UTC | #10
On Wed, Jan 28, 2015 at 7:52 PM, Zhang, Yang Z <yang.z.zhang@intel.com> wrote:
>>>
>>
>> If L0 wants to intercept a msr, we should set
>> vmx_msr_bitmap_legacy(_x2apic) and vmx_msr_bitmap_longmode(_x2apic),
>> and those bitmaps should only be loaded on non-nested entry.
>> Currently we only clear the corresponding bits if L1 enables
>> virtualize x2apic mode; all the other bits are set. On nested
>> entry, we load nested_msr_bitmap; on nested vmexit, we will restore L0's bitmap.
>> On nested entry, L0 only cares about L2's msr accesses, not L1's. I
>> think that on nested entry, nested_msr_bitmap is the "vmcs01->msr_bitmap"
>> you mentioned above.
>
> Mmm... I think if the bit is set in vmcs01->msr_bitmap, it means that whenever the VCPU (no matter whether in nested guest mode or not) accesses this msr, it should cause a vmexit, not only when L1 runs. That's why we need to construct the vmcs02->msr_bitmap based on (vmcs01->msr_bitmap ORed with vmcs12->msr_bitmap).
>

You are right, but this does not fit all cases; we should customize
the nested_msr_bitmap.
e.g. currently L0 wants to intercept reads of some of the x2apic msrs:
         if (enable_apicv) {
                for (msr = 0x800; msr <= 0x8ff; msr++)
                        vmx_disable_intercept_msr_read_x2apic(msr);

                /* According SDM, in x2apic mode, the whole id reg is used.
                 * But in KVM, it only use the highest eight bits. Need to
                 * intercept it */
                vmx_enable_intercept_msr_read_x2apic(0x802);
                /* TMCCT */
                vmx_enable_intercept_msr_read_x2apic(0x839);
                /* TPR */
                vmx_disable_intercept_msr_write_x2apic(0x808);
                /* EOI */
                vmx_disable_intercept_msr_write_x2apic(0x80b);
                /* SELF-IPI */
                vmx_disable_intercept_msr_write_x2apic(0x83f);
        }
But L1 may not want this. So I think we had better deal with the msrs
separately; there is no common way that suits all cases. If other
features want to intercept a msr on nested entry, they can put the
custom code in nested_vmx_merge_msr_bitmap.


Thanks,
Wincy
Zhang, Yang Z Jan. 28, 2015, 12:26 p.m. UTC | #11
Wincy Van wrote on 2015-01-28:
> On Wed, Jan 28, 2015 at 7:52 PM, Zhang, Yang Z <yang.z.zhang@intel.com>
> wrote:
>>>>
>>>
>>> If L0 wants to intercept a msr, we should set
>>> vmx_msr_bitmap_legacy(_x2apic) and vmx_msr_bitmap_longmode(_x2apic),
>>> and those bitmaps should only be loaded on non-nested entry. Currently
>>> we only clear the corresponding bits if L1 enables virtualize x2apic
>>> mode; all the other bits are set. On nested entry, we load
>>> nested_msr_bitmap; on nested vmexit, we will restore L0's bitmap. On
>>> nested entry, L0 only cares about L2's msr accesses, not L1's. I
>>> think that on nested entry, nested_msr_bitmap is the "vmcs01->msr_bitmap"
>>> you mentioned above.
>>
>> Mmm... I think if the bit is set in vmcs01->msr_bitmap, it means that
>> whenever the VCPU (no matter whether in nested guest mode or not)
>> accesses this msr, it should cause a vmexit, not only when L1 runs.
>> That's why we need to construct the vmcs02->msr_bitmap based on
>> (vmcs01->msr_bitmap ORed with vmcs12->msr_bitmap).
>>
>
> You are right, but this does not fit all cases; we should customize
> the nested_msr_bitmap.
> e.g. currently L0 wants to intercept reads of some of the x2apic msrs:
>          if (enable_apicv) {
>                 for (msr = 0x800; msr <= 0x8ff; msr++)
>                         vmx_disable_intercept_msr_read_x2apic(msr);
>
>                 /* According SDM, in x2apic mode, the whole id reg is used.
>                  * But in KVM, it only use the highest eight bits. Need to
>                  * intercept it */
>                 vmx_enable_intercept_msr_read_x2apic(0x802); /* TMCCT */
>                 vmx_enable_intercept_msr_read_x2apic(0x839); /* TPR */
>                 vmx_disable_intercept_msr_write_x2apic(0x808); /* EOI */
>                 vmx_disable_intercept_msr_write_x2apic(0x80b); /*
>                 SELF-IPI */
>                 vmx_disable_intercept_msr_write_x2apic(0x83f);
>         }
> But L1 may not want this. So I think we had better deal with the

Actually, from L0's point of view, it is totally unaware of L2. The only thing L0 is aware of is that the CPU should follow L0's configuration when the VCPU is running. So if L0 wants to trap a msr, then a read of this msr should cause a vmexit unconditionally, no matter who is running (who means L1, L2, L3...).

> msrs separately; there is no common way that suits all cases. If
> other features want to intercept a msr on nested entry, they can put
> the custom code in nested_vmx_merge_msr_bitmap.
Yes, if other features want to do it on 'nested' entry, they can fill in nested_vmx_merge_msr_bitmap. But in the non-nested case, it should be our responsibility to handle it correctly. How about adding the following check:

if (type & MSR_TYPE_R && !test_bit(msr, vmcs01_msr_bitmap) && !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
        __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
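
For readers reconstructing the context: the two-line check above is
meant to live in nested_vmx_disable_intercept_for_msr, the helper this
series adds to clear bits in vmx_msr_bitmap_nested. A minimal sketch of
the low-MSR read and write paths with Yang's extra test folded in;
msr_bitmap_l0 is hypothetical here and stands in for his
vmcs01_msr_bitmap (the helper actually proposed by the series consults
only L1's bitmap):

        static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l0,
                                                         unsigned long *msr_bitmap_l1,
                                                         unsigned long *msr_bitmap_nested,
                                                         u32 msr, int type)
        {
                int f = sizeof(unsigned long);

                /* Byte offsets within the 4 KiB bitmap page: reads of
                 * MSRs 0x00000000..0x00001fff are tracked at offset
                 * 0x000, writes at offset 0x800. */
                if (msr <= 0x1fff) {
                        /* Let L2 touch the MSR directly only if neither
                         * L0 (vmcs01) nor L1 (vmcs12) intercepts it. */
                        if (type & MSR_TYPE_R &&
                            !test_bit(msr, msr_bitmap_l0 + 0x000 / f) &&
                            !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
                                __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
                        if (type & MSR_TYPE_W &&
                            !test_bit(msr, msr_bitmap_l0 + 0x800 / f) &&
                            !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
                                __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
                }
        }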


>
>
> Thanks,
> Wincy

Best regards,
Yang
Zhang, Yang Z Jan. 28, 2015, 12:33 p.m. UTC | #12
Zhang, Yang Z wrote on 2015-01-28:
> Wincy Van wrote on 2015-01-28:
>> On Wed, Jan 28, 2015 at 7:52 PM, Zhang, Yang Z
>> <yang.z.zhang@intel.com>
>> wrote:
>>>>>
>>>>
>>>> If L0 wants to intercept a msr, we should set
>>>> vmx_msr_bitmap_legacy(_x2apic) and vmx_msr_bitmap_longmode(_x2apic),
>>>> and those bitmaps should only be loaded on non-nested entry. Currently
>>>> we only clear the corresponding bits if L1 enables virtualize x2apic
>>>> mode; all the other bits are set. On nested entry, we load
>>>> nested_msr_bitmap; on nested vmexit, we will restore L0's bitmap. On
>>>> nested entry, L0 only cares about L2's msr accesses, not L1's. I
>>>> think that on nested entry, nested_msr_bitmap is the "vmcs01->msr_bitmap"
>>>> you mentioned above.
>>>
>>> Mmm... I think if the bit is set in vmcs01->msr_bitmap, it means that
>>> whenever the VCPU (no matter whether in nested guest mode or not)
>>> accesses this msr, it should cause a vmexit, not only when L1 runs.
>>> That's why we need to construct the vmcs02->msr_bitmap based on
>>> (vmcs01->msr_bitmap ORed with vmcs12->msr_bitmap).
>>>
>>
>> You are right, but this does not fit all cases; we should
>> customize the nested_msr_bitmap.
>> e.g. currently L0 wants to intercept reads of some of the x2apic msrs:
>>          if (enable_apicv) {
>>                 for (msr = 0x800; msr <= 0x8ff; msr++)
>> vmx_disable_intercept_msr_read_x2apic(msr);
>>                 /* According SDM, in x2apic mode, the whole id reg
>> is
>> used.
>>                  * But in KVM, it only use the highest eight bits. Need to
>>                  * intercept it */
>>                 vmx_enable_intercept_msr_read_x2apic(0x802); /* TMCCT
>>                 */ vmx_enable_intercept_msr_read_x2apic(0x839); /* TPR
>>                 */ vmx_disable_intercept_msr_write_x2apic(0x808); /*
>> EOI
>> */
>>                 vmx_disable_intercept_msr_write_x2apic(0x80b); /*
>>                 SELF-IPI */
>>                 vmx_disable_intercept_msr_write_x2apic(0x83f);
>>         }
>> But L1 may not want this. So I think we had better deal with the
>
> Actually, from L0's point of view, it is totally unaware of L2. The only
> thing L0 is aware of is that the CPU should follow L0's configuration when
> the VCPU is running. So if L0 wants to trap a msr, then a read of this
> msr should cause a vmexit unconditionally, no matter who is running (who means L1, L2, L3...).
>
>> msrs separately; there is no common way that suits all cases.
>> If other features want to intercept a msr on nested entry, they can
>> put the custom code in nested_vmx_merge_msr_bitmap.
>
> Yes, if other features want to do it on 'nested' entry, they can fill in
> nested_vmx_merge_msr_bitmap. But in the non-nested case, it should be
> our responsibility to handle it correctly. How about adding the following check:
>
> if (type & MSR_TYPE_R && !test_bit(msr, vmcs01_msr_bitmap) &&
> !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
>         __clear_bit(msr, msr_bitmap_nested + 0x000 / f);

Anyway, this is not necessary for your current patch. We can consider it later if other features really do use it.

>
>>
>>
>> Thanks,
>> Wincy
>
>
> Best regards,
> Yang
>

Best regards,
Yang
Wincy Van Jan. 28, 2015, 12:45 p.m. UTC | #13
On Wed, Jan 28, 2015 at 8:33 PM, Zhang, Yang Z <yang.z.zhang@intel.com> wrote:
>>>
>>> You are right, but this does not fit all cases; we should
>>> customize the nested_msr_bitmap.
>>> e.g. currently L0 wants to intercept reads of some of the x2apic msrs:
>>>          if (enable_apicv) {
>>>                 for (msr = 0x800; msr <= 0x8ff; msr++)
>> vmx_disable_intercept_msr_read_x2apic(msr);
>>>                 /* According SDM, in x2apic mode, the whole id reg
>>> is
>> used.
>>>                  * But in KVM, it only use the highest eight bits. Need to
>>>                  * intercept it */
>>>                 vmx_enable_intercept_msr_read_x2apic(0x802); /* TMCCT
>>>                 */ vmx_enable_intercept_msr_read_x2apic(0x839); /* TPR
>>>                 */ vmx_disable_intercept_msr_write_x2apic(0x808); /*
>>> EOI
>> */
>>>                 vmx_disable_intercept_msr_write_x2apic(0x80b); /*
>>>                 SELF-IPI */
>>>                 vmx_disable_intercept_msr_write_x2apic(0x83f);
>>>         }
>>> But L1 may not want this. So I think we had better deal with the
>>
>> Actually, from L0's point of view, it is totally unaware of L2. The only
>> thing L0 is aware of is that the CPU should follow L0's configuration when
>> the VCPU is running. So if L0 wants to trap a msr, then a read of this
>> msr should cause a vmexit unconditionally, no matter who is running (who means L1, L2, L3...).
>>
>>> msrs separately; there is no common way that suits all cases.
>>> If other features want to intercept a msr on nested entry, they can
>>> put the custom code in nested_vmx_merge_msr_bitmap.
>>
>> Yes, if other features want to do it on 'nested' entry, they can fill in
>> nested_vmx_merge_msr_bitmap. But in the non-nested case, it should be
>> our responsibility to handle it correctly. How about adding the following check:
>>
>> if (type & MSR_TYPE_R && !test_bit(msr, vmcs01_msr_bitmap) &&
>> !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
>>         __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
>
>
> Anyway, this is not necessary for your current patch. We can consider it later if other features really do use it.
>

Yep, I know what you mean now: for other msrs, whose accesses are not
forwarded by a mechanism like the virtual-apic page, we should intercept
them unconditionally. I think we should ensure the msr can be allowed
before calling nested_vmx_disable_intercept_for_msr; if L0 wants to
intercept it, just do not call nested_vmx_disable_intercept_for_msr.

!test_bit(msr, vmcs01_msr_bitmap) will introduce a problem: some of the
msrs will be affected by vmcs01_msr_bitmap, TMCCT and TPR, for example.
Intercepting reads of these msrs is okay, but it is not efficient.

Thanks,
Wincy
Zhang, Yang Z Jan. 28, 2015, 1:06 p.m. UTC | #14
Wincy Van wrote on 2015-01-28:
> On Wed, Jan 28, 2015 at 8:33 PM, Zhang, Yang Z <yang.z.zhang@intel.com>
> wrote:
>>>>
>>>> You are right, but this does not fit all cases; we should
>>>> customize the nested_msr_bitmap.
>>>> e.g. currently L0 wants to intercept reads of some of the x2apic msrs:
>>>>          if (enable_apicv) {
>>>>                 for (msr = 0x800; msr <= 0x8ff; msr++)
>>>>                 vmx_disable_intercept_msr_read_x2apic(msr); /*
>>>>                 According SDM, in x2apic mode, the whole id reg
>>>> is
>>> used.
>>>>                  * But in KVM, it only use the highest eight bits.
>>>> Need
> to
>>>>                  * intercept it */
>>>>                 vmx_enable_intercept_msr_read_x2apic(0x802); /* TMCCT
>>>>                 */ vmx_enable_intercept_msr_read_x2apic(0x839); /*
>>>>                 TPR */ vmx_disable_intercept_msr_write_x2apic(0x808);
> /*
>>>> EOI
>>> */
>>>>                 vmx_disable_intercept_msr_write_x2apic(0x80b); /*
>>>>                 SELF-IPI */
>>>>                 vmx_disable_intercept_msr_write_x2apic(0x83f);
>>>>         }
>>>> But L1 may not want this. So I think we had better deal with
>>>> the
>>>
>>> Actually, from L0's point of view, it is totally unaware of L2. The only
>>> thing L0 is aware of is that the CPU should follow L0's configuration when
>>> the VCPU is running. So if L0 wants to trap a msr, then a read of this
>>> msr should cause a vmexit unconditionally, no matter who is
>>> running (who means L1, L2, L3...).
>>>
>>>> msrs separately; there is no common way that suits all cases.
>>>> If other features want to intercept a msr on nested entry, they
>>>> can put the custom code in nested_vmx_merge_msr_bitmap.
>>>
>>> Yes, if other features want to do it on 'nested' entry, they can
>>> fill in nested_vmx_merge_msr_bitmap. But in the non-nested case, it
>>> should be our responsibility to handle it correctly. How about adding the following check:
>>>
>>> if (type & MSR_TYPE_R && !test_bit(msr, vmcs01_msr_bitmap) &&
>>> !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
>>>         __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
>>
>>
>> Anyway, this is not necessary for your current patch. We can consider
>> it later if other features really do use it.
>>
>
> Yep, I know what you mean now: for other msrs, whose accesses are not
> forwarded by a mechanism like the virtual-apic page, we should intercept
> them unconditionally. I think we should ensure the msr can be allowed
> before calling nested_vmx_disable_intercept_for_msr; if L0 wants to
> intercept it, just do not call nested_vmx_disable_intercept_for_msr.

Yes, this is a solution. But I prefer to handle it in the nested code path, since others may not be familiar with nested and may block it by mistake.

>
> !test_bit(msr, vmcs01_msr_bitmap) will introduce a problem: some of the
> msrs will be affected by vmcs01_msr_bitmap, TMCCT and TPR, for example.
> Intercepting reads of these msrs is okay, but it is not efficient.

TMCCT is always trapped by most VMMs. I don't think TPR is trapped in KVM.

>
> Thanks,
> Wincy

Best regards,
Yang
Wincy Van Jan. 28, 2015, 1:57 p.m. UTC | #15
On Wed, Jan 28, 2015 at 9:06 PM, Zhang, Yang Z <yang.z.zhang@intel.com> wrote:
>>>>         __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
>>>
>>>
>>> Anyway, this is not necessary for your current patch. We can consider
>>> it later if other features really do use it.
>>>
>>
>> Yep, I know what you mean now: for other msrs, whose accesses are not
>> forwarded by a mechanism like the virtual-apic page, we should intercept
>> them unconditionally. I think we should ensure the msr can be allowed
>> before calling nested_vmx_disable_intercept_for_msr; if L0 wants to
>> intercept it, just do not call nested_vmx_disable_intercept_for_msr.
>
> Yes, this is a solution. But I prefer to handle it in the nested code path, since others may not be familiar with nested and may block it by mistake.
>
>>
>> !test_bit(msr, vmcs01_msr_bitmap) will introduce a problem: some of the
>> msrs will be affected by vmcs01_msr_bitmap, TMCCT and TPR, for example.
>> Intercepting reads of these msrs is okay, but it is not efficient.
>
> TMCCT is always trapped by most VMMs. I don't think TPR is trapped in KVM.
>

Ooops. This piece of code confused me, and I was not thinking about those words; just forget it.
( * ^ _ ^ * )


                vmx_enable_intercept_msr_read_x2apic(0x802);
                /* TMCCT */
                vmx_enable_intercept_msr_read_x2apic(0x839);
                /* TPR */


If any features use it in the future, let's consider this then. Thank you for your patience.


Thanks,
Wincy
Patch

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c987374..36d0724 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -798,6 +798,7 @@  static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
 static unsigned long *vmx_msr_bitmap_legacy_x2apic;
 static unsigned long *vmx_msr_bitmap_longmode_x2apic;
+static unsigned long *vmx_msr_bitmap_nested;
 static unsigned long *vmx_vmread_bitmap;
 static unsigned long *vmx_vmwrite_bitmap;

@@ -5812,13 +5813,18 @@  static __init int hardware_setup(void)
                                (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_msr_bitmap_longmode_x2apic)
                goto out4;
+
+       vmx_msr_bitmap_nested = (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_msr_bitmap_nested)
+               goto out5;
+
        vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_vmread_bitmap)
-               goto out5;
+               goto out6;

        vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_vmwrite_bitmap)
-               goto out6;
+               goto out7;

        memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
        memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
@@ -5834,10 +5840,11 @@  static __init int hardware_setup(void)

        memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
        memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
+       memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);

        if (setup_vmcs_config(&vmcs_config) < 0) {
                r = -EIO;
-               goto out7;
+               goto out8;
        }

        if (boot_cpu_has(X86_FEATURE_NX))
@@ -5944,10 +5951,12 @@  static __init int hardware_setup(void)

        return alloc_kvm_area();

-out7:
+out8:
        free_page((unsigned long)vmx_vmwrite_bitmap);
-out6:
+out7:
        free_page((unsigned long)vmx_vmread_bitmap);
+out6:
+       free_page((unsigned long)vmx_msr_bitmap_nested);
 out5:
        free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
 out4:
@@ -5970,6 +5979,7 @@  static __exit void hardware_unsetup(void)
        free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
        free_page((unsigned long)vmx_msr_bitmap_legacy);
        free_page((unsigned long)vmx_msr_bitmap_longmode);
+       free_page((unsigned long)vmx_msr_bitmap_nested);
        free_page((unsigned long)vmx_io_bitmap_b);
        free_page((unsigned long)vmx_io_bitmap_a);
        free_page((unsigned long)vmx_vmwrite_bitmap);
@@ -8305,6 +8315,38 @@  static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
                      ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
 }

+static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
+                                               struct vmcs12 *vmcs12)
+{
+       int maxphyaddr;
+       u64 addr;
+
+       if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+               return 0;
+
+       if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) {
+               WARN_ON(1);
+               return -EINVAL;
+       }
+       maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+       if (!PAGE_ALIGNED(vmcs12->msr_bitmap) ||
+          ((addr + PAGE_SIZE) >> maxphyaddr))
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Merge L0's and L1's MSR bitmap, return false to indicate that
+ * we do not use the hardware.
+ */
+static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
+                                              struct vmcs12 *vmcs12)
+{
+       return false;
+}
+
 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
                                       unsigned long count_field,