
[RFC,2/7] KVM: VMX: Expose IA32_PKRS MSR

Message ID 20200807084841.7112-3-chenyi.qiang@intel.com (mailing list archive)
State New, archived
Series KVM: PKS Virtualization support

Commit Message

Chenyi Qiang Aug. 7, 2020, 8:48 a.m. UTC
Protection Keys for Supervisor Pages (PKS) uses the IA32_PKRS MSR (PKRS) at
index 0x6E1 to allow software to manage supervisor protection key
rights. For performance reasons, the PKRS intercept is disabled so that
the guest can access PKRS without VM exits.
PKS introduces dedicated control fields in the VMCS to switch PKRS, which
only cover the restore part. In addition, every VM exit saves PKRS into
the guest-state area of the VMCS, while VM entry won't save the host value,
on the expectation that the host won't change the MSR often. Update
the host's value in the VMCS manually if the MSR has been changed by the
kernel since the last time the VMCS was run.
The function get_current_pkrs() in arch/x86/mm/pkeys.c exports the
per-cpu variable pkrs_cache to avoid frequent rdmsr of PKRS.

Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com>
---
 arch/x86/include/asm/pkeys.h    |  1 +
 arch/x86/kvm/vmx/capabilities.h |  6 +++
 arch/x86/kvm/vmx/nested.c       |  1 +
 arch/x86/kvm/vmx/vmcs.h         |  1 +
 arch/x86/kvm/vmx/vmx.c          | 66 ++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.h              |  6 +++
 arch/x86/mm/pkeys.c             |  6 +++
 include/linux/pkeys.h           |  4 ++
 8 files changed, 89 insertions(+), 2 deletions(-)
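
For reference, IA32_PKRS uses the same two-bits-per-key layout as PKRU:
for protection key k, bit 2k is AD (access-disable) and bit 2k+1 is WD
(write-disable), with bits 63:32 reserved. A minimal sketch of the
encoding (the helper names here are illustrative, not from this series):

#include <stdbool.h>
#include <stdint.h>

/* IA32_PKRS: 2 bits per supervisor protection key, keys 0-15 in bits 31:0. */
#define PKR_AD_BIT(pkey)	(1u << ((pkey) * 2))		/* access disable */
#define PKR_WD_BIT(pkey)	(1u << ((pkey) * 2 + 1))	/* write disable */

static bool pks_key_allows_read(uint32_t pkrs, int pkey)
{
	return !(pkrs & PKR_AD_BIT(pkey));
}

static bool pks_key_allows_write(uint32_t pkrs, int pkey)
{
	/* A write is blocked if either AD or WD is set for the key. */
	return !(pkrs & (PKR_AD_BIT(pkey) | PKR_WD_BIT(pkey)));
}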

Comments

Jim Mattson Aug. 12, 2020, 9:21 p.m. UTC | #1
On Fri, Aug 7, 2020 at 1:46 AM Chenyi Qiang <chenyi.qiang@intel.com> wrote:
>
> Protection Keys for Supervisor Pages (PKS) uses the IA32_PKRS MSR (PKRS) at
> index 0x6E1 to allow software to manage supervisor protection key
> rights. For performance reasons, the PKRS intercept is disabled so that
> the guest can access PKRS without VM exits.
> PKS introduces dedicated control fields in the VMCS to switch PKRS, which
> only cover the restore part. In addition, every VM exit saves PKRS into
> the guest-state area of the VMCS, while VM entry won't save the host value,
> on the expectation that the host won't change the MSR often. Update
> the host's value in the VMCS manually if the MSR has been changed by the
> kernel since the last time the VMCS was run.
> The function get_current_pkrs() in arch/x86/mm/pkeys.c exports the
> per-cpu variable pkrs_cache to avoid frequent rdmsr of PKRS.
>
> Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com>
> ---

> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 11e4df560018..df2c2e733549 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -289,6 +289,7 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
>         dest->ds_sel = src->ds_sel;
>         dest->es_sel = src->es_sel;
>  #endif
> +       dest->pkrs = src->pkrs;

Why isn't this (and other PKRS code) inside the #ifdef CONFIG_X86_64?
PKRS isn't usable outside of long mode, is it?

>  }
>
>  static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
> diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
> index 7a3675fddec2..39ec3d0c844b 100644
> --- a/arch/x86/kvm/vmx/vmcs.h
> +++ b/arch/x86/kvm/vmx/vmcs.h
> @@ -40,6 +40,7 @@ struct vmcs_host_state {
>  #ifdef CONFIG_X86_64
>         u16           ds_sel, es_sel;
>  #endif
> +       u32           pkrs;

One thing that does seem odd throughout is that PKRS is a 64-bit
resource, but we store and pass around only 32-bits. Yes, the top 32
bits are currently reserved, but the same can be said of, say, cr4, a
few lines above this. Yet, we store and pass around cr4 as 64-bits.
How do we decide?

>  };
>
>  struct vmcs_controls_shadow {

> @@ -1163,6 +1164,20 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
>          */
>         host_state->ldt_sel = kvm_read_ldt();
>
> +       /*
> +        * Update the host pkrs vmcs field before vcpu runs.
> +        * The setting of VM_EXIT_LOAD_IA32_PKRS can ensure
> +        * kvm_cpu_cap_has(X86_FEATURE_PKS) &&
> +        * guest_cpuid_has(vcpu, X86_FEATURE_PKS).
> +        */
> +       if (vm_exit_controls_get(vmx) & VM_EXIT_LOAD_IA32_PKRS) {
> +               host_pkrs = get_current_pkrs();
> +               if (unlikely(host_pkrs != host_state->pkrs)) {
> +                       vmcs_write64(HOST_IA32_PKRS, host_pkrs);
> +                       host_state->pkrs = host_pkrs;
> +               }
> +       }
> +
>  #ifdef CONFIG_X86_64
>         savesegment(ds, host_state->ds_sel);
>         savesegment(es, host_state->es_sel);
> @@ -1951,6 +1966,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>                 else
>                         msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
>                 break;
> +       case MSR_IA32_PKRS:
> +               if (!kvm_cpu_cap_has(X86_FEATURE_PKS) ||
> +                   (!msr_info->host_initiated &&
> +                   !guest_cpuid_has(vcpu, X86_FEATURE_PKS)))

Could this be simplified if we required
kvm_cpu_cap_has(X86_FEATURE_PKS) as a prerequisite for
guest_cpuid_has(vcpu, X86_FEATURE_PKS)? If not, what is the expected
behavior if guest_cpuid_has(vcpu, X86_FEATURE_PKS) is true and
kvm_cpu_cap_has(X86_FEATURE_PKS)  is false?

> +                       return 1;
> +               msr_info->data = vmcs_read64(GUEST_IA32_PKRS);
> +               break;
>         case MSR_TSC_AUX:
>                 if (!msr_info->host_initiated &&
>                     !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))

> @@ -7230,6 +7267,26 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
>                 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
>  }
>
> +static void vmx_update_pkrs_cfg(struct kvm_vcpu *vcpu)
> +{
> +       struct vcpu_vmx *vmx = to_vmx(vcpu);
> +       unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
> +       bool pks_supported = guest_cpuid_has(vcpu, X86_FEATURE_PKS);
> +
> +       /*
> +        * set intercept for PKRS when the guest doesn't support pks
> +        */
> +       vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_PKRS, MSR_TYPE_RW, !pks_supported);
> +
> +       if (pks_supported) {
> +               vm_entry_controls_setbit(vmx, VM_ENTRY_LOAD_IA32_PKRS);
> +               vm_exit_controls_setbit(vmx, VM_EXIT_LOAD_IA32_PKRS);
> +       } else {
> +               vm_entry_controls_clearbit(vmx, VM_ENTRY_LOAD_IA32_PKRS);
> +               vm_exit_controls_clearbit(vmx, VM_EXIT_LOAD_IA32_PKRS);
> +       }
> +}

Not just your change, but it is unclear to me what the expected
behavior is when CPUID bits are modified while the guest is running.
For example, it seems that this code results in immediate behavioral
changes for an L1 guest; however, if an L2 guest is active, then there
are no behavioral changes until the next emulated VM-entry from L1 to
L2. Is that right?

On a more general note, do you have any kvm-unit-tests that exercise
the new code?
Chenyi Qiang Aug. 13, 2020, 5:42 a.m. UTC | #2
On 8/13/2020 5:21 AM, Jim Mattson wrote:
> On Fri, Aug 7, 2020 at 1:46 AM Chenyi Qiang <chenyi.qiang@intel.com> wrote:
>> [...]
> 
>> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
>> index 11e4df560018..df2c2e733549 100644
>> --- a/arch/x86/kvm/vmx/nested.c
>> +++ b/arch/x86/kvm/vmx/nested.c
>> @@ -289,6 +289,7 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
>>          dest->ds_sel = src->ds_sel;
>>          dest->es_sel = src->es_sel;
>>   #endif
>> +       dest->pkrs = src->pkrs;
> 
> Why isn't this (and other PKRS code) inside the #ifdef CONFIG_X86_64?
> PKRS isn't usable outside of long mode, is it?
> 

Yes, I'm also thinking about whether to put all the PKS code under
CONFIG_X86_64. The kernel implementation also wraps its PKS code inside
CONFIG_ARCH_HAS_SUPERVISOR_PKEYS, which depends on CONFIG_X86_64.
However, maybe this can help when the host kernel disables PKS but the
guest enables it. What do you think?


>>   }
>>
>>   static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
>> diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
>> index 7a3675fddec2..39ec3d0c844b 100644
>> --- a/arch/x86/kvm/vmx/vmcs.h
>> +++ b/arch/x86/kvm/vmx/vmcs.h
>> @@ -40,6 +40,7 @@ struct vmcs_host_state {
>>   #ifdef CONFIG_X86_64
>>          u16           ds_sel, es_sel;
>>   #endif
>> +       u32           pkrs;
> 
> One thing that does seem odd throughout is that PKRS is a 64-bit
> resource, but we store and pass around only 32-bits. Yes, the top 32
> bits are currently reserved, but the same can be said of, say, cr4, a
> few lines above this. Yet, we store and pass around cr4 as 64-bits.
> How do we decide?
> 

IMO, if the high 32 bits of PKRS are reserved as zero, it's OK to use u32.
I defined it as u32 just to follow the definition of pkrs_cache in the
kernel code.

>>   };
>>
>>   struct vmcs_controls_shadow {
> 
>> @@ -1163,6 +1164,20 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
>>           */
>>          host_state->ldt_sel = kvm_read_ldt();
>>
>> +       /*
>> +        * Update the host pkrs vmcs field before vcpu runs.
>> +        * The setting of VM_EXIT_LOAD_IA32_PKRS can ensure
>> +        * kvm_cpu_cap_has(X86_FEATURE_PKS) &&
>> +        * guest_cpuid_has(vcpu, X86_FEATURE_PKS).
>> +        */
>> +       if (vm_exit_controls_get(vmx) & VM_EXIT_LOAD_IA32_PKRS) {
>> +               host_pkrs = get_current_pkrs();
>> +               if (unlikely(host_pkrs != host_state->pkrs)) {
>> +                       vmcs_write64(HOST_IA32_PKRS, host_pkrs);
>> +                       host_state->pkrs = host_pkrs;
>> +               }
>> +       }
>> +
>>   #ifdef CONFIG_X86_64
>>          savesegment(ds, host_state->ds_sel);
>>          savesegment(es, host_state->es_sel);
>> @@ -1951,6 +1966,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>>                  else
>>                          msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
>>                  break;
>> +       case MSR_IA32_PKRS:
>> +               if (!kvm_cpu_cap_has(X86_FEATURE_PKS) ||
>> +                   (!msr_info->host_initiated &&
>> +                   !guest_cpuid_has(vcpu, X86_FEATURE_PKS)))
> 
> Could this be simplified if we required
> kvm_cpu_cap_has(X86_FEATURE_PKS) as a prerequisite for
> guest_cpuid_has(vcpu, X86_FEATURE_PKS)? If not, what is the expected
> behavior if guest_cpuid_has(vcpu, X86_FEATURE_PKS) is true and
> kvm_cpu_cap_has(X86_FEATURE_PKS)  is false?
> 

Yes, kvm_cpu_cap_has() is a prerequisite for guest_cpuid_has(), as we
have already done that check in vmx_cpuid_update(). Here I add the
kvm_cpu_cap_has() check to ensure that vmcs_read64(GUEST_IA32_PKRS)
executes correctly in the case of a host-initiated (userspace) access.

>> +                       return 1;
>> +               msr_info->data = vmcs_read64(GUEST_IA32_PKRS);
>> +               break;
>>          case MSR_TSC_AUX:
>>                  if (!msr_info->host_initiated &&
>>                      !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
> 
>> @@ -7230,6 +7267,26 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
>>                  vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
>>   }
>>
>> +static void vmx_update_pkrs_cfg(struct kvm_vcpu *vcpu)
>> +{
>> +       struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +       unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
>> +       bool pks_supported = guest_cpuid_has(vcpu, X86_FEATURE_PKS);
>> +
>> +       /*
>> +        * set intercept for PKRS when the guest doesn't support pks
>> +        */
>> +       vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_PKRS, MSR_TYPE_RW, !pks_supported);
>> +
>> +       if (pks_supported) {
>> +               vm_entry_controls_setbit(vmx, VM_ENTRY_LOAD_IA32_PKRS);
>> +               vm_exit_controls_setbit(vmx, VM_EXIT_LOAD_IA32_PKRS);
>> +       } else {
>> +               vm_entry_controls_clearbit(vmx, VM_ENTRY_LOAD_IA32_PKRS);
>> +               vm_exit_controls_clearbit(vmx, VM_EXIT_LOAD_IA32_PKRS);
>> +       }
>> +}
> 
> Not just your change, but it is unclear to me what the expected
> behavior is when CPUID bits are modified while the guest is running.
> For example, it seems that this code results in immediate behavioral
> changes for an L1 guest; however, if an L2 guest is active, then there
> are no behavioral changes until the next emulated VM-entry from L1 to
> L2. Is that right?
> 

I don't know if there is a way to deal with CPUID modification in
KVM while the guest is running. Some CPUID modifications, like
X86_FEATURE_OSPKE, happen when the guest sets CR4.PKE. But I'm not
familiar with your case.

Paolo, what's your opinion?


> On a more general note, do you have any kvm-unit-tests that exercise
> the new code?
> 

Yes, I'll attach the kvm-unit-tests in the next version.
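
For illustration, a minimal round-trip test might look like the sketch
below (hypothetical, not from this series: it assumes the x86
kvm-unit-tests harness and a vCPU that enumerates PKS; the MSR index
0x6E1 is from the SDM):

/* Hypothetical sketch of a kvm-unit-tests case. */
#include "libcflat.h"
#include "processor.h"

#define MSR_IA32_PKRS	0x6E1

int main(void)
{
	/* WD for key 1 (bit 3) and AD for key 2 (bit 4); key 0 stays open. */
	u64 val = (1ull << 3) | (1ull << 4);

	wrmsr(MSR_IA32_PKRS, val);
	report(rdmsr(MSR_IA32_PKRS) == val, "PKRS write/read round-trip");

	wrmsr(MSR_IA32_PKRS, 0);
	report(rdmsr(MSR_IA32_PKRS) == 0, "PKRS cleared");

	return report_summary();
}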
Jim Mattson Aug. 13, 2020, 5:31 p.m. UTC | #3
On Wed, Aug 12, 2020 at 10:42 PM Chenyi Qiang <chenyi.qiang@intel.com> wrote:
>
> On 8/13/2020 5:21 AM, Jim Mattson wrote:
> > [...]
> >
> > Why isn't this (and other PKRS code) inside the #ifdef CONFIG_X86_64?
> > PKRS isn't usable outside of long mode, is it?
> >
>
> Yes, I'm also thinking about whether to put all the PKS code under
> CONFIG_X86_64. The kernel implementation also wraps its PKS code inside
> CONFIG_ARCH_HAS_SUPERVISOR_PKEYS, which depends on CONFIG_X86_64.
> However, maybe this can help when the host kernel disables PKS but the
> guest enables it. What do you think?

I see no problem in exposing PKRS to the guest even if the host
doesn't have CONFIG_ARCH_HAS_SUPERVISOR_PKEYS.
Chenyi Qiang Aug. 18, 2020, 7:27 a.m. UTC | #4
On 8/14/2020 1:31 AM, Jim Mattson wrote:
> On Wed, Aug 12, 2020 at 10:42 PM Chenyi Qiang <chenyi.qiang@intel.com> wrote:
>>
>> On 8/13/2020 5:21 AM, Jim Mattson wrote:
>>> [...]
>>>
>>> Why isn't this (and other PKRS code) inside the #ifdef CONFIG_X86_64?
>>> PKRS isn't usable outside of long mode, is it?
>>>
>>
>> Yes, I'm also thinking about whether to put all the PKS code under
>> CONFIG_X86_64. The kernel implementation also wraps its PKS code inside
>> CONFIG_ARCH_HAS_SUPERVISOR_PKEYS, which depends on CONFIG_X86_64.
>> However, maybe this can help when the host kernel disables PKS but the
>> guest enables it. What do you think?
> 
> I see no problem in exposing PKRS to the guest even if the host
> doesn't have CONFIG_ARCH_HAS_SUPERVISOR_PKEYS.
> 

Yes, but I would prefer to keep it outside CONFIG_X86_64. The PKS code
has several code blocks, and putting them under x86_64 may end up being
a mess. In addition, the PKU-related KVM code isn't under CONFIG_X86_64
either. So, is it really necessary to put it inside?
Jim Mattson Aug. 18, 2020, 6:23 p.m. UTC | #5
On Tue, Aug 18, 2020 at 12:28 AM Chenyi Qiang <chenyi.qiang@intel.com> wrote:
>
> On 8/14/2020 1:31 AM, Jim Mattson wrote:
> > On Wed, Aug 12, 2020 at 10:42 PM Chenyi Qiang <chenyi.qiang@intel.com> wrote:
> >> On 8/13/2020 5:21 AM, Jim Mattson wrote:
> >>> [...]
> >>>
> >>> Why isn't this (and other PKRS code) inside the #ifdef CONFIG_X86_64?
> >>> PKRS isn't usable outside of long mode, is it?
> >>>
> >>
> >> Yes, I'm also thinking about whether to put all the PKS code under
> >> CONFIG_X86_64. The kernel implementation also wraps its PKS code inside
> >> CONFIG_ARCH_HAS_SUPERVISOR_PKEYS, which depends on CONFIG_X86_64.
> >> However, maybe this can help when the host kernel disables PKS but the
> >> guest enables it. What do you think?
> >
> > I see no problem in exposing PKRS to the guest even if the host
> > doesn't have CONFIG_ARCH_HAS_SUPERVISOR_PKEYS.
> >
>
> Yes, but I would prefer to keep it outside CONFIG_X86_64. The PKS code
> has several code blocks, and putting them under x86_64 may end up being
> a mess. In addition, the PKU-related KVM code isn't under CONFIG_X86_64
> either. So, is it really necessary to put it inside?

I'll let someone who actually cares about the i386 build answer that question.
Sean Christopherson Aug. 22, 2020, 3:28 a.m. UTC | #6
On Tue, Aug 18, 2020 at 11:23:47AM -0700, Jim Mattson wrote:
> On Tue, Aug 18, 2020 at 12:28 AM Chenyi Qiang <chenyi.qiang@intel.com> wrote:
> >
> > On 8/14/2020 1:31 AM, Jim Mattson wrote:
> > > On Wed, Aug 12, 2020 at 10:42 PM Chenyi Qiang <chenyi.qiang@intel.com> wrote:
> > >> On 8/13/2020 5:21 AM, Jim Mattson wrote:
> > >>> [...]
> > >>>
> > >>> Why isn't this (and other PKRS code) inside the #ifdef CONFIG_X86_64?
> > >>> PKRS isn't usable outside of long mode, is it?
> > >>>
> > >>
> > >> Yes, I'm also thinking about whether to put all the PKS code under
> > >> CONFIG_X86_64. The kernel implementation also wraps its PKS code inside
> > >> CONFIG_ARCH_HAS_SUPERVISOR_PKEYS, which depends on CONFIG_X86_64.
> > >> However, maybe this can help when the host kernel disables PKS but the
> > >> guest enables it. What do you think?
> > >
> > > I see no problem in exposing PKRS to the guest even if the host
> > > doesn't have CONFIG_ARCH_HAS_SUPERVISOR_PKEYS.
> > >
> >
> > Yes, but I would prefer to keep it outside CONFIG_X86_64. The PKS code
> > has several code blocks, and putting them under x86_64 may end up being
> > a mess. In addition, the PKU-related KVM code isn't under CONFIG_X86_64
> > either. So, is it really necessary to put it inside?
> 
> I'll let someone who actually cares about the i386 build answer that question.

Ha, I care about the i386 build to the extent that it doesn't break.  I
don't care at all about shaving cycles/memory for things like this.  Given
how long some KVM i386 bugs have gone unnoticed, I'm not sure there's
anyone who cares about KVM i386 these days :-)
Paolo Bonzini Jan. 26, 2021, 6:01 p.m. UTC | #7
On 07/08/20 10:48, Chenyi Qiang wrote:
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
> +	bool pks_supported = guest_cpuid_has(vcpu, X86_FEATURE_PKS);
> +
> +	/*
> +	 * set intercept for PKRS when the guest doesn't support pks
> +	 */
> +	vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_PKRS, MSR_TYPE_RW, !pks_supported);
> +
> +	if (pks_supported) {
> +		vm_entry_controls_setbit(vmx, VM_ENTRY_LOAD_IA32_PKRS);
> +		vm_exit_controls_setbit(vmx, VM_EXIT_LOAD_IA32_PKRS);
> +	} else {
> +		vm_entry_controls_clearbit(vmx, VM_ENTRY_LOAD_IA32_PKRS);
> +		vm_exit_controls_clearbit(vmx, VM_EXIT_LOAD_IA32_PKRS);
> +	}

Is the guest expected to do a lot of reads/writes to the MSR (e.g. at 
every context switch)?

Even if this is the case, the MSR intercepts and the entry/exit controls 
should only be done if CR4.PKS=1.  If the guest does not use PKS, KVM 
should behave as if these patches did not exist.

Paolo
Chenyi Qiang Jan. 27, 2021, 7:55 a.m. UTC | #8
On 1/27/2021 2:01 AM, Paolo Bonzini wrote:
> On 07/08/20 10:48, Chenyi Qiang wrote:
>> +{
>> +    struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +    unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
>> +    bool pks_supported = guest_cpuid_has(vcpu, X86_FEATURE_PKS);
>> +
>> +    /*
>> +     * set intercept for PKRS when the guest doesn't support pks
>> +     */
>> +    vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_PKRS, MSR_TYPE_RW, 
>> !pks_supported);
>> +
>> +    if (pks_supported) {
>> +        vm_entry_controls_setbit(vmx, VM_ENTRY_LOAD_IA32_PKRS);
>> +        vm_exit_controls_setbit(vmx, VM_EXIT_LOAD_IA32_PKRS);
>> +    } else {
>> +        vm_entry_controls_clearbit(vmx, VM_ENTRY_LOAD_IA32_PKRS);
>> +        vm_exit_controls_clearbit(vmx, VM_EXIT_LOAD_IA32_PKRS);
>> +    }
> 
> Is the guest expected to do a lot of reads/writes to the MSR (e.g. at 
> every context switch)?
> 

In the current design for PKS, PMEM stray write protection is the only
implemented use case, and PKRS is only temporarily changed during
specific code paths. Thus reads/writes of the MSR are not very frequent,
I think.

> Even if this is the case, the MSR intercepts and the entry/exit controls 
> should only be done if CR4.PKS=1.  If the guest does not use PKS, KVM 
> should behave as if these patches did not exist.
> 


I pass through PKRS and enable the entry/exit controls when PKS is
supported, and just want to narrow down the window of the MSR switch
during VMX transitions. But yeah, I should also consider the enabling
status of guest PKS according to CR4.PKS; I will fix it in the next
version.

> Paolo
>
Chenyi Qiang Feb. 1, 2021, 9:53 a.m. UTC | #9
On 1/27/2021 2:01 AM, Paolo Bonzini wrote:
> On 07/08/20 10:48, Chenyi Qiang wrote:
>> +{
>> +    struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +    unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
>> +    bool pks_supported = guest_cpuid_has(vcpu, X86_FEATURE_PKS);
>> +
>> +    /*
>> +     * set intercept for PKRS when the guest doesn't support pks
>> +     */
>> +    vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_PKRS, MSR_TYPE_RW, 
>> !pks_supported);
>> +
>> +    if (pks_supported) {
>> +        vm_entry_controls_setbit(vmx, VM_ENTRY_LOAD_IA32_PKRS);
>> +        vm_exit_controls_setbit(vmx, VM_EXIT_LOAD_IA32_PKRS);
>> +    } else {
>> +        vm_entry_controls_clearbit(vmx, VM_ENTRY_LOAD_IA32_PKRS);
>> +        vm_exit_controls_clearbit(vmx, VM_EXIT_LOAD_IA32_PKRS);
>> +    }
> 
> Is the guest expected to do a lot of reads/writes to the MSR (e.g. at 
> every context switch)?
> 
> Even if this is the case, the MSR intercepts and the entry/exit controls 
> should only be done if CR4.PKS=1.  If the guest does not use PKS, KVM 
> should behave as if these patches did not exist.
> 

Hi Paolo,

Per the MSR intercepts and entry/exit controls, IA32_PKRS access is
independent of the CR4.PKS bit; it depends only on CPUID enumeration. If
the guest doesn't set CR4.PKS but still has the CPUID capability,
modifying PKRS should be supported even though it has no effect. IIUC, we
cannot ignore these controls if CR4.PKS=0.

Thanks
Chenyi

> Paolo
>
Paolo Bonzini Feb. 1, 2021, 10:05 a.m. UTC | #10
On 01/02/21 10:53, Chenyi Qiang wrote:
>>>
>>
>> Is the guest expected to do a lot of reads/writes to the MSR (e.g. at 
>> every context switch)?
>>
>> Even if this is the case, the MSR intercepts and the entry/exit 
>> controls should only be done if CR4.PKS=1.  If the guest does not use 
>> PKS, KVM should behave as if these patches did not exist.
>>
> 
> Hi Paolo,
> 
> Per the MSR intercepts and entry/exit controls, IA32_PKRS access is 
> independent of the CR4.PKS bit, it just depends on CPUID enumeration. If 
> the guest doesn't set CR4.PKS but still has the CPUID capability, 
> modifying on PKRS should be supported but has no effect. IIUC, we can 
> not ignore these controls if CR4.PKS=0.

Understood, I wanted to avoid paying the price (if any) of loading PKRS 
on vmentry and vmexit not just if CPUID.PKS=0, but also if CR4.PKS=0. 
If CR4.PKS=0, it would be nicer to enable the MSR intercept and disable 
the vmentry/vmexit controls; just run the guest with the host value of 
IA32_PKRS.

Paolo
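
As a rough illustration of the direction Paolo describes, the hook might
be shaped like the sketch below. This is an assumption-laden sketch, not
part of this series: X86_CR4_PKS comes from the PKS base series, and the
helper would have to be invoked from the CR4 write path as well as from
vmx_cpuid_update(); nested (L2) interactions are ignored.

/* Hypothetical sketch: PKRS passthrough conditional on CR4.PKS. */
static void vmx_update_pkrs_cfg(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool pks_on = guest_cpuid_has(vcpu, X86_FEATURE_PKS) &&
		      kvm_read_cr4_bits(vcpu, X86_CR4_PKS);

	/* Intercept PKRS unless the guest has actually enabled PKS. */
	vmx_set_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PKRS,
				  MSR_TYPE_RW, !pks_on);

	if (pks_on) {
		vm_entry_controls_setbit(vmx, VM_ENTRY_LOAD_IA32_PKRS);
		vm_exit_controls_setbit(vmx, VM_EXIT_LOAD_IA32_PKRS);
	} else {
		/* Leave the host value of IA32_PKRS in place across entry/exit. */
		vm_entry_controls_clearbit(vmx, VM_ENTRY_LOAD_IA32_PKRS);
		vm_exit_controls_clearbit(vmx, VM_EXIT_LOAD_IA32_PKRS);
	}
}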

Patch

diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index 097abca7784c..d7c405d6eea6 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -142,6 +142,7 @@  u32 get_new_pkr(u32 old_pkr, int pkey, unsigned long init_val);
 int pks_key_alloc(const char *const pkey_user);
 void pks_key_free(int pkey);
 u32 get_new_pkr(u32 old_pkr, int pkey, unsigned long init_val);
+u32 get_current_pkrs(void);
 
 /*
  * pks_update_protection - Update the protection of the specified key
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 4bbd8b448d22..7099e3105f48 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -103,6 +103,12 @@  static inline bool cpu_has_load_perf_global_ctrl(void)
 	       (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
 }
 
+static inline bool cpu_has_load_ia32_pkrs(void)
+{
+	return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PKRS) &&
+	       (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PKRS);
+}
+
 static inline bool cpu_has_vmx_mpx(void)
 {
 	return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 11e4df560018..df2c2e733549 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -289,6 +289,7 @@  static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
 	dest->ds_sel = src->ds_sel;
 	dest->es_sel = src->es_sel;
 #endif
+	dest->pkrs = src->pkrs;
 }
 
 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 7a3675fddec2..39ec3d0c844b 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -40,6 +40,7 @@  struct vmcs_host_state {
 #ifdef CONFIG_X86_64
 	u16           ds_sel, es_sel;
 #endif
+	u32           pkrs;
 };
 
 struct vmcs_controls_shadow {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 13745f2a5ecd..d91d59fb46fa 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1131,6 +1131,7 @@  void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
 #endif
 	unsigned long fs_base, gs_base;
 	u16 fs_sel, gs_sel;
+	u32 host_pkrs;
 	int i;
 
 	vmx->req_immediate_exit = false;
@@ -1163,6 +1164,20 @@  void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
 	 */
 	host_state->ldt_sel = kvm_read_ldt();
 
+	/*
+	 * Update the host pkrs vmcs field before vcpu runs.
+	 * The setting of VM_EXIT_LOAD_IA32_PKRS can ensure
+	 * kvm_cpu_cap_has(X86_FEATURE_PKS) &&
+	 * guest_cpuid_has(vcpu, X86_FEATURE_PKS).
+	 */
+	if (vm_exit_controls_get(vmx) & VM_EXIT_LOAD_IA32_PKRS) {
+		host_pkrs = get_current_pkrs();
+		if (unlikely(host_pkrs != host_state->pkrs)) {
+			vmcs_write64(HOST_IA32_PKRS, host_pkrs);
+			host_state->pkrs = host_pkrs;
+		}
+	}
+
 #ifdef CONFIG_X86_64
 	savesegment(ds, host_state->ds_sel);
 	savesegment(es, host_state->es_sel);
@@ -1951,6 +1966,13 @@  static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		else
 			msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
 		break;
+	case MSR_IA32_PKRS:
+		if (!kvm_cpu_cap_has(X86_FEATURE_PKS) ||
+		    (!msr_info->host_initiated &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_PKS)))
+			return 1;
+		msr_info->data = vmcs_read64(GUEST_IA32_PKRS);
+		break;
 	case MSR_TSC_AUX:
 		if (!msr_info->host_initiated &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
@@ -2221,6 +2243,15 @@  static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		else
 			vmx->pt_desc.guest.addr_a[index / 2] = data;
 		break;
+	case MSR_IA32_PKRS:
+		if (!kvm_pkrs_valid(data))
+			return 1;
+		if (!kvm_cpu_cap_has(X86_FEATURE_PKS) ||
+		    (!msr_info->host_initiated &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_PKS)))
+			return 1;
+		vmcs_write64(GUEST_IA32_PKRS, data);
+		break;
 	case MSR_TSC_AUX:
 		if (!msr_info->host_initiated &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
@@ -2510,7 +2541,8 @@  static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 	      VM_EXIT_LOAD_IA32_EFER |
 	      VM_EXIT_CLEAR_BNDCFGS |
 	      VM_EXIT_PT_CONCEAL_PIP |
-	      VM_EXIT_CLEAR_IA32_RTIT_CTL;
+	      VM_EXIT_CLEAR_IA32_RTIT_CTL |
+	      VM_EXIT_LOAD_IA32_PKRS;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
 				&_vmexit_control) < 0)
 		return -EIO;
@@ -2534,7 +2566,8 @@  static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 	      VM_ENTRY_LOAD_IA32_EFER |
 	      VM_ENTRY_LOAD_BNDCFGS |
 	      VM_ENTRY_PT_CONCEAL_PIP |
-	      VM_ENTRY_LOAD_IA32_RTIT_CTL;
+	      VM_ENTRY_LOAD_IA32_RTIT_CTL |
+	      VM_ENTRY_LOAD_IA32_PKRS;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
 				&_vmentry_control) < 0)
 		return -EIO;
@@ -5868,6 +5901,8 @@  void dump_vmcs(void)
 		       vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
 	if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
 		pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
+	if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PKRS)
+		pr_err("PKRS = 0x%016llx\n", vmcs_read64(GUEST_IA32_PKRS));
 	pr_err("Interruptibility = %08x  ActivityState = %08x\n",
 	       vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
 	       vmcs_read32(GUEST_ACTIVITY_STATE));
@@ -5903,6 +5938,8 @@  void dump_vmcs(void)
 	    vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
 		pr_err("PerfGlobCtl = 0x%016llx\n",
 		       vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
+	if (vmexit_ctl & VM_EXIT_LOAD_IA32_PKRS)
+		pr_err("PKRS = 0x%016llx\n", vmcs_read64(HOST_IA32_PKRS));
 
 	pr_err("*** Control State ***\n");
 	pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
@@ -7230,6 +7267,26 @@  static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
 		vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
 }
 
+static void vmx_update_pkrs_cfg(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+	bool pks_supported = guest_cpuid_has(vcpu, X86_FEATURE_PKS);
+
+	/*
+	 * set intercept for PKRS when the guest doesn't support pks
+	 */
+	vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_PKRS, MSR_TYPE_RW, !pks_supported);
+
+	if (pks_supported) {
+		vm_entry_controls_setbit(vmx, VM_ENTRY_LOAD_IA32_PKRS);
+		vm_exit_controls_setbit(vmx, VM_EXIT_LOAD_IA32_PKRS);
+	} else {
+		vm_entry_controls_clearbit(vmx, VM_ENTRY_LOAD_IA32_PKRS);
+		vm_exit_controls_clearbit(vmx, VM_EXIT_LOAD_IA32_PKRS);
+	}
+}
+
 static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7251,6 +7308,11 @@  static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 			~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
 			  FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
 
+	if (kvm_cpu_cap_has(X86_FEATURE_PKS))
+		vmx_update_pkrs_cfg(vcpu);
+	else
+		guest_cpuid_clear(vcpu, X86_FEATURE_PKS);
+
 	if (nested_vmx_allowed(vcpu)) {
 		nested_vmx_cr_fixed1_bits_update(vcpu);
 		nested_vmx_entry_exit_ctls_update(vcpu);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 6eb62e97e59f..7fb206f98bed 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -361,6 +361,12 @@  static inline bool kvm_dr7_valid(u64 data)
 	return !(data >> 32);
 }
 
+static inline bool kvm_pkrs_valid(u64 data)
+{
+	/* Bits [63:32] are reserved and must be zero */
+	return !(data >> 32);
+}
+
 void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
 void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
 u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index cc9a28a55ba3..b237c54074ba 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -333,6 +333,12 @@  void pks_key_free(int pkey)
 }
 EXPORT_SYMBOL_GPL(pks_key_free);
 
+u32 get_current_pkrs(void)
+{
+	return this_cpu_read(pkrs_cache);
+}
+EXPORT_SYMBOL_GPL(get_current_pkrs);
+
 static int pks_keys_allocated_show(struct seq_file *m, void *p)
 {
 	int i;
diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h
index 1d84ab7c12d4..8ac90fae137f 100644
--- a/include/linux/pkeys.h
+++ b/include/linux/pkeys.h
@@ -66,6 +66,10 @@  static inline void pks_update_protection(int pkey, unsigned long protection)
 {
 	WARN_ON_ONCE(1);
 }
+static inline u32 get_current_pkrs(void)
+{
+	return 0;
+}
 #endif /* ! CONFIG_ARCH_HAS_SUPERVISOR_PKEYS */
 
 #endif /* _LINUX_PKEYS_H */