diff mbox series

[RFC,7/7] KVM: VMX: Enable PKS for nested VM

Message ID 20200807084841.7112-8-chenyi.qiang@intel.com (mailing list archive)
State New, archived
Headers show
Series KVM: PKS Virtualization support | expand

Commit Message

Chenyi Qiang Aug. 7, 2020, 8:48 a.m. UTC
PKS MSR passes through guest directly. Configure the MSR to match the
L0/L1 settings so that nested VM runs PKS properly.

Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com>
---
 arch/x86/kvm/vmx/nested.c | 32 ++++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/vmcs12.c |  2 ++
 arch/x86/kvm/vmx/vmcs12.h |  6 +++++-
 arch/x86/kvm/vmx/vmx.c    | 10 ++++++++++
 arch/x86/kvm/vmx/vmx.h    |  1 +
 5 files changed, 50 insertions(+), 1 deletion(-)

Comments

Jim Mattson Aug. 11, 2020, 12:05 a.m. UTC | #1
On Fri, Aug 7, 2020 at 1:47 AM Chenyi Qiang <chenyi.qiang@intel.com> wrote:
>
> PKS MSR passes through guest directly. Configure the MSR to match the
> L0/L1 settings so that nested VM runs PKS properly.
>
> Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com>
> ---
>  arch/x86/kvm/vmx/nested.c | 32 ++++++++++++++++++++++++++++++++
>  arch/x86/kvm/vmx/vmcs12.c |  2 ++
>  arch/x86/kvm/vmx/vmcs12.h |  6 +++++-
>  arch/x86/kvm/vmx/vmx.c    | 10 ++++++++++
>  arch/x86/kvm/vmx/vmx.h    |  1 +
>  5 files changed, 50 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index df2c2e733549..1f9823d21ecd 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -647,6 +647,12 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
>                                         MSR_IA32_PRED_CMD,
>                                         MSR_TYPE_W);
>
> +       if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PKRS))
> +               nested_vmx_disable_intercept_for_msr(
> +                                       msr_bitmap_l1, msr_bitmap_l0,
> +                                       MSR_IA32_PKRS,
> +                                       MSR_TYPE_R | MSR_TYPE_W);

What if L1 intercepts only *reads* of MSR_IA32_PKRS?

>         kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
>
>         return true;

> @@ -2509,6 +2519,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
>         if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
>             !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
>                 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
> +
> +       if (kvm_cpu_cap_has(X86_FEATURE_PKS) &&

Is the above check superfluous? I would assume that the L1 guest can't
set VM_ENTRY_LOAD_IA32_PKRS unless this is true.

> +           (!vmx->nested.nested_run_pending ||
> +            !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS)))
> +               vmcs_write64(GUEST_IA32_PKRS, vmx->nested.vmcs01_guest_pkrs);

This doesn't seem right to me. On the target of a live migration, with
L2 active at the time the snapshot was taken (i.e.,
vmx->nested.nested_run_pending=0), it looks like we're going to try to
overwrite the current L2 PKRS value with L1's PKRS value (except that
in this situation, vmx->nested.vmcs01_guest_pkrs should actually be
0). Am I missing something?

>         vmx_set_rflags(vcpu, vmcs12->guest_rflags);
>
>         /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the


> @@ -3916,6 +3943,8 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
>                 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
>         if (kvm_mpx_supported())
>                 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
> +       if (kvm_cpu_cap_has(X86_FEATURE_PKS))

Shouldn't we be checking to see if the *virtual* CPU supports PKS
before writing anything into vmcs12->guest_ia32_pkrs?

> +               vmcs12->guest_ia32_pkrs = vmcs_read64(GUEST_IA32_PKRS);
>
>         vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
>  }
Sean Christopherson Aug. 12, 2020, 3 p.m. UTC | #2
On Mon, Aug 10, 2020 at 05:05:36PM -0700, Jim Mattson wrote:
> On Fri, Aug 7, 2020 at 1:47 AM Chenyi Qiang <chenyi.qiang@intel.com> wrote:
> >
> > PKS MSR passes through guest directly. Configure the MSR to match the
> > L0/L1 settings so that nested VM runs PKS properly.
> >
> > Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com>
> > ---
> >  arch/x86/kvm/vmx/nested.c | 32 ++++++++++++++++++++++++++++++++
> >  arch/x86/kvm/vmx/vmcs12.c |  2 ++
> >  arch/x86/kvm/vmx/vmcs12.h |  6 +++++-
> >  arch/x86/kvm/vmx/vmx.c    | 10 ++++++++++
> >  arch/x86/kvm/vmx/vmx.h    |  1 +
> >  5 files changed, 50 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> > index df2c2e733549..1f9823d21ecd 100644
> > --- a/arch/x86/kvm/vmx/nested.c
> > +++ b/arch/x86/kvm/vmx/nested.c
> > @@ -647,6 +647,12 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
> >                                         MSR_IA32_PRED_CMD,
> >                                         MSR_TYPE_W);
> >
> > +       if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PKRS))
> > +               nested_vmx_disable_intercept_for_msr(
> > +                                       msr_bitmap_l1, msr_bitmap_l0,
> > +                                       MSR_IA32_PKRS,
> > +                                       MSR_TYPE_R | MSR_TYPE_W);
> 
> What if L1 intercepts only *reads* of MSR_IA32_PKRS?

nested_vmx_disable_intercept_for_msr() handles merging L1's desires, the
(MSR_TYPE_R | MSR_TYPE_W) param is effectively L0's desire for L2.
Jim Mattson Aug. 12, 2020, 6:32 p.m. UTC | #3
On Wed, Aug 12, 2020 at 8:00 AM Sean Christopherson
<sean.j.christopherson@intel.com> wrote:
>
> On Mon, Aug 10, 2020 at 05:05:36PM -0700, Jim Mattson wrote:
> > On Fri, Aug 7, 2020 at 1:47 AM Chenyi Qiang <chenyi.qiang@intel.com> wrote:
> > >
> > > PKS MSR passes through guest directly. Configure the MSR to match the
> > > L0/L1 settings so that nested VM runs PKS properly.
> > >
> > > Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com>
> > > ---
> > >  arch/x86/kvm/vmx/nested.c | 32 ++++++++++++++++++++++++++++++++
> > >  arch/x86/kvm/vmx/vmcs12.c |  2 ++
> > >  arch/x86/kvm/vmx/vmcs12.h |  6 +++++-
> > >  arch/x86/kvm/vmx/vmx.c    | 10 ++++++++++
> > >  arch/x86/kvm/vmx/vmx.h    |  1 +
> > >  5 files changed, 50 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> > > index df2c2e733549..1f9823d21ecd 100644
> > > --- a/arch/x86/kvm/vmx/nested.c
> > > +++ b/arch/x86/kvm/vmx/nested.c
> > > @@ -647,6 +647,12 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
> > >                                         MSR_IA32_PRED_CMD,
> > >                                         MSR_TYPE_W);
> > >
> > > +       if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PKRS))
> > > +               nested_vmx_disable_intercept_for_msr(
> > > +                                       msr_bitmap_l1, msr_bitmap_l0,
> > > +                                       MSR_IA32_PKRS,
> > > +                                       MSR_TYPE_R | MSR_TYPE_W);
> >
> > What if L1 intercepts only *reads* of MSR_IA32_PKRS?
>
> nested_vmx_disable_intercept_for_msr() handles merging L1's desires, the
> (MSR_TYPE_R | MSR_TYPE_W) param is effectively L0's desire for L2.

I should know better than to assume that a function in kvm actually
does anything like what its name implies, but I never seem to learn.
:-(

Thanks!
Chenyi Qiang Aug. 13, 2020, 4:52 a.m. UTC | #4
On 8/11/2020 8:05 AM, Jim Mattson wrote:
> On Fri, Aug 7, 2020 at 1:47 AM Chenyi Qiang <chenyi.qiang@intel.com> wrote:
>>
>> PKS MSR passes through guest directly. Configure the MSR to match the
>> L0/L1 settings so that nested VM runs PKS properly.
>>
>> Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com>
>> ---
>>   arch/x86/kvm/vmx/nested.c | 32 ++++++++++++++++++++++++++++++++
>>   arch/x86/kvm/vmx/vmcs12.c |  2 ++
>>   arch/x86/kvm/vmx/vmcs12.h |  6 +++++-
>>   arch/x86/kvm/vmx/vmx.c    | 10 ++++++++++
>>   arch/x86/kvm/vmx/vmx.h    |  1 +
>>   5 files changed, 50 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
>> index df2c2e733549..1f9823d21ecd 100644
>> --- a/arch/x86/kvm/vmx/nested.c
>> +++ b/arch/x86/kvm/vmx/nested.c
>> @@ -647,6 +647,12 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
>>                                          MSR_IA32_PRED_CMD,
>>                                          MSR_TYPE_W);
>>
>> +       if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PKRS))
>> +               nested_vmx_disable_intercept_for_msr(
>> +                                       msr_bitmap_l1, msr_bitmap_l0,
>> +                                       MSR_IA32_PKRS,
>> +                                       MSR_TYPE_R | MSR_TYPE_W);
> 
> What if L1 intercepts only *reads* of MSR_IA32_PKRS?
> 
>>          kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
>>
>>          return true;
> 
>> @@ -2509,6 +2519,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
>>          if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
>>              !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
>>                  vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
>> +
>> +       if (kvm_cpu_cap_has(X86_FEATURE_PKS) &&
> 
> Is the above check superfluous? I would assume that the L1 guest can't
> set VM_ENTRY_LOAD_IA32_PKRS unless this is true.
> 

I enforce this check to ensure vmcs_write to the Guest_IA32_PKRS without 
error. if deleted, vmcs_write to GUEST_IA32_PKRS may executed when PKS 
is unsupported.

>> +           (!vmx->nested.nested_run_pending ||
>> +            !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS)))
>> +               vmcs_write64(GUEST_IA32_PKRS, vmx->nested.vmcs01_guest_pkrs);
> 
> This doesn't seem right to me. On the target of a live migration, with
> L2 active at the time the snapshot was taken (i.e.,
> vmx->nested.nested_run_pending=0), it looks like we're going to try to
> overwrite the current L2 PKRS value with L1's PKRS value (except that
> in this situation, vmx->nested.vmcs01_guest_pkrs should actually be
> 0). Am I missing something?
> 

We overwrite the L2 PKRS with L1's value when L2 doesn't support PKS. 
Because the L1's VM_ENTRY_LOAD_IA32_PKRS is off, we need to migrate L1's 
PKRS to L2.

>>          vmx_set_rflags(vcpu, vmcs12->guest_rflags);
>>
>>          /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
> 
> 
>> @@ -3916,6 +3943,8 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
>>                  vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
>>          if (kvm_mpx_supported())
>>                  vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
>> +       if (kvm_cpu_cap_has(X86_FEATURE_PKS))
> 
> Shouldn't we be checking to see if the *virtual* CPU supports PKS
> before writing anything into vmcs12->guest_ia32_pkrs?
> 

Yes, It's reasonable.

>> +               vmcs12->guest_ia32_pkrs = vmcs_read64(GUEST_IA32_PKRS);
>>
>>          vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
>>   }
Jim Mattson Aug. 13, 2020, 5:52 p.m. UTC | #5
On Wed, Aug 12, 2020 at 9:54 PM Chenyi Qiang <chenyi.qiang@intel.com> wrote:
>
>
>
> On 8/11/2020 8:05 AM, Jim Mattson wrote:
> > On Fri, Aug 7, 2020 at 1:47 AM Chenyi Qiang <chenyi.qiang@intel.com> wrote:
> >>
> >> PKS MSR passes through guest directly. Configure the MSR to match the
> >> L0/L1 settings so that nested VM runs PKS properly.
> >>
> >> Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com>
> >> ---

> >> +           (!vmx->nested.nested_run_pending ||
> >> +            !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS)))
> >> +               vmcs_write64(GUEST_IA32_PKRS, vmx->nested.vmcs01_guest_pkrs);
> >
> > This doesn't seem right to me. On the target of a live migration, with
> > L2 active at the time the snapshot was taken (i.e.,
> > vmx->nested.nested_run_pending=0), it looks like we're going to try to
> > overwrite the current L2 PKRS value with L1's PKRS value (except that
> > in this situation, vmx->nested.vmcs01_guest_pkrs should actually be
> > 0). Am I missing something?
> >
>
> We overwrite the L2 PKRS with L1's value when L2 doesn't support PKS.
> Because the L1's VM_ENTRY_LOAD_IA32_PKRS is off, we need to migrate L1's
> PKRS to L2.

I'm thinking of the case where vmx->nested.nested_run_pending is
false, and we are processing a KVM_SET_NESTED_STATE ioctl, yet
VM_ENTRY_LOAD_IA32_PKRS *is* set in the vmcs12.
Chenyi Qiang Aug. 14, 2020, 10:07 a.m. UTC | #6
On 8/14/2020 1:52 AM, Jim Mattson wrote:
> On Wed, Aug 12, 2020 at 9:54 PM Chenyi Qiang <chenyi.qiang@intel.com> wrote:
>>
>>
>>
>> On 8/11/2020 8:05 AM, Jim Mattson wrote:
>>> On Fri, Aug 7, 2020 at 1:47 AM Chenyi Qiang <chenyi.qiang@intel.com> wrote:
>>>>
>>>> PKS MSR passes through guest directly. Configure the MSR to match the
>>>> L0/L1 settings so that nested VM runs PKS properly.
>>>>
>>>> Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com>
>>>> ---
> 
>>>> +           (!vmx->nested.nested_run_pending ||
>>>> +            !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS)))
>>>> +               vmcs_write64(GUEST_IA32_PKRS, vmx->nested.vmcs01_guest_pkrs);
>>>
>>> This doesn't seem right to me. On the target of a live migration, with
>>> L2 active at the time the snapshot was taken (i.e.,
>>> vmx->nested.nested_run_pending=0), it looks like we're going to try to
>>> overwrite the current L2 PKRS value with L1's PKRS value (except that
>>> in this situation, vmx->nested.vmcs01_guest_pkrs should actually be
>>> 0). Am I missing something?
>>>
>>
>> We overwrite the L2 PKRS with L1's value when L2 doesn't support PKS.
>> Because the L1's VM_ENTRY_LOAD_IA32_PKRS is off, we need to migrate L1's
>> PKRS to L2.
> 
> I'm thinking of the case where vmx->nested.nested_run_pending is
> false, and we are processing a KVM_SET_NESTED_STATE ioctl, yet
> VM_ENTRY_LOAD_IA32_PKRS *is* set in the vmcs12.
> 

Oh, I miss this case. What I'm still confused here is that the 
restoration for GUEST_IA32_DEBUGCTL and GUEST_BNDCFGS have the same 
issue, right? or I miss something.
Jim Mattson Aug. 14, 2020, 5:34 p.m. UTC | #7
On Fri, Aug 14, 2020 at 3:09 AM Chenyi Qiang <chenyi.qiang@intel.com> wrote:
>
>
>
> On 8/14/2020 1:52 AM, Jim Mattson wrote:
> > On Wed, Aug 12, 2020 at 9:54 PM Chenyi Qiang <chenyi.qiang@intel.com> wrote:
> >>
> >>
> >>
> >> On 8/11/2020 8:05 AM, Jim Mattson wrote:
> >>> On Fri, Aug 7, 2020 at 1:47 AM Chenyi Qiang <chenyi.qiang@intel.com> wrote:
> >>>>
> >>>> PKS MSR passes through guest directly. Configure the MSR to match the
> >>>> L0/L1 settings so that nested VM runs PKS properly.
> >>>>
> >>>> Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com>
> >>>> ---
> >
> >>>> +           (!vmx->nested.nested_run_pending ||
> >>>> +            !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS)))
> >>>> +               vmcs_write64(GUEST_IA32_PKRS, vmx->nested.vmcs01_guest_pkrs);
> >>>
> >>> This doesn't seem right to me. On the target of a live migration, with
> >>> L2 active at the time the snapshot was taken (i.e.,
> >>> vmx->nested.nested_run_pending=0), it looks like we're going to try to
> >>> overwrite the current L2 PKRS value with L1's PKRS value (except that
> >>> in this situation, vmx->nested.vmcs01_guest_pkrs should actually be
> >>> 0). Am I missing something?
> >>>
> >>
> >> We overwrite the L2 PKRS with L1's value when L2 doesn't support PKS.
> >> Because the L1's VM_ENTRY_LOAD_IA32_PKRS is off, we need to migrate L1's
> >> PKRS to L2.
> >
> > I'm thinking of the case where vmx->nested.nested_run_pending is
> > false, and we are processing a KVM_SET_NESTED_STATE ioctl, yet
> > VM_ENTRY_LOAD_IA32_PKRS *is* set in the vmcs12.
> >
>
> Oh, I miss this case. What I'm still confused here is that the
> restoration for GUEST_IA32_DEBUGCTL and GUEST_BNDCFGS have the same
> issue, right? or I miss something.

I take it back. This does work, assuming that userspace calls
KVM_SET_MSRS before calling KVM_SET_NESTED_STATE. Assuming L2 is
active when the checkpoint is taken, the MSR values saved will be the
L2 values. When restoring the MSRs with KVM_SET_MSRS, the L2 MSR
values will be written into vmcs01. They don't belong there, but we're
never going to launch vmcs01 with those MSR values. Instead, when
userspace calls KVM_SET_NESTED_STATE, those values will be transferred
first to the vmcs01_<msr> fields of the vmx->nested struct, and then
to vmcs02.

This is subtle, and I don't think it's documented anywhere that
KVM_SET_NESTED_STATE must be called after KVM_SET_MSRS. In fact, there
are probably a number of dependencies among the various KVM_SET_*
functions that aren't documented anywhere.
diff mbox series

Patch

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index df2c2e733549..1f9823d21ecd 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -647,6 +647,12 @@  static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 					MSR_IA32_PRED_CMD,
 					MSR_TYPE_W);
 
+	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PKRS))
+		nested_vmx_disable_intercept_for_msr(
+					msr_bitmap_l1, msr_bitmap_l0,
+					MSR_IA32_PKRS,
+					MSR_TYPE_R | MSR_TYPE_W);
+
 	kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
 
 	return true;
@@ -2427,6 +2433,10 @@  static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
 		if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
 		    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
 			vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+
+		if (vmx->nested.nested_run_pending &&
+		    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS))
+			vmcs_write64(GUEST_IA32_PKRS, vmcs12->guest_ia32_pkrs);
 	}
 
 	if (nested_cpu_has_xsaves(vmcs12))
@@ -2509,6 +2519,11 @@  static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
 	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
 		vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
+
+	if (kvm_cpu_cap_has(X86_FEATURE_PKS) &&
+	    (!vmx->nested.nested_run_pending ||
+	     !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS)))
+		vmcs_write64(GUEST_IA32_PKRS, vmx->nested.vmcs01_guest_pkrs);
 	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
 
 	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
@@ -2849,6 +2864,10 @@  static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
 					   vmcs12->host_ia32_perf_global_ctrl)))
 		return -EINVAL;
 
+	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PKRS) &&
+	    CC(!kvm_pkrs_valid(vmcs12->host_ia32_pkrs)))
+		return -EINVAL;
+
 #ifdef CONFIG_X86_64
 	ia32e = !!(vcpu->arch.efer & EFER_LMA);
 #else
@@ -2998,6 +3017,10 @@  static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
 	if (nested_check_guest_non_reg_state(vmcs12))
 		return -EINVAL;
 
+	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS) &&
+	    CC(!kvm_pkrs_valid(vmcs12->guest_ia32_pkrs)))
+		return -EINVAL;
+
 	return 0;
 }
 
@@ -3271,6 +3294,9 @@  enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 	if (kvm_mpx_supported() &&
 		!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
 		vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+	if (kvm_cpu_cap_has(X86_FEATURE_PKS) &&
+	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS))
+		vmx->nested.vmcs01_guest_pkrs = vmcs_read64(GUEST_IA32_PKRS);
 
 	/*
 	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
@@ -3865,6 +3891,7 @@  static bool is_vmcs12_ext_field(unsigned long field)
 	case GUEST_IDTR_BASE:
 	case GUEST_PENDING_DBG_EXCEPTIONS:
 	case GUEST_BNDCFGS:
+	case GUEST_IA32_PKRS:
 		return true;
 	default:
 		break;
@@ -3916,6 +3943,8 @@  static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
 		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
 	if (kvm_mpx_supported())
 		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+	if (kvm_cpu_cap_has(X86_FEATURE_PKS))
+		vmcs12->guest_ia32_pkrs = vmcs_read64(GUEST_IA32_PKRS);
 
 	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
 }
@@ -4151,6 +4180,9 @@  static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 		WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
 					 vmcs12->host_ia32_perf_global_ctrl));
 
+	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PKRS)
+		vmcs_write64(GUEST_IA32_PKRS, vmcs12->host_ia32_pkrs);
+
 	/* Set L1 segment info according to Intel SDM
 	    27.5.2 Loading Host Segment and Descriptor-Table Registers */
 	seg = (struct kvm_segment) {
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index c8e51c004f78..df7b2143b807 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -61,9 +61,11 @@  const unsigned short vmcs_field_to_offset_table[] = {
 	FIELD64(GUEST_PDPTR2, guest_pdptr2),
 	FIELD64(GUEST_PDPTR3, guest_pdptr3),
 	FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
+	FIELD64(GUEST_IA32_PKRS, guest_ia32_pkrs),
 	FIELD64(HOST_IA32_PAT, host_ia32_pat),
 	FIELD64(HOST_IA32_EFER, host_ia32_efer),
 	FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
+	FIELD64(HOST_IA32_PKRS, host_ia32_pkrs),
 	FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
 	FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
 	FIELD(EXCEPTION_BITMAP, exception_bitmap),
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 80232daf00ff..009b4c317375 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -69,7 +69,9 @@  struct __packed vmcs12 {
 	u64 vm_function_control;
 	u64 eptp_list_address;
 	u64 pml_address;
-	u64 padding64[3]; /* room for future expansion */
+	u64 guest_ia32_pkrs;
+	u64 host_ia32_pkrs;
+	u64 padding64[1]; /* room for future expansion */
 	/*
 	 * To allow migration of L1 (complete with its L2 guests) between
 	 * machines of different natural widths (32 or 64 bit), we cannot have
@@ -256,6 +258,8 @@  static inline void vmx_check_vmcs12_offsets(void)
 	CHECK_OFFSET(vm_function_control, 296);
 	CHECK_OFFSET(eptp_list_address, 304);
 	CHECK_OFFSET(pml_address, 312);
+	CHECK_OFFSET(guest_ia32_pkrs, 320);
+	CHECK_OFFSET(host_ia32_pkrs, 328);
 	CHECK_OFFSET(cr0_guest_host_mask, 344);
 	CHECK_OFFSET(cr4_guest_host_mask, 352);
 	CHECK_OFFSET(cr0_read_shadow, 360);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 5cdf4d3848fb..3c3fb554c3fc 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7178,6 +7178,7 @@  static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
 	cr4_fixed1_update(X86_CR4_PKE,        ecx, feature_bit(PKU));
 	cr4_fixed1_update(X86_CR4_UMIP,       ecx, feature_bit(UMIP));
 	cr4_fixed1_update(X86_CR4_LA57,       ecx, feature_bit(LA57));
+	cr4_fixed1_update(X86_CR4_PKS,        ecx, feature_bit(PKS));
 
 #undef cr4_fixed1_update
 }
@@ -7197,6 +7198,15 @@  static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
 			vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
 		}
 	}
+
+	if (kvm_cpu_cap_has(X86_FEATURE_PKS) &&
+	    guest_cpuid_has(vcpu, X86_FEATURE_PKS)) {
+		vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_IA32_PKRS;
+		vmx->nested.msrs.exit_ctls_high |= VM_EXIT_LOAD_IA32_PKRS;
+	} else {
+		vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_IA32_PKRS;
+		vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_LOAD_IA32_PKRS;
+	}
 }
 
 static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 639798e4a6ca..2819d3e030b9 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -176,6 +176,7 @@  struct nested_vmx {
 	/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
 	u64 vmcs01_debugctl;
 	u64 vmcs01_guest_bndcfgs;
+	u64 vmcs01_guest_pkrs;
 
 	/* to migrate it to L1 if L2 writes to L1's CR8 directly */
 	int l1_tpr_threshold;