[v4,2/5] KVM: X86: Expose PKS to guest

Message ID 20210205083706.14146-3-chenyi.qiang@intel.com (mailing list archive)
State New, archived
Series KVM: PKS Virtualization support

Commit Message

Chenyi Qiang Feb. 5, 2021, 8:37 a.m. UTC
Protection Keys for Supervisor Pages (PKS) extends the Protection Keys
architecture to support thread-specific permission restrictions on
supervisor pages, complementing the existing PKU feature, which covers
user-mode pages.

PKS uses the IA32_PKRS MSR (PKRS) at index 0x6E1 to allow software to
manage supervisor key rights. For performance reasons, PKRS interception
is disabled when guest CR4.PKS=1 so that the guest can access PKRS
without VM exits.

PKS introduces dedicated VMCS control fields to switch PKRS: VM entry
loads the guest value and VM exit loads the host value, i.e. the
controls only handle the restore side. In addition, every VM exit saves
the guest PKRS into the guest-state area of the VMCS, but the host value
is never saved automatically, on the expectation that the host won't
change the MSR often. The host's value in the VMCS is therefore updated
manually if the MSR has been changed by the kernel since the last time
the VMCS was run.
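
Concretely, the manual host-side update (condensed from the
vmx_prepare_switch_to_guest() hunk in this patch) looks like:

	if (vm_exit_controls_get(vmx) & VM_EXIT_LOAD_IA32_PKRS) {
		u32 host_pkrs = get_current_pkrs();

		/*
		 * Refresh vmcs.HOST_IA32_PKRS only when the kernel's
		 * per-CPU PKRS value changed since the last VM entry.
		 */
		if (unlikely(host_pkrs != host_state->pkrs)) {
			vmcs_write64(HOST_IA32_PKRS, host_pkrs);
			host_state->pkrs = host_pkrs;
		}
	}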

Existence of PKS is enumerated via CPUID.(EAX=7H,ECX=0):ECX[31]. It is
enabled by setting CR4.PKS when long mode is active. In KVM, PKS is
exposed only when EPT is enabled, and it currently requires support for
VM_{ENTRY,EXIT}_LOAD_IA32_PKRS.
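
For illustration, a guest kernel with bare-metal PKS support would turn
the feature on roughly as follows (a hypothetical sketch, not part of
this patch):

	/* CPUID.(EAX=7,ECX=0):ECX[31] populates X86_FEATURE_PKS. */
	if (boot_cpu_has(X86_FEATURE_PKS))
		cr4_set_bits(X86_CR4_PKS);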

The function get_current_pkrs() in arch/x86/mm/pkeys.c exposes the
per-cpu variable pkrs_cache to avoid frequent RDMSRs of PKRS.

Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com>
---
 arch/x86/include/asm/kvm_host.h |  4 +-
 arch/x86/include/asm/pkeys.h    |  1 +
 arch/x86/kvm/cpuid.c            |  3 +-
 arch/x86/kvm/vmx/capabilities.h |  6 +++
 arch/x86/kvm/vmx/nested.c       |  1 +
 arch/x86/kvm/vmx/vmcs.h         |  1 +
 arch/x86/kvm/vmx/vmx.c          | 89 ++++++++++++++++++++++++++++++---
 arch/x86/kvm/x86.c              |  8 ++-
 arch/x86/kvm/x86.h              |  8 +++
 arch/x86/mm/pkeys.c             |  6 +++
 include/linux/pkeys.h           |  4 ++
 11 files changed, 122 insertions(+), 9 deletions(-)

Comments

Paolo Bonzini Feb. 5, 2021, 9:23 a.m. UTC | #1
On 05/02/21 09:37, Chenyi Qiang wrote:

> +	/*
> +	 * PKS is not yet implemented for shadow paging.
> +	 * If VM_{ENTRY, EXIT}_LOAD_IA32_PKRS is not supported,
> +	 * don't expose PKS either.
> +	 */
> +	if (enable_ept && cpu_has_load_ia32_pkrs())
> +		kvm_cpu_cap_check_and_set(X86_FEATURE_PKS);

This piece should be a separate patch, added after patch 4.

Paolo
Paolo Bonzini Feb. 5, 2021, 9:25 a.m. UTC | #2
On 05/02/21 09:37, Chenyi Qiang wrote:
> 
> diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
> index 57718716cc70..8027f854c600 100644
> --- a/arch/x86/mm/pkeys.c
> +++ b/arch/x86/mm/pkeys.c
> @@ -390,3 +390,9 @@ void pks_key_free(int pkey)
>  	__clear_bit(pkey, &pks_key_allocation_map);
>  }
>  EXPORT_SYMBOL_GPL(pks_key_free);
> +
> +u32 get_current_pkrs(void)
> +{
> +	return this_cpu_read(pkrs_cache);
> +}
> +EXPORT_SYMBOL_GPL(get_current_pkrs);
> diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h
> index bed0e293f13b..480429020f4c 100644
> --- a/include/linux/pkeys.h
> +++ b/include/linux/pkeys.h
> @@ -72,6 +72,10 @@ static inline void pks_mk_readwrite(int pkey)
>  {
>  	pr_err("%s is not valid without PKS support\n", __func__);
>  }
> +static inline u32 get_current_pkrs(void)
> +{
> +	return 0;
> +}
>  #endif /* ! CONFIG_ARCH_HAS_SUPERVISOR_PKEYS */
>  

This would need an ack from the x86 people.  Andy, Boris?

Paolo
Borislav Petkov Feb. 5, 2021, 9:56 a.m. UTC | #3
On Fri, Feb 05, 2021 at 10:25:48AM +0100, Paolo Bonzini wrote:
> On 05/02/21 09:37, Chenyi Qiang wrote:
> > 
> > diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
> > index 57718716cc70..8027f854c600 100644
> > --- a/arch/x86/mm/pkeys.c
> > +++ b/arch/x86/mm/pkeys.c
> > @@ -390,3 +390,9 @@ void pks_key_free(int pkey)
> >  	__clear_bit(pkey, &pks_key_allocation_map);
> >  }
> >  EXPORT_SYMBOL_GPL(pks_key_free);
> > +
> > +u32 get_current_pkrs(void)
> > +{
> > +	return this_cpu_read(pkrs_cache);
> > +}
> > +EXPORT_SYMBOL_GPL(get_current_pkrs);
> > diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h
> > index bed0e293f13b..480429020f4c 100644
> > --- a/include/linux/pkeys.h
> > +++ b/include/linux/pkeys.h
> > @@ -72,6 +72,10 @@ static inline void pks_mk_readwrite(int pkey)
> >  {
> >  	pr_err("%s is not valid without PKS support\n", __func__);
> >  }
> > +static inline u32 get_current_pkrs(void)
> > +{
> > +	return 0;
> > +}
> >  #endif /* ! CONFIG_ARCH_HAS_SUPERVISOR_PKEYS */
> 
> This would need an ack from the x86 people.  Andy, Boris?

This looks like the PKS baremetal pile needs to be upstream first.
Paolo Bonzini Feb. 5, 2021, 10:10 a.m. UTC | #4
On 05/02/21 10:56, Borislav Petkov wrote:
> On Fri, Feb 05, 2021 at 10:25:48AM +0100, Paolo Bonzini wrote:
>> On 05/02/21 09:37, Chenyi Qiang wrote:
>>>
>>> diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
>>> index 57718716cc70..8027f854c600 100644
>>> --- a/arch/x86/mm/pkeys.c
>>> +++ b/arch/x86/mm/pkeys.c
>>> @@ -390,3 +390,9 @@ void pks_key_free(int pkey)
>>>   	__clear_bit(pkey, &pks_key_allocation_map);
>>>   }
>>>   EXPORT_SYMBOL_GPL(pks_key_free);
>>> +
>>> +u32 get_current_pkrs(void)
>>> +{
>>> +	return this_cpu_read(pkrs_cache);
>>> +}
>>> +EXPORT_SYMBOL_GPL(get_current_pkrs);
>>> diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h
>>> index bed0e293f13b..480429020f4c 100644
>>> --- a/include/linux/pkeys.h
>>> +++ b/include/linux/pkeys.h
>>> @@ -72,6 +72,10 @@ static inline void pks_mk_readwrite(int pkey)
>>>   {
>>>   	pr_err("%s is not valid without PKS support\n", __func__);
>>>   }
>>> +static inline u32 get_current_pkrs(void)
>>> +{
>>> +	return 0;
>>> +}
>>>   #endif /* ! CONFIG_ARCH_HAS_SUPERVISOR_PKEYS */
>>
>> This would need an ack from the x86 people.  Andy, Boris?
> 
> This looks like the PKS baremetal pile needs to be upstream first.

Yes, it does.  I would like to have an ack for including the above two 
hunks once PKS is upstream.

I also have CET and bus lock #DB queued and waiting for the bare metal 
functionality, however they do not touch anything outside arch/x86/kvm.

Paolo
Thomas Gleixner Feb. 5, 2021, 11:29 a.m. UTC | #5
On Fri, Feb 05 2021 at 11:10, Paolo Bonzini wrote:
> On 05/02/21 10:56, Borislav Petkov wrote:
>>> This would need an ack from the x86 people.  Andy, Boris?
>> 
>> This looks like the PKS baremetal pile needs to be upstream first.
>
> Yes, it does.  I would like to have an ack for including the above two 
> hunks once PKS is upstream.
>
> I also have CET and bus lock #DB queued and waiting for the bare metal 
> functionality, however they do not touch anything outside arch/x86/kvm.

What's the exact point of queueing random stuff which lacks bare metal
support?

Once PKS, CET or whatever is merged into tip then it's the point for
resending the KVM patches for inclusion and that's the point where it
gets acked and not $N months ahead when everything is still in flux.

Thanks,

        tglx
Paolo Bonzini Feb. 5, 2021, 11:51 a.m. UTC | #6
On 05/02/21 12:29, Thomas Gleixner wrote:
> On Fri, Feb 05 2021 at 11:10, Paolo Bonzini wrote:
>> On 05/02/21 10:56, Borislav Petkov wrote:
>>>> This would need an ack from the x86 people.  Andy, Boris?
>>>
>>> This looks like the PKS baremetal pile needs to be upstream first.
>>
>> Yes, it does.  I would like to have an ack for including the above two
>> hunks once PKS is upstream.
>>
>> I also have CET and bus lock #DB queued and waiting for the bare metal
>> functionality, however they do not touch anything outside arch/x86/kvm.
> 
> What's the exact point of queueing random stuff which lacks bare metal
> support?

The code is often completely independent of bare metal support even if 
it depends on it (CET and bus lock for example only share the #defines; 
for PKS this is not the case just because Intel decided not to use 
XSAVES *shrug*).

I prefer to queue early, because it keeps my backlog small and because 
every resend comes with the risk of random changes sneaking in since the 
version that I reviewed.  An early ack would also mean that I don't have 
to bug you in the middle of the merge window.  But it's not a problem, 
I'll ask for acks again once PKS is merged into tip.

Thanks,

Paolo

> Once PKS, CET or whatever is merged into tip then it's the point for
> resending the KVM patches for inclusion and that's the point where it
> gets acked and not $N months ahead when everything is still in flux.
Sean Christopherson July 29, 2021, 4:44 p.m. UTC | #7
On Fri, Feb 05, 2021, Chenyi Qiang wrote:
> @@ -539,6 +540,7 @@ struct kvm_vcpu_arch {
>  	unsigned long cr8;
>  	u32 host_pkru;
>  	u32 pkru;
> +	u32 pkrs;
>  	u32 hflags;
>  	u64 efer;
>  	u64 apic_base;

...

> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 89af692deb7e..2266d98ace6f 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -250,6 +250,7 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
>  	dest->ds_sel = src->ds_sel;
>  	dest->es_sel = src->es_sel;
>  #endif
> +	dest->pkrs = src->pkrs;

This is wrong.  It also arguably belongs in patch 04.

The part that's missing is actually updating vmcs.HOST_IA32_PKRS.  FS/GS are
handled by vmx_set_host_fs_gs(), while LDT/DS/ES are oddballs where the selector
is also the state that's restored.

In other words, this will cause nested VM-Exit to load the wrong PKRS.
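
Something like the below is needed (a sketch, assuming the sync runs
while dest's VMCS is current, as it is for vmx_set_host_fs_gs()):

	dest->pkrs = src->pkrs;
	/* Propagate the synced value into the now-current VMCS. */
	if (vm_exit_controls_get(vmx) & VM_EXIT_LOAD_IA32_PKRS)
		vmcs_write64(HOST_IA32_PKRS, dest->pkrs);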

>  }
>  
>  static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
> diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
> index 1472c6c376f7..b5ba6407c5e1 100644
> --- a/arch/x86/kvm/vmx/vmcs.h
> +++ b/arch/x86/kvm/vmx/vmcs.h
> @@ -40,6 +40,7 @@ struct vmcs_host_state {
>  #ifdef CONFIG_X86_64
>  	u16           ds_sel, es_sel;
>  #endif
> +	u32           pkrs;
>  };
>  
>  struct vmcs_controls_shadow {
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 47b8357b9751..a3d95492e2b7 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -363,6 +363,8 @@ module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
>  static u32 vmx_segment_access_rights(struct kvm_segment *var);
>  static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
>  							  u32 msr, int type);
> +static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
> +							 u32 msr, int type);
>  
>  void vmx_vmexit(void);
>  
> @@ -660,6 +662,8 @@ static bool is_valid_passthrough_msr(u32 msr)
>  	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
>  		/* PT MSRs. These are handled in pt_update_intercept_for_msr() */
>  		return true;
> +	case MSR_IA32_PKRS:
> +		return true;

This is wrong with respect to MSR filtering, as KVM will fail to intercept the
MSR in response to a filter change.  See vmx_msr_filter_changed().  I also think
that special casing PKRS is a bad idea in general.  More later.
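
One way to avoid the special case (a sketch, not the posted fix) is to
list PKRS in vmx_possible_passthrough_msrs[], so that the common loop in
vmx_msr_filter_changed() re-evaluates its interception on every filter
change:

	static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
		...
		MSR_IA32_PKRS,	/* MAX_POSSIBLE_PASSTHROUGH_MSRS grows by one */
	};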

>  	}
>  
>  	r = possible_passthrough_msr_slot(msr) != -ENOENT;

...

> +	case MSR_IA32_PKRS:
> +		if (!kvm_pkrs_valid(data))
> +			return 1;
> +		if (!kvm_cpu_cap_has(X86_FEATURE_PKS) ||
> +		    (!msr_info->host_initiated &&
> +		    !guest_cpuid_has(vcpu, X86_FEATURE_PKS)))
> +			return 1;
> +		if (vcpu->arch.pkrs != data) {

This will need to be modified if we go with my below proposal.

> +			vcpu->arch.pkrs = data;
> +			vmcs_write64(GUEST_IA32_PKRS, data);
> +		}
> +		break;
>  	case MSR_TSC_AUX:
>  		if (!msr_info->host_initiated &&
>  		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
> @@ -2479,7 +2518,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>  	      VM_EXIT_LOAD_IA32_EFER |
>  	      VM_EXIT_CLEAR_BNDCFGS |
>  	      VM_EXIT_PT_CONCEAL_PIP |
> -	      VM_EXIT_CLEAR_IA32_RTIT_CTL;
> +	      VM_EXIT_CLEAR_IA32_RTIT_CTL |
> +	      VM_EXIT_LOAD_IA32_PKRS;
>  	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
>  				&_vmexit_control) < 0)
>  		return -EIO;
> @@ -2503,7 +2543,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>  	      VM_ENTRY_LOAD_IA32_EFER |
>  	      VM_ENTRY_LOAD_BNDCFGS |
>  	      VM_ENTRY_PT_CONCEAL_PIP |
> -	      VM_ENTRY_LOAD_IA32_RTIT_CTL;
> +	      VM_ENTRY_LOAD_IA32_RTIT_CTL |
> +	      VM_ENTRY_LOAD_IA32_PKRS;
>  	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
>  				&_vmentry_control) < 0)
>  		return -EIO;
> @@ -3103,8 +3144,9 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
>  	 * is in force while we are in guest mode.  Do not let guests control
>  	 * this bit, even if host CR4.MCE == 0.
>  	 */
> -	unsigned long hw_cr4;
> +	unsigned long hw_cr4, old_cr4;
>  
> +	old_cr4 = kvm_read_cr4(vcpu);
>  	hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
>  	if (is_unrestricted_guest(vcpu))
>  		hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
> @@ -3152,7 +3194,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
>  		}
>  
>  		/*
> -		 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
> +		 * SMEP/SMAP/PKU/PKS is disabled if CPU is in non-paging mode in
>  		 * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
>  		 * to be manually disabled when guest switches to non-paging
>  		 * mode.
> @@ -3160,10 +3202,29 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
>  		 * If !enable_unrestricted_guest, the CPU is always running
>  		 * with CR0.PG=1 and CR4 needs to be modified.
>  		 * If enable_unrestricted_guest, the CPU automatically
> -		 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
> +		 * disables SMEP/SMAP/PKU/PKS when the guest sets CR0.PG=0.
>  		 */
>  		if (!is_paging(vcpu))
> -			hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
> +			hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE |
> +				    X86_CR4_PKS);
> +	}
> +
> +	if ((hw_cr4 ^ old_cr4) & X86_CR4_PKS) {

Comparing hw_cr4 and old_cr4 is wrong; they are representative of two different
contexts.  old_cr4 is the guest's previous CR4 irrespective of KVM manipulations,
whereas hw_cr4 does include KVM's modification, e.g. the is_paging() logic above
may clear CR4.PKS and may lead to incorrect behavior.

E.g.:

	guest.CR4.PKS = 1
	guest.CR0.PG  = 0
	guest_hw.CR4.PKS = 0  // KVM
	vmcs.LOAD_PKRS = 0    // KVM
	guest.CR0.PG  = 1
	guest_hw.CR4.PKS = 1  // KVM
	vmcs.LOAD_PKRS not modified because guest.CR4.PKS == guest_hw.CR4.PKS == 1

This logic also fails to handle the case where L1 doesn't intercept CR4.PKS
modifications for L2.

The VM-Exit path that does:

> +     if (static_cpu_has(X86_FEATURE_PKS) &&
> +         kvm_read_cr4_bits(vcpu, X86_CR4_PKS))
> +             vcpu->arch.pkrs = vmcs_read64(GUEST_IA32_PKRS);

is also flawed in that, per this scheme, it may unnecessarily read vmcs.GUEST_PKRS,
though I don't think it can get the wrong value, unless of course it's running L2...

In short, IMO toggling PKRS interception/load on CR4 changes is a really, really
bad idea.  UMIP emulation attempted to do fancy toggling and got it wrong multiple
times, and this is more complicated than what UMIP was trying to do.

The only motivation for toggling PKRS interception/load is to avoid the VMREAD in
the VM-Exit path.  Given that vcpu->arch.pkrs is rarely accessed by KVM, e.g. only
on host userspace MSR reads and GVA->GPA translation via permission_fault(),
keeping vcpu->arch.pkrs up-to-date at all times is unnecessary, i.e. it can be
synchronized on-demand.  And regarding permission_fault(), that's indirectly gated
by CR4.PKS=1, thus deferring the VMREAD to permission_fault() is guaranteed to be
more performant than reading on every VM-Exit (with CR4.PKS=1).

So:

  - Disable PKRS MSR interception if it's exposed to the guest.
  - Load PKRS on entry/exit if it's exposed to the guest.
  - Add VCPU_EXREG_PKRS and clear its bits in vmx_register_cache_reset().
  - Add helpers to read/write/cache PKRS and use accordingly.
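
A sketch of the read side under that scheme (VCPU_EXREG_PKRS is the new,
to-be-added register-cache slot; the kvm_register_*_available() helpers
already exist in kvm_cache_regs.h):

	static u32 vmx_read_guest_pkrs(struct kvm_vcpu *vcpu)
	{
		/*
		 * VMREAD only on first use after a VM-Exit; the cached
		 * value is invalidated in vmx_register_cache_reset().
		 */
		if (!kvm_register_is_available(vcpu, VCPU_EXREG_PKRS)) {
			vcpu->arch.pkrs = vmcs_read64(GUEST_IA32_PKRS);
			kvm_register_mark_available(vcpu, VCPU_EXREG_PKRS);
		}
		return vcpu->arch.pkrs;
	}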

> +		/* pass through PKRS to guest when CR4.PKS=1 */
> +		if (guest_cpuid_has(vcpu, X86_FEATURE_PKS) && hw_cr4 & X86_CR4_PKS) {
> +			vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PKRS, MSR_TYPE_RW);
> +			vm_entry_controls_setbit(vmx, VM_ENTRY_LOAD_IA32_PKRS);
> +			vm_exit_controls_setbit(vmx, VM_EXIT_LOAD_IA32_PKRS);
> +			/*
> +			 * Every VM exit saves the guest PKRS automatically; sync
> +			 * vcpu->arch.pkrs to VMCS.GUEST_PKRS to avoid pollution by the host PKRS.
> +			 */
> +			vmcs_write64(GUEST_IA32_PKRS, vcpu->arch.pkrs);
> +		} else {
> +			vmx_enable_intercept_for_msr(vcpu, MSR_IA32_PKRS, MSR_TYPE_RW);
> +			vm_entry_controls_clearbit(vmx, VM_ENTRY_LOAD_IA32_PKRS);
> +			vm_exit_controls_clearbit(vmx, VM_EXIT_LOAD_IA32_PKRS);
> +		}
>  	}
>  
>  	vmcs_writel(CR4_READ_SHADOW, cr4);

...

> @@ -6776,6 +6841,10 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
>  
>  	pt_guest_exit(vmx);
>  
> +	if (static_cpu_has(X86_FEATURE_PKS) &&
> +	    kvm_read_cr4_bits(vcpu, X86_CR4_PKS))
> +		vcpu->arch.pkrs = vmcs_read64(GUEST_IA32_PKRS);
> +
>  	kvm_load_host_xsave_state(vcpu);
>  
>  	vmx->nested.nested_run_pending = 0;
> @@ -7280,6 +7349,14 @@ static __init void vmx_set_cpu_caps(void)
>  	if (vmx_pt_mode_is_host_guest())
>  		kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
>  
> +	/*
> +	 * PKS is not yet implemented for shadow paging.
> +	 * If VM_{ENTRY, EXIT}_LOAD_IA32_PKRS is not supported,
> +	 * don't expose PKS either.
> +	 */
> +	if (enable_ept && cpu_has_load_ia32_pkrs())
> +		kvm_cpu_cap_check_and_set(X86_FEATURE_PKS);
> +
>  	if (vmx_umip_emulated())
>  		kvm_cpu_cap_set(X86_FEATURE_UMIP);
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index f5ede41bf9e6..684ef760481c 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1213,7 +1213,7 @@ static const u32 msrs_to_save_all[] = {
>  	MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
>  	MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
>  	MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
> -	MSR_IA32_UMWAIT_CONTROL,
> +	MSR_IA32_UMWAIT_CONTROL, MSR_IA32_PKRS,
>  
>  	MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
>  	MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
> @@ -5718,6 +5718,10 @@ static void kvm_init_msr_list(void)
>  				intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
>  				continue;
>  			break;
> +		case MSR_IA32_PKRS:
> +			if (!kvm_cpu_cap_has(X86_FEATURE_PKS))
> +				continue;
> +			break;
>  		case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
>  			if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
>  			    min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
> @@ -10026,6 +10030,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
>  
>  	vcpu->arch.ia32_xss = 0;
>  
> +	vcpu->arch.pkrs = 0;

This is wrong and also unreviewable.  It's wrong because the write isn't propagated
to vmcs.GUEST_PKRS, e.g. if the guest enables CR0.PG and CR4.PKS without writing
PKRS, KVM will run the guest with a stale value.

It's unreviewable because the SDM doesn't specify whether PKRS is cleared or
preserved on INIT.  The SDM needs an entry for PKRS in Table 9-1. "IA-32 and Intel
64 Processor States Following Power-up, Reset, or INIT" before this can be merged.
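
E.g. the reset path would need something like this (a sketch; placement
in the vendor reset hook is illustrative):

	vcpu->arch.pkrs = 0;
	if (kvm_cpu_cap_has(X86_FEATURE_PKS))
		vmcs_write64(GUEST_IA32_PKRS, 0);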

> +
>  	kvm_x86_ops.vcpu_reset(vcpu, init_event);
>  }
Chenyi Qiang Aug. 3, 2021, 8:50 a.m. UTC | #8
Thanks Sean for your review.

On 7/30/2021 12:44 AM, Sean Christopherson wrote:
> On Fri, Feb 05, 2021, Chenyi Qiang wrote:
>> @@ -539,6 +540,7 @@ struct kvm_vcpu_arch {
>>   	unsigned long cr8;
>>   	u32 host_pkru;
>>   	u32 pkru;
>> +	u32 pkrs;
>>   	u32 hflags;
>>   	u64 efer;
>>   	u64 apic_base;
> 
> ...
> 
>> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
>> index 89af692deb7e..2266d98ace6f 100644
>> --- a/arch/x86/kvm/vmx/nested.c
>> +++ b/arch/x86/kvm/vmx/nested.c
>> @@ -250,6 +250,7 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
>>   	dest->ds_sel = src->ds_sel;
>>   	dest->es_sel = src->es_sel;
>>   #endif
>> +	dest->pkrs = src->pkrs;
> 
> This is wrong.  It also arguably belongs in patch 04.
> 
> The part that's missing is actually updating vmcs.HOST_IA32_PKRS.  FS/GS are
> handled by vmx_set_host_fs_gs(), while LDT/DS/ES are oddballs where the selector
> is also the state that's restored.
> 

Will fix it. I guess it should belong in patch 05.

> In other words, this will cause nested VM-Exit to load the wrong PKRS.
> 
>>   }
>>   
>>   static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
>> diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
>> index 1472c6c376f7..b5ba6407c5e1 100644
>> --- a/arch/x86/kvm/vmx/vmcs.h
>> +++ b/arch/x86/kvm/vmx/vmcs.h
>> @@ -40,6 +40,7 @@ struct vmcs_host_state {
>>   #ifdef CONFIG_X86_64
>>   	u16           ds_sel, es_sel;
>>   #endif
>> +	u32           pkrs;
>>   };
>>   
>>   struct vmcs_controls_shadow {
>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>> index 47b8357b9751..a3d95492e2b7 100644
>> --- a/arch/x86/kvm/vmx/vmx.c
>> +++ b/arch/x86/kvm/vmx/vmx.c
>> @@ -363,6 +363,8 @@ module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
>>   static u32 vmx_segment_access_rights(struct kvm_segment *var);
>>   static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
>>   							  u32 msr, int type);
>> +static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
>> +							 u32 msr, int type);
>>   
>>   void vmx_vmexit(void);
>>   
>> @@ -660,6 +662,8 @@ static bool is_valid_passthrough_msr(u32 msr)
>>   	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
>>   		/* PT MSRs. These are handled in pt_update_intercept_for_msr() */
>>   		return true;
>> +	case MSR_IA32_PKRS:
>> +		return true;
> 
> This is wrong with respect to MSR filtering, as KVM will fail to intercept the
> MSR in response to a filter change.  See vmx_msr_filter_changed().  I also think
> that special casing PKRS is a bad idea in general.  More later.
> 

Yes, MSR filter support for PKRS was missing. Will add the support in
vmx_msr_filter_changed().

>>   	}
>>   
>>   	r = possible_passthrough_msr_slot(msr) != -ENOENT;
> 
> ...
> 
>> +	case MSR_IA32_PKRS:
>> +		if (!kvm_pkrs_valid(data))
>> +			return 1;
>> +		if (!kvm_cpu_cap_has(X86_FEATURE_PKS) ||
>> +		    (!msr_info->host_initiated &&
>> +		    !guest_cpuid_has(vcpu, X86_FEATURE_PKS)))
>> +			return 1;
>> +		if (vcpu->arch.pkrs != data) {
> 
> This will need to be modified if we go with my below proposal.
> 
>> +			vcpu->arch.pkrs = data;
>> +			vmcs_write64(GUEST_IA32_PKRS, data);
>> +		}
>> +		break;
>>   	case MSR_TSC_AUX:
>>   		if (!msr_info->host_initiated &&
>>   		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
>> @@ -2479,7 +2518,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>>   	      VM_EXIT_LOAD_IA32_EFER |
>>   	      VM_EXIT_CLEAR_BNDCFGS |
>>   	      VM_EXIT_PT_CONCEAL_PIP |
>> -	      VM_EXIT_CLEAR_IA32_RTIT_CTL;
>> +	      VM_EXIT_CLEAR_IA32_RTIT_CTL |
>> +	      VM_EXIT_LOAD_IA32_PKRS;
>>   	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
>>   				&_vmexit_control) < 0)
>>   		return -EIO;
>> @@ -2503,7 +2543,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>>   	      VM_ENTRY_LOAD_IA32_EFER |
>>   	      VM_ENTRY_LOAD_BNDCFGS |
>>   	      VM_ENTRY_PT_CONCEAL_PIP |
>> -	      VM_ENTRY_LOAD_IA32_RTIT_CTL;
>> +	      VM_ENTRY_LOAD_IA32_RTIT_CTL |
>> +	      VM_ENTRY_LOAD_IA32_PKRS;
>>   	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
>>   				&_vmentry_control) < 0)
>>   		return -EIO;
>> @@ -3103,8 +3144,9 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
>>   	 * is in force while we are in guest mode.  Do not let guests control
>>   	 * this bit, even if host CR4.MCE == 0.
>>   	 */
>> -	unsigned long hw_cr4;
>> +	unsigned long hw_cr4, old_cr4;
>>   
>> +	old_cr4 = kvm_read_cr4(vcpu);
>>   	hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
>>   	if (is_unrestricted_guest(vcpu))
>>   		hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
>> @@ -3152,7 +3194,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
>>   		}
>>   
>>   		/*
>> -		 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
>> +		 * SMEP/SMAP/PKU/PKS is disabled if CPU is in non-paging mode in
>>   		 * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
>>   		 * to be manually disabled when guest switches to non-paging
>>   		 * mode.
>> @@ -3160,10 +3202,29 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
>>   		 * If !enable_unrestricted_guest, the CPU is always running
>>   		 * with CR0.PG=1 and CR4 needs to be modified.
>>   		 * If enable_unrestricted_guest, the CPU automatically
>> -		 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
>> +		 * disables SMEP/SMAP/PKU/PKS when the guest sets CR0.PG=0.
>>   		 */
>>   		if (!is_paging(vcpu))
>> -			hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
>> +			hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE |
>> +				    X86_CR4_PKS);
>> +	}
>> +
>> +	if ((hw_cr4 ^ old_cr4) & X86_CR4_PKS) {
> 
> Comparing hw_cr4 and old_cr4 is wrong; they are representative of two different
> contexts.  old_cr4 is the guest's previous CR4 irrespective of KVM manipulations,
> whereas hw_cr4 does include KVM's modification, e.g. the is_paging() logic above
> may clear CR4.PKS and may lead to incorrect behavior.
> 
> E.g.:
> 
> 	guest.CR4.PKS = 1
> 	guest.CR0.PG  = 0
> 	guest_hw.CR4.PKS = 0  // KVM
> 	vmcs.LOAD_PKRS = 0    // KVM
> 	guest.CR0.PG  = 1
> 	guest_hw.CR4.PKS = 1  // KVM
> 	vmcs.LOAD_PKRS not modified because guest.CR4.PKS == guest_hw.CR4.PKS == 1
> 
> This logic also fails to handle the case where L1 doesn't intercept CR4.PKS
> modifications for L2.
> 
> The VM-Exit path that does:
> 
>> +     if (static_cpu_has(X86_FEATURE_PKS) &&
>> +         kvm_read_cr4_bits(vcpu, X86_CR4_PKS))
>> +             vcpu->arch.pkrs = vmcs_read64(GUEST_IA32_PKRS);
> 
> is also flawed in that, per this scheme, it may unnecessarily read vmcs.GUEST_PKRS,
> though I don't think it can get the wrong value, unless of course it's running L2...
> 
> In short, IMO toggling PKRS interception/load on CR4 changes is a really, really
> bad idea.  UMIP emulation attempted to do fancy toggling and got it wrong multiple
> times, and this is more complicated than what UMIP was trying to do.
> 
> The only motivation for toggling PKRS interception/load is to avoid the VMREAD in
> the VM-Exit path.  Given that vcpu->arch.pkrs is rarely accessed by KVM, e.g. only
> on host userspace MSR reads and GVA->GPA translation via permission_fault(),
> keeping vcpu->arch.pkrs up-to-date at all times is unnecessary, i.e. it can be
> synchronized on-demand.  And regarding permission_fault(), that's indirectly gated
> by CR4.PKS=1, thus deferring the VMREAD to permission_fault() is guaranteed to be
> more performant than reading on every VM-Exit (with CR4.PKS=1).
> 
> So:
> 
>    - Disable PKRS MSR interception if it's exposed to the guest.
>    - Load PKRS on entry/exit if it's exposed to the guest.
>    - Add VCPU_EXREG_PKRS and clear its bits in vmx_register_cache_reset().
>    - Add helpers to read/write/cache PKRS and use accordingly.
> 

Makes sense. Will refactor all of these in the next version.

>> +		/* pass through PKRS to guest when CR4.PKS=1 */
>> +		if (guest_cpuid_has(vcpu, X86_FEATURE_PKS) && hw_cr4 & X86_CR4_PKS) {
>> +			vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PKRS, MSR_TYPE_RW);
>> +			vm_entry_controls_setbit(vmx, VM_ENTRY_LOAD_IA32_PKRS);
>> +			vm_exit_controls_setbit(vmx, VM_EXIT_LOAD_IA32_PKRS);
>> +			/*
>> +			 * Every VM exit saves the guest PKRS automatically; sync
>> +			 * vcpu->arch.pkrs to VMCS.GUEST_PKRS to avoid pollution by the host PKRS.
>> +			 */
>> +			vmcs_write64(GUEST_IA32_PKRS, vcpu->arch.pkrs);
>> +		} else {
>> +			vmx_enable_intercept_for_msr(vcpu, MSR_IA32_PKRS, MSR_TYPE_RW);
>> +			vm_entry_controls_clearbit(vmx, VM_ENTRY_LOAD_IA32_PKRS);
>> +			vm_exit_controls_clearbit(vmx, VM_EXIT_LOAD_IA32_PKRS);
>> +		}
>>   	}
>>   
>>   	vmcs_writel(CR4_READ_SHADOW, cr4);
> 
> ...
> 
>> @@ -6776,6 +6841,10 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
>>   
>>   	pt_guest_exit(vmx);
>>   
>> +	if (static_cpu_has(X86_FEATURE_PKS) &&
>> +	    kvm_read_cr4_bits(vcpu, X86_CR4_PKS))
>> +		vcpu->arch.pkrs = vmcs_read64(GUEST_IA32_PKRS);
>> +
>>   	kvm_load_host_xsave_state(vcpu);
>>   
>>   	vmx->nested.nested_run_pending = 0;
>> @@ -7280,6 +7349,14 @@ static __init void vmx_set_cpu_caps(void)
>>   	if (vmx_pt_mode_is_host_guest())
>>   		kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
>>   
>> +	/*
>> +	 * PKS is not yet implemented for shadow paging.
>> +	 * If VM_{ENTRY, EXIT}_LOAD_IA32_PKRS is not supported,
>> +	 * don't expose PKS either.
>> +	 */
>> +	if (enable_ept && cpu_has_load_ia32_pkrs())
>> +		kvm_cpu_cap_check_and_set(X86_FEATURE_PKS);
>> +
>>   	if (vmx_umip_emulated())
>>   		kvm_cpu_cap_set(X86_FEATURE_UMIP);
>>   
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index f5ede41bf9e6..684ef760481c 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -1213,7 +1213,7 @@ static const u32 msrs_to_save_all[] = {
>>   	MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
>>   	MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
>>   	MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
>> -	MSR_IA32_UMWAIT_CONTROL,
>> +	MSR_IA32_UMWAIT_CONTROL, MSR_IA32_PKRS,
>>   
>>   	MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
>>   	MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
>> @@ -5718,6 +5718,10 @@ static void kvm_init_msr_list(void)
>>   				intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
>>   				continue;
>>   			break;
>> +		case MSR_IA32_PKRS:
>> +			if (!kvm_cpu_cap_has(X86_FEATURE_PKS))
>> +				continue;
>> +			break;
>>   		case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
>>   			if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
>>   			    min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
>> @@ -10026,6 +10030,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
>>   
>>   	vcpu->arch.ia32_xss = 0;
>>   
>> +	vcpu->arch.pkrs = 0;
> 
> This is wrong and also unreviewable.  It's wrong because the write isn't propagated
> to vmcs.GUEST_PKRS, e.g. if the guest enables CR0.PG and CR4.PKS without writing
> PKRS, KVM will run the guest with a stale value.
> 

Yes, it should write to the VMCS on reset.

> It's unreviewable because the SDM doesn't specify whether PKRS is cleared or
> preserved on INIT.  The SDM needs an entry for PKRS in Table 9-1. "IA-32 and Intel
> 64 Processor States Following Power-up, Reset, or INIT" before this can be merged.
> 

PKRS is missing in Table 9-1. It will be updated in the next version of
the SDM; sharing it now to help the current review:

"PKRS is cleared on INIT. It should be 0 in all cases."

>> +
>>   	kvm_x86_ops.vcpu_reset(vcpu, init_event);
>>   }

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d44858b69353..c8b149d9775a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -100,7 +100,8 @@ 
 			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
 			  | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
 			  | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \
-			  | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP))
+			  | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \
+			  | X86_CR4_PKS))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
 
@@ -539,6 +540,7 @@  struct kvm_vcpu_arch {
 	unsigned long cr8;
 	u32 host_pkru;
 	u32 pkru;
+	u32 pkrs;
 	u32 hflags;
 	u64 efer;
 	u64 apic_base;
diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index 990fe9c4787c..610c94b12dbf 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -142,6 +142,7 @@  u32 update_pkey_val(u32 pk_reg, int pkey, unsigned int flags);
 #ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
 __must_check int pks_key_alloc(const char *const pkey_user, int flags);
 void pks_key_free(int pkey);
+u32 get_current_pkrs(void);
 
 void pks_mk_noaccess(int pkey);
 void pks_mk_readonly(int pkey);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 06a278b3701d..4062b83091b9 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -390,7 +390,8 @@  void kvm_set_cpu_caps(void)
 		F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
 		F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
 		F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
-		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/
+		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
+		0 /*PKS*/
 	);
 	/* Set LA57 based on hardware capability. */
 	if (cpuid_ecx(7) & F(LA57))
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 3a1861403d73..1cadeaaf9985 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -103,6 +103,12 @@  static inline bool cpu_has_load_perf_global_ctrl(void)
 	       (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
 }
 
+static inline bool cpu_has_load_ia32_pkrs(void)
+{
+	return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PKRS) &&
+	       (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PKRS);
+}
+
 static inline bool cpu_has_vmx_mpx(void)
 {
 	return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 89af692deb7e..2266d98ace6f 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -250,6 +250,7 @@  static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
 	dest->ds_sel = src->ds_sel;
 	dest->es_sel = src->es_sel;
 #endif
+	dest->pkrs = src->pkrs;
 }
 
 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 1472c6c376f7..b5ba6407c5e1 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -40,6 +40,7 @@  struct vmcs_host_state {
 #ifdef CONFIG_X86_64
 	u16           ds_sel, es_sel;
 #endif
+	u32           pkrs;
 };
 
 struct vmcs_controls_shadow {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 47b8357b9751..a3d95492e2b7 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -363,6 +363,8 @@  module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
 static u32 vmx_segment_access_rights(struct kvm_segment *var);
 static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
 							  u32 msr, int type);
+static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
+							 u32 msr, int type);
 
 void vmx_vmexit(void);
 
@@ -660,6 +662,8 @@  static bool is_valid_passthrough_msr(u32 msr)
 	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
 		/* PT MSRs. These are handled in pt_update_intercept_for_msr() */
 		return true;
+	case MSR_IA32_PKRS:
+		return true;
 	}
 
 	r = possible_passthrough_msr_slot(msr) != -ENOENT;
@@ -1201,6 +1205,7 @@  void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
 #endif
 	unsigned long fs_base, gs_base;
 	u16 fs_sel, gs_sel;
+	u32 host_pkrs;
 	int i;
 
 	vmx->req_immediate_exit = false;
@@ -1233,6 +1238,21 @@  void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
 	 */
 	host_state->ldt_sel = kvm_read_ldt();
 
+	/*
+	 * Update the host PKRS VMCS field before the vCPU runs.
+	 * VM_EXIT_LOAD_IA32_PKRS being set guarantees that
+	 * kvm_cpu_cap_has(X86_FEATURE_PKS) &&
+	 * guest_cpuid_has(vcpu, X86_FEATURE_PKS) &&
+	 * CR4.PKS=1.
+	 */
+	if (vm_exit_controls_get(vmx) & VM_EXIT_LOAD_IA32_PKRS) {
+		host_pkrs = get_current_pkrs();
+		if (unlikely(host_pkrs != host_state->pkrs)) {
+			vmcs_write64(HOST_IA32_PKRS, host_pkrs);
+			host_state->pkrs = host_pkrs;
+		}
+	}
+
 #ifdef CONFIG_X86_64
 	savesegment(ds, host_state->ds_sel);
 	savesegment(es, host_state->es_sel);
@@ -1920,6 +1940,13 @@  static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		else
 			msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
 		break;
+	case MSR_IA32_PKRS:
+		if (!kvm_cpu_cap_has(X86_FEATURE_PKS) ||
+		    (!msr_info->host_initiated &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_PKS)))
+			return 1;
+		msr_info->data = vcpu->arch.pkrs;
+		break;
 	case MSR_TSC_AUX:
 		if (!msr_info->host_initiated &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
@@ -2189,6 +2216,18 @@  static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		else
 			vmx->pt_desc.guest.addr_a[index / 2] = data;
 		break;
+	case MSR_IA32_PKRS:
+		if (!kvm_pkrs_valid(data))
+			return 1;
+		if (!kvm_cpu_cap_has(X86_FEATURE_PKS) ||
+		    (!msr_info->host_initiated &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_PKS)))
+			return 1;
+		if (vcpu->arch.pkrs != data) {
+			vcpu->arch.pkrs = data;
+			vmcs_write64(GUEST_IA32_PKRS, data);
+		}
+		break;
 	case MSR_TSC_AUX:
 		if (!msr_info->host_initiated &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
@@ -2479,7 +2518,8 @@  static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 	      VM_EXIT_LOAD_IA32_EFER |
 	      VM_EXIT_CLEAR_BNDCFGS |
 	      VM_EXIT_PT_CONCEAL_PIP |
-	      VM_EXIT_CLEAR_IA32_RTIT_CTL;
+	      VM_EXIT_CLEAR_IA32_RTIT_CTL |
+	      VM_EXIT_LOAD_IA32_PKRS;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
 				&_vmexit_control) < 0)
 		return -EIO;
@@ -2503,7 +2543,8 @@  static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 	      VM_ENTRY_LOAD_IA32_EFER |
 	      VM_ENTRY_LOAD_BNDCFGS |
 	      VM_ENTRY_PT_CONCEAL_PIP |
-	      VM_ENTRY_LOAD_IA32_RTIT_CTL;
+	      VM_ENTRY_LOAD_IA32_RTIT_CTL |
+	      VM_ENTRY_LOAD_IA32_PKRS;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
 				&_vmentry_control) < 0)
 		return -EIO;
@@ -3103,8 +3144,9 @@  int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	 * is in force while we are in guest mode.  Do not let guests control
 	 * this bit, even if host CR4.MCE == 0.
 	 */
-	unsigned long hw_cr4;
+	unsigned long hw_cr4, old_cr4;
 
+	old_cr4 = kvm_read_cr4(vcpu);
 	hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
 	if (is_unrestricted_guest(vcpu))
 		hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
@@ -3152,7 +3194,7 @@  int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		}
 
 		/*
-		 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
+		 * SMEP/SMAP/PKU/PKS is disabled if CPU is in non-paging mode in
 		 * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
 		 * to be manually disabled when guest switches to non-paging
 		 * mode.
@@ -3160,10 +3202,29 @@  int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		 * If !enable_unrestricted_guest, the CPU is always running
 		 * with CR0.PG=1 and CR4 needs to be modified.
 		 * If enable_unrestricted_guest, the CPU automatically
-		 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
+		 * disables SMEP/SMAP/PKU/PKS when the guest sets CR0.PG=0.
 		 */
 		if (!is_paging(vcpu))
-			hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
+			hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE |
+				    X86_CR4_PKS);
+	}
+
+	if ((hw_cr4 ^ old_cr4) & X86_CR4_PKS) {
+		/* pass through PKRS to guest when CR4.PKS=1 */
+		if (guest_cpuid_has(vcpu, X86_FEATURE_PKS) && hw_cr4 & X86_CR4_PKS) {
+			vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PKRS, MSR_TYPE_RW);
+			vm_entry_controls_setbit(vmx, VM_ENTRY_LOAD_IA32_PKRS);
+			vm_exit_controls_setbit(vmx, VM_EXIT_LOAD_IA32_PKRS);
+			/*
+			 * Every VM exit saves the guest PKRS automatically; sync
+			 * vcpu->arch.pkrs to VMCS.GUEST_PKRS to avoid pollution by the host PKRS.
+			 */
+			vmcs_write64(GUEST_IA32_PKRS, vcpu->arch.pkrs);
+		} else {
+			vmx_enable_intercept_for_msr(vcpu, MSR_IA32_PKRS, MSR_TYPE_RW);
+			vm_entry_controls_clearbit(vmx, VM_ENTRY_LOAD_IA32_PKRS);
+			vm_exit_controls_clearbit(vmx, VM_EXIT_LOAD_IA32_PKRS);
+		}
 	}
 
 	vmcs_writel(CR4_READ_SHADOW, cr4);
@@ -5841,6 +5902,8 @@  void dump_vmcs(void)
 		       vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
 	if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
 		pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
+	if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PKRS)
+		pr_err("PKRS = 0x%016llx\n", vmcs_read64(GUEST_IA32_PKRS));
 	pr_err("Interruptibility = %08x  ActivityState = %08x\n",
 	       vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
 	       vmcs_read32(GUEST_ACTIVITY_STATE));
@@ -5876,6 +5939,8 @@  void dump_vmcs(void)
 	    vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
 		pr_err("PerfGlobCtl = 0x%016llx\n",
 		       vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
+	if (vmexit_ctl & VM_EXIT_LOAD_IA32_PKRS)
+		pr_err("PKRS = 0x%016llx\n", vmcs_read64(HOST_IA32_PKRS));
 
 	pr_err("*** Control State ***\n");
 	pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
@@ -6776,6 +6841,10 @@  static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
 	pt_guest_exit(vmx);
 
+	if (static_cpu_has(X86_FEATURE_PKS) &&
+	    kvm_read_cr4_bits(vcpu, X86_CR4_PKS))
+		vcpu->arch.pkrs = vmcs_read64(GUEST_IA32_PKRS);
+
 	kvm_load_host_xsave_state(vcpu);
 
 	vmx->nested.nested_run_pending = 0;
@@ -7280,6 +7349,14 @@  static __init void vmx_set_cpu_caps(void)
 	if (vmx_pt_mode_is_host_guest())
 		kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
 
+	/*
+	 * PKS is not yet implemented for shadow paging.
+	 * If VM_{ENTRY, EXIT}_LOAD_IA32_PKRS is not supported,
+	 * don't expose PKS either.
+	 */
+	if (enable_ept && cpu_has_load_ia32_pkrs())
+		kvm_cpu_cap_check_and_set(X86_FEATURE_PKS);
+
 	if (vmx_umip_emulated())
 		kvm_cpu_cap_set(X86_FEATURE_UMIP);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f5ede41bf9e6..684ef760481c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1213,7 +1213,7 @@  static const u32 msrs_to_save_all[] = {
 	MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
 	MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
 	MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
-	MSR_IA32_UMWAIT_CONTROL,
+	MSR_IA32_UMWAIT_CONTROL, MSR_IA32_PKRS,
 
 	MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
 	MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
@@ -5718,6 +5718,10 @@  static void kvm_init_msr_list(void)
 				intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
 				continue;
 			break;
+		case MSR_IA32_PKRS:
+			if (!kvm_cpu_cap_has(X86_FEATURE_PKS))
+				continue;
+			break;
 		case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
 			if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
 			    min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
@@ -10026,6 +10030,8 @@  void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
 	vcpu->arch.ia32_xss = 0;
 
+	vcpu->arch.pkrs = 0;
+
 	kvm_x86_ops.vcpu_reset(vcpu, init_event);
 }
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 3900ab0c6004..b63e32baad00 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -366,6 +366,12 @@  static inline bool kvm_dr6_valid(u64 data)
 	return !(data >> 32);
 }
 
+static inline bool kvm_pkrs_valid(u64 data)
+{
+	/* bits [63:32] must be zero */
+	return !(data >> 32);
+}
+
 void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
 void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
 int kvm_spec_ctrl_test_value(u64 value);
@@ -398,6 +404,8 @@  bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type);
 		__reserved_bits |= X86_CR4_UMIP;        \
 	if (!__cpu_has(__c, X86_FEATURE_VMX))           \
 		__reserved_bits |= X86_CR4_VMXE;        \
+	if (!__cpu_has(__c, X86_FEATURE_PKS))           \
+		__reserved_bits |= X86_CR4_PKS;         \
 	__reserved_bits;                                \
 })
 
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index 57718716cc70..8027f854c600 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -390,3 +390,9 @@  void pks_key_free(int pkey)
 	__clear_bit(pkey, &pks_key_allocation_map);
 }
 EXPORT_SYMBOL_GPL(pks_key_free);
+
+u32 get_current_pkrs(void)
+{
+	return this_cpu_read(pkrs_cache);
+}
+EXPORT_SYMBOL_GPL(get_current_pkrs);
diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h
index bed0e293f13b..480429020f4c 100644
--- a/include/linux/pkeys.h
+++ b/include/linux/pkeys.h
@@ -72,6 +72,10 @@  static inline void pks_mk_readwrite(int pkey)
 {
 	pr_err("%s is not valid without PKS support\n", __func__);
 }
+static inline u32 get_current_pkrs(void)
+{
+	return 0;
+}
 #endif /* ! CONFIG_ARCH_HAS_SUPERVISOR_PKEYS */
 
 #endif /* _LINUX_PKEYS_H */