[v6,7/7] KVM: VMX: Enable PKS for nested VM

Message ID 20220221080840.7369-8-chenyi.qiang@intel.com
State New, archived
Series KVM: PKS Virtualization support

Commit Message

Chenyi Qiang Feb. 21, 2022, 8:08 a.m. UTC
The PKS MSR (IA32_PKRS) is passed through to the guest directly. Configure
the MSR to match the L0/L1 settings so that a nested VM runs PKS properly.

Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com>
---
 arch/x86/kvm/vmx/nested.c | 38 ++++++++++++++++++++++++++++++++++++--
 arch/x86/kvm/vmx/vmcs12.c |  2 ++
 arch/x86/kvm/vmx/vmcs12.h |  4 ++++
 arch/x86/kvm/vmx/vmx.c    |  1 +
 arch/x86/kvm/vmx/vmx.h    |  2 ++
 5 files changed, 45 insertions(+), 2 deletions(-)

Comments

Sean Christopherson March 30, 2022, 9:47 p.m. UTC | #1
On Mon, Feb 21, 2022, Chenyi Qiang wrote:
> The PKS MSR (IA32_PKRS) is passed through to the guest directly. Configure
> the MSR to match the L0/L1 settings so that a nested VM runs PKS properly.
> 
> Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com>
> ---
>  arch/x86/kvm/vmx/nested.c | 38 ++++++++++++++++++++++++++++++++++++--
>  arch/x86/kvm/vmx/vmcs12.c |  2 ++
>  arch/x86/kvm/vmx/vmcs12.h |  4 ++++
>  arch/x86/kvm/vmx/vmx.c    |  1 +
>  arch/x86/kvm/vmx/vmx.h    |  2 ++
>  5 files changed, 45 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index f235f77cbc03..c42a1df385ef 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -252,6 +252,10 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
>  	dest->ds_sel = src->ds_sel;
>  	dest->es_sel = src->es_sel;
>  #endif
> +	if (unlikely(src->pkrs != dest->pkrs)) {
> +		vmcs_write64(HOST_IA32_PKRS, src->pkrs);
> +		dest->pkrs = src->pkrs;
> +	}

It's worth adding a helper for this, a la vmx_set_host_fs_gs(), though this one
can probably be an inline in vmx.h.  E.g. to yield

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index bfa37c7665a5..906a2913a886 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -252,10 +252,7 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
        dest->ds_sel = src->ds_sel;
        dest->es_sel = src->es_sel;
 #endif
-       if (unlikely(src->pkrs != dest->pkrs)) {
-               vmcs_write64(HOST_IA32_PKRS, src->pkrs);
-               dest->pkrs = src->pkrs;
-       }
+       vmx_set_host_pkrs(dest, src->pkrs);
 }

 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 35fee600fae7..b6b5f1a46544 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1157,10 +1157,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
         */
        if (vm_exit_controls_get(vmx) & VM_EXIT_LOAD_IA32_PKRS) {
                host_pkrs = get_current_pkrs();
-               if (unlikely(host_pkrs != host_state->pkrs)) {
-                       vmcs_write64(HOST_IA32_PKRS, host_pkrs);
-                       host_state->pkrs = host_pkrs;
-               }
+               vmx_set_host_pkrs(host_state, host_pkrs);
        }

 #ifdef CONFIG_X86_64

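where the helper itself could be something like this in vmx.h (a sketch;
it assumes the pkrs field added earlier in this series lives in struct
vmcs_host_state, as both call sites above imply):

	static inline void vmx_set_host_pkrs(struct vmcs_host_state *host,
					     u64 pkrs)
	{
		/* Skip the VMWRITE if the cached value is already current. */
		if (unlikely(pkrs != host->pkrs)) {
			vmcs_write64(HOST_IA32_PKRS, pkrs);
			host->pkrs = pkrs;
		}
	}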

>  }
>  
>  static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
> @@ -685,6 +689,9 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
>  	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
>  					 MSR_IA32_PRED_CMD, MSR_TYPE_W);
>  
> +	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
> +					 MSR_IA32_PKRS, MSR_TYPE_RW);
> +
>  	kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
>  
>  	vmx->nested.force_msr_bitmap_recalc = false;
> @@ -2433,6 +2440,10 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
>  		if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
>  		    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
>  			vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
> +
> +		if (vmx->nested.nested_run_pending &&
> +		    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS))
> +			vmcs_write64(GUEST_IA32_PKRS, vmcs12->guest_ia32_pkrs);
>  	}
>  
>  	if (nested_cpu_has_xsaves(vmcs12))
> @@ -2521,6 +2532,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
>  	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
>  	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
>  		vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
> +	if (kvm_cpu_cap_has(X86_FEATURE_PKS) &&
> +	    (!vmx->nested.nested_run_pending ||
> +	     !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS)))
> +		vmcs_write64(GUEST_IA32_PKRS, vmx->nested.vmcs01_guest_pkrs);
> +
>  	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
>  
>  	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
> @@ -2897,6 +2913,10 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
>  					   vmcs12->host_ia32_perf_global_ctrl)))
>  		return -EINVAL;
>  
> +	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PKRS) &&
> +		CC(!kvm_pkrs_valid(vmcs12->host_ia32_pkrs)))

Please align the indentation:

	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PKRS) &&
	    CC(!kvm_pkrs_valid(vmcs12->host_ia32_pkrs)))
		return -EINVAL;

> +		return -EINVAL;
> +
>  #ifdef CONFIG_X86_64
>  	ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE);
>  #else
> @@ -3049,6 +3069,10 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
>  	if (nested_check_guest_non_reg_state(vmcs12))
>  		return -EINVAL;
>  
> +	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS) &&
> +	    CC(!kvm_pkrs_valid(vmcs12->guest_ia32_pkrs)))
> +		return -EINVAL;
> +
>  	return 0;
>  }
>  
> @@ -3377,6 +3401,9 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
>  	if (kvm_mpx_supported() &&
>  		!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
>  		vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
> +	if (kvm_cpu_cap_has(X86_FEATURE_PKS) &&
> +	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS))

This needs to read the current PKRS if from_vmentry == false, e.g.

	if (kvm_cpu_cap_has(X86_FEATURE_PKS) && 
	    (!from_vmentry ||
	     !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS)))

because in the migration case, if nested state is set after MSR state, the value
needs to come from the current MSR value, which was propagated to vmcs02 (which
this calls vmcs01, but whatever).

Note, I'm pretty sure the GUEST_BNDCFGS code is broken, surprise surprise.

> +		vmx->nested.vmcs01_guest_pkrs = vmcs_read64(GUEST_IA32_PKRS);
>  
>  	/*
>  	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
> @@ -4022,6 +4049,7 @@ static bool is_vmcs12_ext_field(unsigned long field)
>  	case GUEST_IDTR_BASE:
>  	case GUEST_PENDING_DBG_EXCEPTIONS:
>  	case GUEST_BNDCFGS:
> +	case GUEST_IA32_PKRS:
>  		return true;
>  	default:
>  		break;
> @@ -4073,6 +4101,8 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
>  		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
>  	if (kvm_mpx_supported())
>  		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
> +	if (guest_cpuid_has(vcpu, X86_FEATURE_PKS))

This needs to check vmx->nested.msrs.entry_ctls_* (I can never remember if it's
the high or low part...).  The SDM states PKRS is saved "if the processor supports
the 1-setting of the 'load PKRS' VM-entry control", which is different than PKRS
being supported in CPUID.  Also, guest CPUID is userspace controlled, e.g. userspace
could induce a failed VMREAD by giving a garbage CPUID model, where vmx->nested.msrs
can only be restricted by userspace, i.e. is trusted.

Happily, checking vmx->nested.msrs is also a performance win, as guest_cpuid_has()
can require walking a large array.
Chenyi Qiang March 31, 2022, 6:08 a.m. UTC | #2
On 3/31/2022 5:47 AM, Sean Christopherson wrote:
> On Mon, Feb 21, 2022, Chenyi Qiang wrote:
>> The PKS MSR (IA32_PKRS) is passed through to the guest directly. Configure
>> the MSR to match the L0/L1 settings so that a nested VM runs PKS properly.
>>
>> Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com>
>> ---
>>   arch/x86/kvm/vmx/nested.c | 38 ++++++++++++++++++++++++++++++++++++--
>>   arch/x86/kvm/vmx/vmcs12.c |  2 ++
>>   arch/x86/kvm/vmx/vmcs12.h |  4 ++++
>>   arch/x86/kvm/vmx/vmx.c    |  1 +
>>   arch/x86/kvm/vmx/vmx.h    |  2 ++
>>   5 files changed, 45 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
>> index f235f77cbc03..c42a1df385ef 100644
>> --- a/arch/x86/kvm/vmx/nested.c
>> +++ b/arch/x86/kvm/vmx/nested.c
>> @@ -252,6 +252,10 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
>>   	dest->ds_sel = src->ds_sel;
>>   	dest->es_sel = src->es_sel;
>>   #endif
>> +	if (unlikely(src->pkrs != dest->pkrs)) {
>> +		vmcs_write64(HOST_IA32_PKRS, src->pkrs);
>> +		dest->pkrs = src->pkrs;
>> +	}
> 
> It's worth adding a helper for this, a la vmx_set_host_fs_gs(), though this one
> can probably be an inline in vmx.h.  E.g. to yield
> 
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index bfa37c7665a5..906a2913a886 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -252,10 +252,7 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
>          dest->ds_sel = src->ds_sel;
>          dest->es_sel = src->es_sel;
>   #endif
> -       if (unlikely(src->pkrs != dest->pkrs)) {
> -               vmcs_write64(HOST_IA32_PKRS, src->pkrs);
> -               dest->pkrs = src->pkrs;
> -       }
> +       vmx_set_host_pkrs(dest, src->pkrs);
>   }
> 
>   static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 35fee600fae7..b6b5f1a46544 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -1157,10 +1157,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
>           */
>          if (vm_exit_controls_get(vmx) & VM_EXIT_LOAD_IA32_PKRS) {
>                  host_pkrs = get_current_pkrs();
> -               if (unlikely(host_pkrs != host_state->pkrs)) {
> -                       vmcs_write64(HOST_IA32_PKRS, host_pkrs);
> -                       host_state->pkrs = host_pkrs;
> -               }
> +               vmx_set_host_pkrs(host_state, host_pkrs);
>          }
> 
>   #ifdef CONFIG_X86_64
> 
> 

Will do.

>>   }
>>   
>>   static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
>> @@ -685,6 +689,9 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
>>   	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
>>   					 MSR_IA32_PRED_CMD, MSR_TYPE_W);
>>   
>> +	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
>> +					 MSR_IA32_PKRS, MSR_TYPE_RW);
>> +
>>   	kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
>>   
>>   	vmx->nested.force_msr_bitmap_recalc = false;
>> @@ -2433,6 +2440,10 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
>>   		if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
>>   		    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
>>   			vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
>> +
>> +		if (vmx->nested.nested_run_pending &&
>> +		    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS))
>> +			vmcs_write64(GUEST_IA32_PKRS, vmcs12->guest_ia32_pkrs);
>>   	}
>>   
>>   	if (nested_cpu_has_xsaves(vmcs12))
>> @@ -2521,6 +2532,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
>>   	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
>>   	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
>>   		vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
>> +	if (kvm_cpu_cap_has(X86_FEATURE_PKS) &&
>> +	    (!vmx->nested.nested_run_pending ||
>> +	     !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS)))
>> +		vmcs_write64(GUEST_IA32_PKRS, vmx->nested.vmcs01_guest_pkrs);
>> +
>>   	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
>>   
>>   	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
>> @@ -2897,6 +2913,10 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
>>   					   vmcs12->host_ia32_perf_global_ctrl)))
>>   		return -EINVAL;
>>   
>> +	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PKRS) &&
>> +		CC(!kvm_pkrs_valid(vmcs12->host_ia32_pkrs)))
> 
> Please align the indentation:
> 
> 	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PKRS) &&
> 	    CC(!kvm_pkrs_valid(vmcs12->host_ia32_pkrs)))
> 		return -EINVAL;
> 

Fixed.

>> +		return -EINVAL;
>> +
>>   #ifdef CONFIG_X86_64
>>   	ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE);
>>   #else
>> @@ -3049,6 +3069,10 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
>>   	if (nested_check_guest_non_reg_state(vmcs12))
>>   		return -EINVAL;
>>   
>> +	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS) &&
>> +	    CC(!kvm_pkrs_valid(vmcs12->guest_ia32_pkrs)))
>> +		return -EINVAL;
>> +
>>   	return 0;
>>   }
>>   
>> @@ -3377,6 +3401,9 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
>>   	if (kvm_mpx_supported() &&
>>   		!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
>>   		vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
>> +	if (kvm_cpu_cap_has(X86_FEATURE_PKS) &&
>> +	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS))
> 
> This needs to read the current PKRS if from_vmentry == false, e.g.
> 
> 	if (kvm_cpu_cap_has(X86_FEATURE_PKS) &&
> 	    (!from_vmentry ||
> 	     !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS)))
> 
> because in the migration case, if nested state is set after MSR state, the value
> needs to come from the current MSR value, which was propagated to vmcs02 (which
> this calls vmcs01, but whatever).
> 
> Note, I'm pretty sure the GUEST_BNDCFGS code is broken, surprise surprise.
> 

Yes, I missed the migration case and see your point here. Will fix it, and
also fix the GUEST_BNDCFGS code in a separate patch.
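The GUEST_BNDCFGS fix would presumably take the same shape (a sketch
mirroring the suggested !from_vmentry condition, not a posted patch):

	/* Also snapshot BNDCFGS when restoring nested state after MSRs. */
	if (kvm_mpx_supported() &&
	    (!from_vmentry ||
	     !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
		vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);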

>> +		vmx->nested.vmcs01_guest_pkrs = vmcs_read64(GUEST_IA32_PKRS);
>>   
>>   	/*
>>   	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
>> @@ -4022,6 +4049,7 @@ static bool is_vmcs12_ext_field(unsigned long field)
>>   	case GUEST_IDTR_BASE:
>>   	case GUEST_PENDING_DBG_EXCEPTIONS:
>>   	case GUEST_BNDCFGS:
>> +	case GUEST_IA32_PKRS:
>>   		return true;
>>   	default:
>>   		break;
>> @@ -4073,6 +4101,8 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
>>   		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
>>   	if (kvm_mpx_supported())
>>   		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
>> +	if (guest_cpuid_has(vcpu, X86_FEATURE_PKS))
> 
> This needs to check vmx->nested.msrs.entry_ctls_* (I can never remember if it's
> the high or low part...).  The SDM states PKRS is saved "if the processor supports
> the 1-setting of the 'load PKRS' VM-entry control", which is different than PKRS
> being supported in CPUID.  Also, guest CPUID is userspace controlled, e.g. userspace
> could induce a failed VMREAD by giving a garbage CPUID model, where vmx->nested.msrs
> can only be restricted by userspace, i.e. is trusted.
> 
> Happily, checking vmx->nested.msrs is also a performance win, as guest_cpuid_has()
> can require walking a large array.

Makes sense. Checking vmx->nested.msrs.entry_ctls_high is more accurate.
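i.e., the sync in sync_vmcs02_to_vmcs12_rare() would become something like
(a sketch based on the above):

	/* Save PKRS iff L1 may set the "load PKRS" VM-entry control. */
	if (vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_IA32_PKRS)
		vmcs12->guest_ia32_pkrs = vmcs_read64(GUEST_IA32_PKRS);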

Patch

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index f235f77cbc03..c42a1df385ef 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -252,6 +252,10 @@  static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
 	dest->ds_sel = src->ds_sel;
 	dest->es_sel = src->es_sel;
 #endif
+	if (unlikely(src->pkrs != dest->pkrs)) {
+		vmcs_write64(HOST_IA32_PKRS, src->pkrs);
+		dest->pkrs = src->pkrs;
+	}
 }
 
 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
@@ -685,6 +689,9 @@  static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
 					 MSR_IA32_PRED_CMD, MSR_TYPE_W);
 
+	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+					 MSR_IA32_PKRS, MSR_TYPE_RW);
+
 	kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
 
 	vmx->nested.force_msr_bitmap_recalc = false;
@@ -2433,6 +2440,10 @@  static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
 		if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
 		    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
 			vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+
+		if (vmx->nested.nested_run_pending &&
+		    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS))
+			vmcs_write64(GUEST_IA32_PKRS, vmcs12->guest_ia32_pkrs);
 	}
 
 	if (nested_cpu_has_xsaves(vmcs12))
@@ -2521,6 +2532,11 @@  static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
 	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
 		vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
+	if (kvm_cpu_cap_has(X86_FEATURE_PKS) &&
+	    (!vmx->nested.nested_run_pending ||
+	     !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS)))
+		vmcs_write64(GUEST_IA32_PKRS, vmx->nested.vmcs01_guest_pkrs);
+
 	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
 
 	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
@@ -2897,6 +2913,10 @@  static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
 					   vmcs12->host_ia32_perf_global_ctrl)))
 		return -EINVAL;
 
+	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PKRS) &&
+		CC(!kvm_pkrs_valid(vmcs12->host_ia32_pkrs)))
+		return -EINVAL;
+
 #ifdef CONFIG_X86_64
 	ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE);
 #else
@@ -3049,6 +3069,10 @@  static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
 	if (nested_check_guest_non_reg_state(vmcs12))
 		return -EINVAL;
 
+	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS) &&
+	    CC(!kvm_pkrs_valid(vmcs12->guest_ia32_pkrs)))
+		return -EINVAL;
+
 	return 0;
 }
 
@@ -3377,6 +3401,9 @@  enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 	if (kvm_mpx_supported() &&
 		!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
 		vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+	if (kvm_cpu_cap_has(X86_FEATURE_PKS) &&
+	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS))
+		vmx->nested.vmcs01_guest_pkrs = vmcs_read64(GUEST_IA32_PKRS);
 
 	/*
 	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
@@ -4022,6 +4049,7 @@  static bool is_vmcs12_ext_field(unsigned long field)
 	case GUEST_IDTR_BASE:
 	case GUEST_PENDING_DBG_EXCEPTIONS:
 	case GUEST_BNDCFGS:
+	case GUEST_IA32_PKRS:
 		return true;
 	default:
 		break;
@@ -4073,6 +4101,8 @@  static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
 		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
 	if (kvm_mpx_supported())
 		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+	if (guest_cpuid_has(vcpu, X86_FEATURE_PKS))
+		vmcs12->guest_ia32_pkrs = vmcs_read64(GUEST_IA32_PKRS);
 
 	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
 }
@@ -4310,6 +4340,9 @@  static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 		WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
 					 vmcs12->host_ia32_perf_global_ctrl));
 
+	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PKRS)
+		vmcs_write64(GUEST_IA32_PKRS, vmcs12->host_ia32_pkrs);
+
 	/* Set L1 segment info according to Intel SDM
 	    27.5.2 Loading Host Segment and Descriptor-Table Registers */
 	seg = (struct kvm_segment) {
@@ -6527,7 +6560,8 @@  void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
 		VM_EXIT_HOST_ADDR_SPACE_SIZE |
 #endif
 		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
-		VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+		VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
+		VM_EXIT_LOAD_IA32_PKRS;
 	msrs->exit_ctls_high |=
 		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
 		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
@@ -6547,7 +6581,7 @@  void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
 		VM_ENTRY_IA32E_MODE |
 #endif
 		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
-		VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+		VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_PKRS;
 	msrs->entry_ctls_high |=
 		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
 
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index cab6ba7a5005..f3b964e1496c 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -62,9 +62,11 @@  const unsigned short vmcs_field_to_offset_table[] = {
 	FIELD64(GUEST_PDPTR2, guest_pdptr2),
 	FIELD64(GUEST_PDPTR3, guest_pdptr3),
 	FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
+	FIELD64(GUEST_IA32_PKRS, guest_ia32_pkrs),
 	FIELD64(HOST_IA32_PAT, host_ia32_pat),
 	FIELD64(HOST_IA32_EFER, host_ia32_efer),
 	FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
+	FIELD64(HOST_IA32_PKRS, host_ia32_pkrs),
 	FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
 	FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
 	FIELD(EXCEPTION_BITMAP, exception_bitmap),
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 2a45f026ee11..42cb4a2b806a 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -185,6 +185,8 @@  struct __packed vmcs12 {
 	u16 host_gs_selector;
 	u16 host_tr_selector;
 	u16 guest_pml_index;
+	u64 host_ia32_pkrs;
+	u64 guest_ia32_pkrs;
 };
 
 /*
@@ -359,6 +361,8 @@  static inline void vmx_check_vmcs12_offsets(void)
 	CHECK_OFFSET(host_gs_selector, 992);
 	CHECK_OFFSET(host_tr_selector, 994);
 	CHECK_OFFSET(guest_pml_index, 996);
+	CHECK_OFFSET(host_ia32_pkrs, 998);
+	CHECK_OFFSET(guest_ia32_pkrs, 1006);
 }
 
 extern const unsigned short vmcs_field_to_offset_table[];
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 6fc70ddeff5a..978f4a61771f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7248,6 +7248,7 @@  static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
 	cr4_fixed1_update(X86_CR4_PKE,        ecx, feature_bit(PKU));
 	cr4_fixed1_update(X86_CR4_UMIP,       ecx, feature_bit(UMIP));
 	cr4_fixed1_update(X86_CR4_LA57,       ecx, feature_bit(LA57));
+	cr4_fixed1_update(X86_CR4_PKS,        ecx, feature_bit(PKS));
 
 #undef cr4_fixed1_update
 }
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index d704ba3a4af7..78bca3d14ed4 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -222,6 +222,8 @@  struct nested_vmx {
 	u64 vmcs01_debugctl;
 	u64 vmcs01_guest_bndcfgs;
 
+	u64 vmcs01_guest_pkrs;
+
 	/* to migrate it to L1 if L2 writes to L1's CR8 directly */
 	int l1_tpr_threshold;