[RESEND,v4,5/9] KVM: VMX: Add init/set/get functions for SPP
diff mbox series

Message ID 20190814070403.6588-6-weijiang.yang@intel.com
State New
Headers show
Series
  • Enable Sub-page Write Protection Support
Related show

Commit Message

Yang Weijiang Aug. 14, 2019, 7:03 a.m. UTC
init_spp() must be called before {get, set}_subpage
functions, it creates subpage access bitmaps for memory pages
and issues a KVM request to setup SPPT root pages.

kvm_mmu_set_subpages() is to enable SPP bit in EPT leaf page
and setup corresponding SPPT entries. The mmu_lock
is held before above operation. If it's called in EPT fault and
SPPT mis-config induced handler, mmu_lock is acquired outside
the function, otherwise, it's acquired inside it.

kvm_mmu_get_subpages() is used to query access bitmap for
protected page, it's also used in EPT fault handler to check
whether the fault EPT page is SPP protected as well.

Co-developed-by: He Chen <he.chen@linux.intel.com>
Signed-off-by: He Chen <he.chen@linux.intel.com>
Co-developed-by: Zhang Yi <yi.z.zhang@linux.intel.com>
Signed-off-by: Zhang Yi <yi.z.zhang@linux.intel.com>
Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
---
 arch/x86/include/asm/kvm_host.h |  18 ++++
 arch/x86/include/asm/vmx.h      |   2 +
 arch/x86/kvm/mmu.c              | 160 ++++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/vmx.c          |  48 ++++++++++
 arch/x86/kvm/x86.c              |  40 ++++++++
 include/linux/kvm_host.h        |   4 +-
 include/uapi/linux/kvm.h        |   9 ++
 7 files changed, 280 insertions(+), 1 deletion(-)

Comments

Vitaly Kuznetsov Aug. 14, 2019, 12:43 p.m. UTC | #1
Yang Weijiang <weijiang.yang@intel.com> writes:

> init_spp() must be called before {get, set}_subpage
> functions, it creates subpage access bitmaps for memory pages
> and issues a KVM request to setup SPPT root pages.
>
> kvm_mmu_set_subpages() is to enable SPP bit in EPT leaf page
> and setup corresponding SPPT entries. The mmu_lock
> is held before above operation. If it's called in EPT fault and
> SPPT mis-config induced handler, mmu_lock is acquired outside
> the function, otherwise, it's acquired inside it.
>
> kvm_mmu_get_subpages() is used to query access bitmap for
> protected page, it's also used in EPT fault handler to check
> whether the fault EPT page is SPP protected as well.
>
> Co-developed-by: He Chen <he.chen@linux.intel.com>
> Signed-off-by: He Chen <he.chen@linux.intel.com>
> Co-developed-by: Zhang Yi <yi.z.zhang@linux.intel.com>
> Signed-off-by: Zhang Yi <yi.z.zhang@linux.intel.com>
> Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  18 ++++
>  arch/x86/include/asm/vmx.h      |   2 +
>  arch/x86/kvm/mmu.c              | 160 ++++++++++++++++++++++++++++++++
>  arch/x86/kvm/vmx/vmx.c          |  48 ++++++++++
>  arch/x86/kvm/x86.c              |  40 ++++++++
>  include/linux/kvm_host.h        |   4 +-
>  include/uapi/linux/kvm.h        |   9 ++
>  7 files changed, 280 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 44f6e1757861..5c4882015acc 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -398,8 +398,13 @@ struct kvm_mmu {
>  	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
>  	void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
>  			   u64 *spte, const void *pte);
> +	int (*get_subpages)(struct kvm *kvm, struct kvm_subpage *spp_info);
> +	int (*set_subpages)(struct kvm *kvm, struct kvm_subpage *spp_info);
> +	int (*init_spp)(struct kvm *kvm);
> +
>  	hpa_t root_hpa;
>  	gpa_t root_cr3;
> +	hpa_t sppt_root;

(I'm sorry if this was previously discussed, I didn't look into previous
submissions).

What happens when we launch a nested guest and switch vcpu->arch.mmu to
point at arch.guest_mmu? sppt_root will point to INVALID_PAGE and SPP
won't be enabled in VMCS?

(I'm sorry again, I'm likely missing something obvious)
Yang Weijiang Aug. 14, 2019, 2:34 p.m. UTC | #2
On Wed, Aug 14, 2019 at 02:43:39PM +0200, Vitaly Kuznetsov wrote:
> Yang Weijiang <weijiang.yang@intel.com> writes:
> 
> > init_spp() must be called before {get, set}_subpage
> > functions, it creates subpage access bitmaps for memory pages
> > and issues a KVM request to setup SPPT root pages.
> >
> > kvm_mmu_set_subpages() is to enable SPP bit in EPT leaf page
> > and setup corresponding SPPT entries. The mmu_lock
> > is held before above operation. If it's called in EPT fault and
> > SPPT mis-config induced handler, mmu_lock is acquired outside
> > the function, otherwise, it's acquired inside it.
> >
> > kvm_mmu_get_subpages() is used to query access bitmap for
> > protected page, it's also used in EPT fault handler to check
> > whether the fault EPT page is SPP protected as well.
> >
> > Co-developed-by: He Chen <he.chen@linux.intel.com>
> > Signed-off-by: He Chen <he.chen@linux.intel.com>
> > Co-developed-by: Zhang Yi <yi.z.zhang@linux.intel.com>
> > Signed-off-by: Zhang Yi <yi.z.zhang@linux.intel.com>
> > Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
> > ---
> >  arch/x86/include/asm/kvm_host.h |  18 ++++
> >  arch/x86/include/asm/vmx.h      |   2 +
> >  arch/x86/kvm/mmu.c              | 160 ++++++++++++++++++++++++++++++++
> >  arch/x86/kvm/vmx/vmx.c          |  48 ++++++++++
> >  arch/x86/kvm/x86.c              |  40 ++++++++
> >  include/linux/kvm_host.h        |   4 +-
> >  include/uapi/linux/kvm.h        |   9 ++
> >  7 files changed, 280 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 44f6e1757861..5c4882015acc 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -398,8 +398,13 @@ struct kvm_mmu {
> >  	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
> >  	void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> >  			   u64 *spte, const void *pte);
> > +	int (*get_subpages)(struct kvm *kvm, struct kvm_subpage *spp_info);
> > +	int (*set_subpages)(struct kvm *kvm, struct kvm_subpage *spp_info);
> > +	int (*init_spp)(struct kvm *kvm);
> > +
> >  	hpa_t root_hpa;
> >  	gpa_t root_cr3;
> > +	hpa_t sppt_root;
> 
> (I'm sorry if this was previously discussed, I didn't look into previous
> submissions).
> 
> What happens when we launch a nested guest and switch vcpu->arch.mmu to
> point at arch.guest_mmu? sppt_root will point to INVALID_PAGE and SPP
> won't be enabled in VMCS?
> 
> (I'm sorry again, I'm likely missing something obvious)
> 
> -- 
> Vitaly

Hi, Vitaly,
Thanks for raising a good qeustion, I must have missed the nested case,
I'll double check how to support the nested case.
Yang Weijiang Aug. 15, 2019, 1:43 p.m. UTC | #3
On Wed, Aug 14, 2019 at 02:43:39PM +0200, Vitaly Kuznetsov wrote:
> Yang Weijiang <weijiang.yang@intel.com> writes:
> 
> > init_spp() must be called before {get, set}_subpage
> > functions, it creates subpage access bitmaps for memory pages
> > and issues a KVM request to setup SPPT root pages.
> >
> > kvm_mmu_set_subpages() is to enable SPP bit in EPT leaf page
> > and setup corresponding SPPT entries. The mmu_lock
> > is held before above operation. If it's called in EPT fault and
> > SPPT mis-config induced handler, mmu_lock is acquired outside
> > the function, otherwise, it's acquired inside it.
> >
> > kvm_mmu_get_subpages() is used to query access bitmap for
> > protected page, it's also used in EPT fault handler to check
> > whether the fault EPT page is SPP protected as well.
> >
> > Co-developed-by: He Chen <he.chen@linux.intel.com>
> > Signed-off-by: He Chen <he.chen@linux.intel.com>
> > Co-developed-by: Zhang Yi <yi.z.zhang@linux.intel.com>
> > Signed-off-by: Zhang Yi <yi.z.zhang@linux.intel.com>
> > Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
> > ---
> >  arch/x86/include/asm/kvm_host.h |  18 ++++
> >  arch/x86/include/asm/vmx.h      |   2 +
> >  arch/x86/kvm/mmu.c              | 160 ++++++++++++++++++++++++++++++++
> >  arch/x86/kvm/vmx/vmx.c          |  48 ++++++++++
> >  arch/x86/kvm/x86.c              |  40 ++++++++
> >  include/linux/kvm_host.h        |   4 +-
> >  include/uapi/linux/kvm.h        |   9 ++
> >  7 files changed, 280 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 44f6e1757861..5c4882015acc 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -398,8 +398,13 @@ struct kvm_mmu {
> >  	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
> >  	void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> >  			   u64 *spte, const void *pte);
> > +	int (*get_subpages)(struct kvm *kvm, struct kvm_subpage *spp_info);
> > +	int (*set_subpages)(struct kvm *kvm, struct kvm_subpage *spp_info);
> > +	int (*init_spp)(struct kvm *kvm);
> > +
> >  	hpa_t root_hpa;
> >  	gpa_t root_cr3;
> > +	hpa_t sppt_root;
> 
> (I'm sorry if this was previously discussed, I didn't look into previous
> submissions).
> 
> What happens when we launch a nested guest and switch vcpu->arch.mmu to
> point at arch.guest_mmu? sppt_root will point to INVALID_PAGE and SPP
> won't be enabled in VMCS?
> 
> (I'm sorry again, I'm likely missing something obvious)
> 
> -- 
> Vitaly
Hi, Vitaly,
After looked into the issue and others, I feel to make SPP co-existing
with nested VM is not good, the major reason is, L1 pages protected by
SPP are transparent to L1 VM, if it launches L2 VM, probably the
pages would be allocated to L2 VM, and that will bother to L1 and L2.
Given the feature is new and I don't see nested VM can benefit
from it right now, I would like to make SPP and nested feature mutually
exclusive, i.e., detecting if the other part is active before activate one
feature,what do you think of it? 
thanks!
Vitaly Kuznetsov Aug. 15, 2019, 2:03 p.m. UTC | #4
Yang Weijiang <weijiang.yang@intel.com> writes:

> After looked into the issue and others, I feel to make SPP co-existing
> with nested VM is not good, the major reason is, L1 pages protected by
> SPP are transparent to L1 VM, if it launches L2 VM, probably the
> pages would be allocated to L2 VM, and that will bother to L1 and L2.
> Given the feature is new and I don't see nested VM can benefit
> from it right now, I would like to make SPP and nested feature mutually
> exclusive, i.e., detecting if the other part is active before activate one
> feature,what do you think of it? 

I was mostly worried about creating a loophole (if I understand
correctly) for guests to defeat SPP protection: just launching a nested
guest and giving it a protected page. I don't see a problem if we limit
SPP to non-nested guests as step 1: we, however, need to document this
side-effect of the ioctl. Also, if you decide to do this enforecement,
I'd suggest you forbid VMLAUCH/VMRESUME and not VMXON as kvm module
loads in linux guests automatically when the hardware is suitable.

Thanks,
Jim Mattson Aug. 15, 2019, 4:25 p.m. UTC | #5
On Thu, Aug 15, 2019 at 6:41 AM Yang Weijiang <weijiang.yang@intel.com> wrote:

> Hi, Vitaly,
> After looked into the issue and others, I feel to make SPP co-existing
> with nested VM is not good, the major reason is, L1 pages protected by
> SPP are transparent to L1 VM, if it launches L2 VM, probably the
> pages would be allocated to L2 VM, and that will bother to L1 and L2.
> Given the feature is new and I don't see nested VM can benefit
> from it right now, I would like to make SPP and nested feature mutually
> exclusive, i.e., detecting if the other part is active before activate one
> feature,what do you think of it?
> thanks!

How do you propose making the features mutually exclusive?
Sean Christopherson Aug. 15, 2019, 4:38 p.m. UTC | #6
On Thu, Aug 15, 2019 at 09:25:41AM -0700, Jim Mattson wrote:
> On Thu, Aug 15, 2019 at 6:41 AM Yang Weijiang <weijiang.yang@intel.com> wrote:
> 
> > Hi, Vitaly,
> > After looked into the issue and others, I feel to make SPP co-existing
> > with nested VM is not good, the major reason is, L1 pages protected by
> > SPP are transparent to L1 VM, if it launches L2 VM, probably the
> > pages would be allocated to L2 VM, and that will bother to L1 and L2.
> > Given the feature is new and I don't see nested VM can benefit
> > from it right now, I would like to make SPP and nested feature mutually
> > exclusive, i.e., detecting if the other part is active before activate one
> > feature,what do you think of it?
> > thanks!
> 
> How do you propose making the features mutually exclusive?

I haven't looked at the details or the end to end flow, but would it make
sense to exit to userspace on nested VMLAUNCH/VMRESUME if there are SPP
mappings?  And have the SPP ioctl() kick vCPUs out of guest.

KVM already exits on SPP violations, so presumably this is something that
can be punted to userspace.
Yang Weijiang Aug. 16, 2019, 1:31 p.m. UTC | #7
On Thu, Aug 15, 2019 at 09:38:44AM -0700, Sean Christopherson wrote:
> On Thu, Aug 15, 2019 at 09:25:41AM -0700, Jim Mattson wrote:
> > On Thu, Aug 15, 2019 at 6:41 AM Yang Weijiang <weijiang.yang@intel.com> wrote:
> > 
> > > Hi, Vitaly,
> > > After looked into the issue and others, I feel to make SPP co-existing
> > > with nested VM is not good, the major reason is, L1 pages protected by
> > > SPP are transparent to L1 VM, if it launches L2 VM, probably the
> > > pages would be allocated to L2 VM, and that will bother to L1 and L2.
> > > Given the feature is new and I don't see nested VM can benefit
> > > from it right now, I would like to make SPP and nested feature mutually
> > > exclusive, i.e., detecting if the other part is active before activate one
> > > feature,what do you think of it?
> > > thanks!
> > 
> > How do you propose making the features mutually exclusive?
> 
> I haven't looked at the details or the end to end flow, but would it make
> sense to exit to userspace on nested VMLAUNCH/VMRESUME if there are SPP
> mappings?  And have the SPP ioctl() kick vCPUs out of guest.
> 
> KVM already exits on SPP violations, so presumably this is something that
> can be punted to userspace.
Thanks Jim and Sean! Could we add a new flag in kvm to identify if nested VM is on
or off? That would make things easier. When VMLAUNCH is trapped,
set the flag, if VMXOFF is trapped, clear the flag.
Jim Mattson Aug. 16, 2019, 6:19 p.m. UTC | #8
On Fri, Aug 16, 2019 at 6:29 AM Yang Weijiang <weijiang.yang@intel.com> wrote:

> Thanks Jim and Sean! Could we add a new flag in kvm to identify if nested VM is on
> or off? That would make things easier. When VMLAUNCH is trapped,
> set the flag, if VMXOFF is trapped, clear the flag.

KVM_GET_NESTED_STATE has the requested information. If
data.vmx.vmxon_pa is anything other than -1, then the vCPU is in VMX
operation. If (flags & KVM_STATE_NESTED_GUEST_MODE), then L2 is
active.
Yang Weijiang Aug. 19, 2019, 2:08 a.m. UTC | #9
On Fri, Aug 16, 2019 at 11:19:46AM -0700, Jim Mattson wrote:
> On Fri, Aug 16, 2019 at 6:29 AM Yang Weijiang <weijiang.yang@intel.com> wrote:
> 
> > Thanks Jim and Sean! Could we add a new flag in kvm to identify if nested VM is on
> > or off? That would make things easier. When VMLAUNCH is trapped,
> > set the flag, if VMXOFF is trapped, clear the flag.
> 
> KVM_GET_NESTED_STATE has the requested information. If
> data.vmx.vmxon_pa is anything other than -1, then the vCPU is in VMX
> operation. If (flags & KVM_STATE_NESTED_GUEST_MODE), then L2 is
> active.
Thanks Jim, I'll reference the code and make necessary change in next
SPP patch release.
Yang Weijiang Aug. 19, 2019, 2:06 p.m. UTC | #10
On Thu, Aug 15, 2019 at 04:03:31PM +0200, Vitaly Kuznetsov wrote:
> Yang Weijiang <weijiang.yang@intel.com> writes:
> 
> > After looked into the issue and others, I feel to make SPP co-existing
> > with nested VM is not good, the major reason is, L1 pages protected by
> > SPP are transparent to L1 VM, if it launches L2 VM, probably the
> > pages would be allocated to L2 VM, and that will bother to L1 and L2.
> > Given the feature is new and I don't see nested VM can benefit
> > from it right now, I would like to make SPP and nested feature mutually
> > exclusive, i.e., detecting if the other part is active before activate one
> > feature,what do you think of it? 
> 
> I was mostly worried about creating a loophole (if I understand
> correctly) for guests to defeat SPP protection: just launching a nested
> guest and giving it a protected page. I don't see a problem if we limit
> SPP to non-nested guests as step 1: we, however, need to document this
> side-effect of the ioctl. Also, if you decide to do this enforecement,
> I'd suggest you forbid VMLAUCH/VMRESUME and not VMXON as kvm module
> loads in linux guests automatically when the hardware is suitable.
> 
> Thanks,
> 
> -- 
> Vitaly
OK, I'll follow your suggestion to add the exclusion, thanks!
Paolo Bonzini Aug. 19, 2019, 3:05 p.m. UTC | #11
On 14/08/19 09:03, Yang Weijiang wrote:
> +
> +int kvm_mmu_get_subpages(struct kvm *kvm, struct kvm_subpage *spp_info,
> +			 bool mmu_locked)
> +{
> +	u32 *access = spp_info->access_map;
> +	gfn_t gfn = spp_info->base_gfn;
> +	int npages = spp_info->npages;
> +	struct kvm_memory_slot *slot;
> +	int i;
> +	int ret;
> +
> +	if (!kvm->arch.spp_active)
> +	      return -ENODEV;
> +
> +	if (!mmu_locked)
> +	      spin_lock(&kvm->mmu_lock);
> +

Do not add this argument.  Just lock mmu_lock in the callers.

Paolo
Paolo Bonzini Aug. 19, 2019, 3:13 p.m. UTC | #12
On 14/08/19 09:03, Yang Weijiang wrote:
> init_spp() must be called before {get, set}_subpage
> functions, it creates subpage access bitmaps for memory pages
> and issues a KVM request to setup SPPT root pages.
> 
> kvm_mmu_set_subpages() is to enable SPP bit in EPT leaf page
> and setup corresponding SPPT entries. The mmu_lock
> is held before above operation. If it's called in EPT fault and
> SPPT mis-config induced handler, mmu_lock is acquired outside
> the function, otherwise, it's acquired inside it.
> 
> kvm_mmu_get_subpages() is used to query access bitmap for
> protected page, it's also used in EPT fault handler to check
> whether the fault EPT page is SPP protected as well.
> 
> Co-developed-by: He Chen <he.chen@linux.intel.com>
> Signed-off-by: He Chen <he.chen@linux.intel.com>
> Co-developed-by: Zhang Yi <yi.z.zhang@linux.intel.com>
> Signed-off-by: Zhang Yi <yi.z.zhang@linux.intel.com>
> Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  18 ++++
>  arch/x86/include/asm/vmx.h      |   2 +
>  arch/x86/kvm/mmu.c              | 160 ++++++++++++++++++++++++++++++++
>  arch/x86/kvm/vmx/vmx.c          |  48 ++++++++++
>  arch/x86/kvm/x86.c              |  40 ++++++++
>  include/linux/kvm_host.h        |   4 +-
>  include/uapi/linux/kvm.h        |   9 ++
>  7 files changed, 280 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 44f6e1757861..5c4882015acc 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -398,8 +398,13 @@ struct kvm_mmu {
>  	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
>  	void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
>  			   u64 *spte, const void *pte);
> +	int (*get_subpages)(struct kvm *kvm, struct kvm_subpage *spp_info);
> +	int (*set_subpages)(struct kvm *kvm, struct kvm_subpage *spp_info);
> +	int (*init_spp)(struct kvm *kvm);
> +
>  	hpa_t root_hpa;
>  	gpa_t root_cr3;
> +	hpa_t sppt_root;
>  	union kvm_mmu_role mmu_role;
>  	u8 root_level;
>  	u8 shadow_root_level;
> @@ -927,6 +932,8 @@ struct kvm_arch {
>  
>  	bool guest_can_read_msr_platform_info;
>  	bool exception_payload_enabled;
> +
> +	bool spp_active;
>  };
>  
>  struct kvm_vm_stat {
> @@ -1199,6 +1206,11 @@ struct kvm_x86_ops {
>  	uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu);
>  
>  	bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu);
> +
> +	bool (*get_spp_status)(void);
> +	int (*get_subpages)(struct kvm *kvm, struct kvm_subpage *spp_info);
> +	int (*set_subpages)(struct kvm *kvm, struct kvm_subpage *spp_info);
> +	int (*init_spp)(struct kvm *kvm);
>  };
>  
>  struct kvm_arch_async_pf {
> @@ -1417,6 +1429,12 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
>  void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
>  void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush);
>  
> +int kvm_mmu_get_subpages(struct kvm *kvm, struct kvm_subpage *spp_info,
> +			 bool mmu_locked);
> +int kvm_mmu_set_subpages(struct kvm *kvm, struct kvm_subpage *spp_info,
> +			 bool mmu_locked);
> +int kvm_mmu_init_spp(struct kvm *kvm);
> +
>  void kvm_enable_tdp(void);
>  void kvm_disable_tdp(void);
>  
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index a2c9e18e0ad7..6cb05ac07453 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -224,6 +224,8 @@ enum vmcs_field {
>  	XSS_EXIT_BITMAP_HIGH            = 0x0000202D,
>  	ENCLS_EXITING_BITMAP		= 0x0000202E,
>  	ENCLS_EXITING_BITMAP_HIGH	= 0x0000202F,
> +	SPPT_POINTER			= 0x00002030,
> +	SPPT_POINTER_HIGH		= 0x00002031,
>  	TSC_MULTIPLIER                  = 0x00002032,
>  	TSC_MULTIPLIER_HIGH             = 0x00002033,
>  	GUEST_PHYSICAL_ADDRESS          = 0x00002400,
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 16e390fd6e58..16d3ca544b67 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3756,6 +3756,9 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>  		    (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
>  			mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
>  					   &invalid_list);
> +			if (vcpu->kvm->arch.spp_active)
> +				mmu_free_root_page(vcpu->kvm, &mmu->sppt_root,
> +						   &invalid_list);
>  		} else {
>  			for (i = 0; i < 4; ++i)
>  				if (mmu->pae_root[i] != 0)
> @@ -4414,6 +4417,158 @@ int kvm_mmu_setup_spp_structure(struct kvm_vcpu *vcpu,
>  	return ret;
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_setup_spp_structure);
> +
> +int kvm_mmu_init_spp(struct kvm *kvm)
> +{
> +	int i, ret;
> +	struct kvm_vcpu *vcpu;
> +	int root_level;
> +	struct kvm_mmu_page *ssp_sp;
> +
> +
> +	if (!kvm_x86_ops->get_spp_status())
> +	      return -ENODEV;
> +
> +	if (kvm->arch.spp_active)
> +	      return 0;
> +
> +	ret = kvm_subpage_create_bitmaps(kvm);
> +
> +	if (ret)
> +	      return ret;
> +
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		/* prepare caches for SPP setup.*/
> +		mmu_topup_memory_caches(vcpu);
> +		root_level = vcpu->arch.mmu->shadow_root_level;
> +		ssp_sp = kvm_mmu_get_spp_page(vcpu, 0, root_level);
> +		++ssp_sp->root_count;
> +		vcpu->arch.mmu->sppt_root = __pa(ssp_sp->spt);
> +		kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
> +	}
> +
> +	kvm->arch.spp_active = true;
> +	return 0;
> +}
> +
> +int kvm_mmu_get_subpages(struct kvm *kvm, struct kvm_subpage *spp_info,
> +			 bool mmu_locked)
> +{
> +	u32 *access = spp_info->access_map;
> +	gfn_t gfn = spp_info->base_gfn;
> +	int npages = spp_info->npages;
> +	struct kvm_memory_slot *slot;
> +	int i;
> +	int ret;
> +
> +	if (!kvm->arch.spp_active)
> +	      return -ENODEV;
> +
> +	if (!mmu_locked)
> +	      spin_lock(&kvm->mmu_lock);
> +
> +	for (i = 0; i < npages; i++, gfn++) {
> +		slot = gfn_to_memslot(kvm, gfn);
> +		if (!slot) {
> +			ret = -EFAULT;
> +			goto out_unlock;
> +		}
> +		access[i] = *gfn_to_subpage_wp_info(slot, gfn);
> +	}
> +
> +	ret = i;
> +
> +out_unlock:
> +	if (!mmu_locked)
> +	      spin_unlock(&kvm->mmu_lock);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(kvm_mmu_get_subpages);
> +
> +int kvm_mmu_set_subpages(struct kvm *kvm, struct kvm_subpage *spp_info,
> +			 bool mmu_locked)
> +{
> +	u32 *access = spp_info->access_map;
> +	gfn_t gfn = spp_info->base_gfn;
> +	int npages = spp_info->npages;
> +	struct kvm_memory_slot *slot;
> +	struct kvm_vcpu *vcpu;
> +	struct kvm_rmap_head *rmap_head;
> +	int i, k;
> +	u32 *wp_map;
> +	int ret = -EFAULT;
> +
> +	if (!kvm->arch.spp_active)
> +		return -ENODEV;
> +
> +	if (!mmu_locked)
> +	      spin_lock(&kvm->mmu_lock);
> +
> +	for (i = 0; i < npages; i++, gfn++) {
> +		slot = gfn_to_memslot(kvm, gfn);
> +		if (!slot)
> +			goto out_unlock;
> +
> +		/*
> +		 * check whether the target 4KB page exists in EPT leaf
> +		 * entries.If it's there, we can setup SPP protection now,
> +		 * otherwise, need to defer it to EPT page fault handler.
> +		 */
> +		rmap_head = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot);
> +
> +		if (rmap_head->val) {
> +			/*
> +			 * if all subpages are not writable, open SPP bit in
> +			 * EPT leaf entry to enable SPP protection for
> +			 * corresponding page.
> +			 */
> +			if (access[i] != FULL_SPP_ACCESS) {
> +				ret = kvm_mmu_open_subpage_write_protect(kvm,
> +						slot, gfn);
> +
> +				if (ret)
> +					goto out_err;
> +
> +				kvm_for_each_vcpu(k, vcpu, kvm)
> +					kvm_mmu_setup_spp_structure(vcpu,
> +						access[i], gfn);
> +			} else {
> +				ret = kvm_mmu_clear_subpage_write_protect(kvm,
> +						slot, gfn);
> +				if (ret)
> +					goto out_err;
> +			}
> +
> +		} else
> +			pr_info("%s - No ETP entry, gfn = 0x%llx, access = 0x%x.\n", __func__, gfn, access[i]);
> +
> +		/* if this function is called in tdp_page_fault() or
> +		 * spp_handler(), mmu_locked = true, SPP access bitmap
> +		 * is being used, otherwise, it's being stored.
> +		 */
> +		if (!mmu_locked) {
> +			wp_map = gfn_to_subpage_wp_info(slot, gfn);
> +			*wp_map = access[i];
> +		}
> +	}
> +
> +	ret = i;
> +out_err:
> +	if (ret < 0)
> +	      pr_info("SPP-Error, didn't get the gfn:" \
> +		      "%llx from EPT leaf.\n"
> +		      "Current we don't support SPP on" \
> +		      "huge page.\n"
> +		      "Please disable huge page and have" \
> +		      "another try.\n", gfn);
> +out_unlock:
> +	if (!mmu_locked)
> +	      spin_unlock(&kvm->mmu_lock);
> +
> +	return ret;
> +}
> +
>  static void nonpaging_init_context(struct kvm_vcpu *vcpu,
>  				   struct kvm_mmu *context)
>  {
> @@ -5104,6 +5259,9 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
>  	context->get_cr3 = get_cr3;
>  	context->get_pdptr = kvm_pdptr_read;
>  	context->inject_page_fault = kvm_inject_page_fault;
> +	context->get_subpages = kvm_x86_ops->get_subpages;
> +	context->set_subpages = kvm_x86_ops->set_subpages;
> +	context->init_spp = kvm_x86_ops->init_spp;
>  
>  	if (!is_paging(vcpu)) {
>  		context->nx = false;
> @@ -5309,6 +5467,8 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
>  		uint i;
>  
>  		vcpu->arch.mmu->root_hpa = INVALID_PAGE;
> +		if (!vcpu->kvm->arch.spp_active)
> +			vcpu->arch.mmu->sppt_root = INVALID_PAGE;
>  
>  		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
>  			vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index a0753a36155d..855d02dd94c7 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -2847,11 +2847,17 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
>  	return eptp;
>  }
>  
> +static inline u64 construct_spptp(unsigned long root_hpa)
> +{
> +	return root_hpa & PAGE_MASK;
> +}
> +
>  void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
>  {
>  	struct kvm *kvm = vcpu->kvm;
>  	unsigned long guest_cr3;
>  	u64 eptp;
> +	u64 spptp;
>  
>  	guest_cr3 = cr3;
>  	if (enable_ept) {
> @@ -2874,6 +2880,12 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
>  		ept_load_pdptrs(vcpu);
>  	}
>  
> +	if (kvm->arch.spp_active && VALID_PAGE(vcpu->arch.mmu->sppt_root)) {
> +		spptp = construct_spptp(vcpu->arch.mmu->sppt_root);
> +		vmcs_write64(SPPT_POINTER, spptp);
> +		vmx_flush_tlb(vcpu, true);
> +	}
> +
>  	vmcs_writel(GUEST_CR3, guest_cr3);
>  }
>  
> @@ -5735,6 +5747,9 @@ void dump_vmcs(void)
>  		pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
>  	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
>  		pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
> +	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_SPP))
> +		pr_err("SPPT pointer = 0x%016llx\n", vmcs_read64(SPPT_POINTER));
> +
>  	n = vmcs_read32(CR3_TARGET_COUNT);
>  	for (i = 0; i + 1 < n; i += 4)
>  		pr_err("CR3 target%u=%016lx target%u=%016lx\n",
> @@ -7546,6 +7561,12 @@ static __init int hardware_setup(void)
>  		kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
>  	}
>  
> +	if (!spp_supported) {
> +		kvm_x86_ops->get_subpages = NULL;
> +		kvm_x86_ops->set_subpages = NULL;
> +		kvm_x86_ops->init_spp = NULL;
> +	}
> +
>  	if (!cpu_has_vmx_preemption_timer())
>  		kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
>  
> @@ -7592,6 +7613,28 @@ static __exit void hardware_unsetup(void)
>  	free_kvm_area();
>  }
>  
> +static bool vmx_get_spp_status(void)
> +{
> +	return spp_supported;
> +}
> +
> +static int vmx_get_subpages(struct kvm *kvm,
> +			    struct kvm_subpage *spp_info)
> +{
> +	return kvm_get_subpages(kvm, spp_info);
> +}
> +
> +static int vmx_set_subpages(struct kvm *kvm,
> +			    struct kvm_subpage *spp_info)
> +{
> +	return kvm_set_subpages(kvm, spp_info);
> +}
> +
> +static int vmx_init_spp(struct kvm *kvm)
> +{
> +	return kvm_init_spp(kvm);
> +}
> +
>  static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>  	.cpu_has_kvm_support = cpu_has_kvm_support,
>  	.disabled_by_bios = vmx_disabled_by_bios,
> @@ -7740,6 +7783,11 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>  	.get_vmcs12_pages = NULL,
>  	.nested_enable_evmcs = NULL,
>  	.need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
> +
> +	.get_spp_status = vmx_get_spp_status,
> +	.get_subpages = vmx_get_subpages,
> +	.set_subpages = vmx_set_subpages,
> +	.init_spp = vmx_init_spp,
>  };

There's no need for the get_subpages kvm_x86_ops.  You do need init_spp
of course, but you do not need get_spp_status either; instead you can
check if init_spp is NULL (if NULL, SPP is not supported).

In addition, kvm_mmu_set_subpages should not set up the SPP pages.  This
should be handled entirely by handle_spp when the processor reports an
SPPT miss, so you do not need that callback either.  You may need a
flush_subpages callback, called by kvm_mmu_set_subpages to clear the SPP
page tables for a given GPA range.  The next access then will cause an
SPPT miss.

Finally, please move all SPP-related code to arch/x86/kvm/vmx/{spp.c,spp.h}.

Thanks,

Paolo



>  static void vmx_cleanup_l1d_flush(void)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 26e4c574731c..c7cb17941344 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -4589,6 +4589,44 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
>  	return r;
>  }
>  
> +int kvm_get_subpages(struct kvm *kvm,
> +		     struct kvm_subpage *spp_info)
> +{
> +	int ret;
> +
> +	mutex_lock(&kvm->slots_lock);
> +	ret = kvm_mmu_get_subpages(kvm, spp_info, false);
> +	mutex_unlock(&kvm->slots_lock);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(kvm_get_subpages);
> +
> +int kvm_set_subpages(struct kvm *kvm,
> +		     struct kvm_subpage *spp_info)
> +{
> +	int ret;
> +
> +	mutex_lock(&kvm->slots_lock);
> +	ret = kvm_mmu_set_subpages(kvm, spp_info, false);
> +	mutex_unlock(&kvm->slots_lock);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(kvm_set_subpages);
> +
> +int kvm_init_spp(struct kvm *kvm)
> +{
> +	int ret;
> +
> +	mutex_lock(&kvm->slots_lock);
> +	ret = kvm_mmu_init_spp(kvm);
> +	mutex_unlock(&kvm->slots_lock);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(kvm_init_spp);
> +
>  long kvm_arch_vm_ioctl(struct file *filp,
>  		       unsigned int ioctl, unsigned long arg)
>  {
> @@ -9349,6 +9387,8 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
>  	}
>  
>  	kvm_page_track_free_memslot(free, dont);
> +	if (kvm->arch.spp_active)
> +	      kvm_subpage_free_memslot(free, dont);
>  }
>  
>  int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 7f904d4d788c..da30fcbb2727 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -849,7 +849,9 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
>  
>  struct kvm_mmu_page *kvm_mmu_get_spp_page(struct kvm_vcpu *vcpu,
>  			gfn_t gfn, unsigned int level);
> -
> +int kvm_get_subpages(struct kvm *kvm, struct kvm_subpage *spp_info);
> +int kvm_set_subpages(struct kvm *kvm, struct kvm_subpage *spp_info);
> +int kvm_init_spp(struct kvm *kvm);
>  #ifndef __KVM_HAVE_ARCH_VM_ALLOC
>  /*
>   * All architectures that want to use vzalloc currently also
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 6d4ea4b6c922..2c75a87ab3b5 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -102,6 +102,15 @@ struct kvm_userspace_memory_region {
>  	__u64 userspace_addr; /* start of the userspace allocated memory */
>  };
>  
> +/* for KVM_SUBPAGES_GET_ACCESS and KVM_SUBPAGES_SET_ACCESS */
> +#define SUBPAGE_MAX_BITMAP   64
> +struct kvm_subpage {
> +	__u64 base_gfn;
> +	__u64 npages;
> +	 /* sub-page write-access bitmap array */
> +	__u32 access_map[SUBPAGE_MAX_BITMAP];
> +};
> +
>  /*
>   * The bit 0 ~ bit 15 of kvm_memory_region::flags are visible for userspace,
>   * other bits are reserved for kvm internal use which are defined in
>
Paolo Bonzini Aug. 19, 2019, 3:15 p.m. UTC | #13
On 19/08/19 04:08, Yang Weijiang wrote:
>> KVM_GET_NESTED_STATE has the requested information. If
>> data.vmx.vmxon_pa is anything other than -1, then the vCPU is in VMX
>> operation. If (flags & KVM_STATE_NESTED_GUEST_MODE), then L2 is
>> active.
> Thanks Jim, I'll reference the code and make necessary change in next
> SPP patch release.

Since SPP will not be used very much in the beginning, it would be
simplest to enable it only if nested==0.

Paolo
Yang Weijiang Aug. 20, 2019, 12:33 p.m. UTC | #14
On Mon, Aug 19, 2019 at 05:15:01PM +0200, Paolo Bonzini wrote:
> On 19/08/19 04:08, Yang Weijiang wrote:
> >> KVM_GET_NESTED_STATE has the requested information. If
> >> data.vmx.vmxon_pa is anything other than -1, then the vCPU is in VMX
> >> operation. If (flags & KVM_STATE_NESTED_GUEST_MODE), then L2 is
> >> active.
> > Thanks Jim, I'll reference the code and make necessary change in next
> > SPP patch release.
> 
> Since SPP will not be used very much in the beginning, it would be
> simplest to enable it only if nested==0.
> 
> Paolo
Thanks Paolo! Sure, will make the change and document it.
Yang Weijiang Aug. 20, 2019, 12:36 p.m. UTC | #15
On Mon, Aug 19, 2019 at 05:05:22PM +0200, Paolo Bonzini wrote:
> On 14/08/19 09:03, Yang Weijiang wrote:
> > +
> > +int kvm_mmu_get_subpages(struct kvm *kvm, struct kvm_subpage *spp_info,
> > +			 bool mmu_locked)
> > +{
> > +	u32 *access = spp_info->access_map;
> > +	gfn_t gfn = spp_info->base_gfn;
> > +	int npages = spp_info->npages;
> > +	struct kvm_memory_slot *slot;
> > +	int i;
> > +	int ret;
> > +
> > +	if (!kvm->arch.spp_active)
> > +	      return -ENODEV;
> > +
> > +	if (!mmu_locked)
> > +	      spin_lock(&kvm->mmu_lock);
> > +
> 
> Do not add this argument.  Just lock mmu_lock in the callers.
> 
> Paolo
OK, will remove it, thanks!
Yang Weijiang Aug. 20, 2019, 1:09 p.m. UTC | #16
On Mon, Aug 19, 2019 at 05:13:47PM +0200, Paolo Bonzini wrote:
> On 14/08/19 09:03, Yang Weijiang wrote:
> >  static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
> >  	.cpu_has_kvm_support = cpu_has_kvm_support,
> >  	.disabled_by_bios = vmx_disabled_by_bios,
> > @@ -7740,6 +7783,11 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
> >  	.get_vmcs12_pages = NULL,
> >  	.nested_enable_evmcs = NULL,
> >  	.need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
> > +
> > +	.get_spp_status = vmx_get_spp_status,
> > +	.get_subpages = vmx_get_subpages,
> > +	.set_subpages = vmx_set_subpages,
> > +	.init_spp = vmx_init_spp,
> >  };
> 
> There's no need for the get_subpages kvm_x86_ops.  You do need init_spp
> of course, but you do not need get_spp_status either; instead you can
> check if init_spp is NULL (if NULL, SPP is not supported).
So first set .init_spp = NULL, then if all SPP preconditions meet, set
.init_spp = vmx_init_spp?

> In addition, kvm_mmu_set_subpages should not set up the SPP pages.  This
> should be handled entirely by handle_spp when the processor reports an
> SPPT miss, so you do not need that callback either.  You may need a
> flush_subpages callback, called by kvm_mmu_set_subpages to clear the SPP
> page tables for a given GPA range.  The next access then will cause an
> SPPT miss.
Good suggestion, thanks! Will do change.

> Finally, please move all SPP-related code to arch/x86/kvm/vmx/{spp.c,spp.h}.
>
Sure.

> Thanks,
> 
> Paolo
>

Patch
diff mbox series

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 44f6e1757861..5c4882015acc 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -398,8 +398,13 @@  struct kvm_mmu {
 	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
 	void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			   u64 *spte, const void *pte);
+	int (*get_subpages)(struct kvm *kvm, struct kvm_subpage *spp_info);
+	int (*set_subpages)(struct kvm *kvm, struct kvm_subpage *spp_info);
+	int (*init_spp)(struct kvm *kvm);
+
 	hpa_t root_hpa;
 	gpa_t root_cr3;
+	hpa_t sppt_root;
 	union kvm_mmu_role mmu_role;
 	u8 root_level;
 	u8 shadow_root_level;
@@ -927,6 +932,8 @@  struct kvm_arch {
 
 	bool guest_can_read_msr_platform_info;
 	bool exception_payload_enabled;
+
+	bool spp_active;
 };
 
 struct kvm_vm_stat {
@@ -1199,6 +1206,11 @@  struct kvm_x86_ops {
 	uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu);
 
 	bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu);
+
+	bool (*get_spp_status)(void);
+	int (*get_subpages)(struct kvm *kvm, struct kvm_subpage *spp_info);
+	int (*set_subpages)(struct kvm *kvm, struct kvm_subpage *spp_info);
+	int (*init_spp)(struct kvm *kvm);
 };
 
 struct kvm_arch_async_pf {
@@ -1417,6 +1429,12 @@  void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
 void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush);
 
+int kvm_mmu_get_subpages(struct kvm *kvm, struct kvm_subpage *spp_info,
+			 bool mmu_locked);
+int kvm_mmu_set_subpages(struct kvm *kvm, struct kvm_subpage *spp_info,
+			 bool mmu_locked);
+int kvm_mmu_init_spp(struct kvm *kvm);
+
 void kvm_enable_tdp(void);
 void kvm_disable_tdp(void);
 
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index a2c9e18e0ad7..6cb05ac07453 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -224,6 +224,8 @@  enum vmcs_field {
 	XSS_EXIT_BITMAP_HIGH            = 0x0000202D,
 	ENCLS_EXITING_BITMAP		= 0x0000202E,
 	ENCLS_EXITING_BITMAP_HIGH	= 0x0000202F,
+	SPPT_POINTER			= 0x00002030,
+	SPPT_POINTER_HIGH		= 0x00002031,
 	TSC_MULTIPLIER                  = 0x00002032,
 	TSC_MULTIPLIER_HIGH             = 0x00002033,
 	GUEST_PHYSICAL_ADDRESS          = 0x00002400,
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 16e390fd6e58..16d3ca544b67 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3756,6 +3756,9 @@  void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 		    (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
 			mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
 					   &invalid_list);
+			if (vcpu->kvm->arch.spp_active)
+				mmu_free_root_page(vcpu->kvm, &mmu->sppt_root,
+						   &invalid_list);
 		} else {
 			for (i = 0; i < 4; ++i)
 				if (mmu->pae_root[i] != 0)
@@ -4414,6 +4417,158 @@  int kvm_mmu_setup_spp_structure(struct kvm_vcpu *vcpu,
 	return ret;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_setup_spp_structure);
+
+int kvm_mmu_init_spp(struct kvm *kvm)
+{
+	int i, ret;
+	struct kvm_vcpu *vcpu;
+	int root_level;
+	struct kvm_mmu_page *ssp_sp;
+
+
+	if (!kvm_x86_ops->get_spp_status())
+	      return -ENODEV;
+
+	if (kvm->arch.spp_active)
+	      return 0;
+
+	ret = kvm_subpage_create_bitmaps(kvm);
+
+	if (ret)
+	      return ret;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		/* prepare caches for SPP setup.*/
+		mmu_topup_memory_caches(vcpu);
+		root_level = vcpu->arch.mmu->shadow_root_level;
+		ssp_sp = kvm_mmu_get_spp_page(vcpu, 0, root_level);
+		++ssp_sp->root_count;
+		vcpu->arch.mmu->sppt_root = __pa(ssp_sp->spt);
+		kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
+	}
+
+	kvm->arch.spp_active = true;
+	return 0;
+}
+
+int kvm_mmu_get_subpages(struct kvm *kvm, struct kvm_subpage *spp_info,
+			 bool mmu_locked)
+{
+	u32 *access = spp_info->access_map;
+	gfn_t gfn = spp_info->base_gfn;
+	int npages = spp_info->npages;
+	struct kvm_memory_slot *slot;
+	int i;
+	int ret;
+
+	if (!kvm->arch.spp_active)
+	      return -ENODEV;
+
+	if (!mmu_locked)
+	      spin_lock(&kvm->mmu_lock);
+
+	for (i = 0; i < npages; i++, gfn++) {
+		slot = gfn_to_memslot(kvm, gfn);
+		if (!slot) {
+			ret = -EFAULT;
+			goto out_unlock;
+		}
+		access[i] = *gfn_to_subpage_wp_info(slot, gfn);
+	}
+
+	ret = i;
+
+out_unlock:
+	if (!mmu_locked)
+	      spin_unlock(&kvm->mmu_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_get_subpages);
+
+int kvm_mmu_set_subpages(struct kvm *kvm, struct kvm_subpage *spp_info,
+			 bool mmu_locked)
+{
+	u32 *access = spp_info->access_map;
+	gfn_t gfn = spp_info->base_gfn;
+	int npages = spp_info->npages;
+	struct kvm_memory_slot *slot;
+	struct kvm_vcpu *vcpu;
+	struct kvm_rmap_head *rmap_head;
+	int i, k;
+	u32 *wp_map;
+	int ret = -EFAULT;
+
+	if (!kvm->arch.spp_active)
+		return -ENODEV;
+
+	if (!mmu_locked)
+	      spin_lock(&kvm->mmu_lock);
+
+	for (i = 0; i < npages; i++, gfn++) {
+		slot = gfn_to_memslot(kvm, gfn);
+		if (!slot)
+			goto out_unlock;
+
+		/*
+		 * check whether the target 4KB page exists in EPT leaf
+		 * entries.If it's there, we can setup SPP protection now,
+		 * otherwise, need to defer it to EPT page fault handler.
+		 */
+		rmap_head = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot);
+
+		if (rmap_head->val) {
+			/*
+			 * if all subpages are not writable, open SPP bit in
+			 * EPT leaf entry to enable SPP protection for
+			 * corresponding page.
+			 */
+			if (access[i] != FULL_SPP_ACCESS) {
+				ret = kvm_mmu_open_subpage_write_protect(kvm,
+						slot, gfn);
+
+				if (ret)
+					goto out_err;
+
+				kvm_for_each_vcpu(k, vcpu, kvm)
+					kvm_mmu_setup_spp_structure(vcpu,
+						access[i], gfn);
+			} else {
+				ret = kvm_mmu_clear_subpage_write_protect(kvm,
+						slot, gfn);
+				if (ret)
+					goto out_err;
+			}
+
+		} else
+			pr_info("%s - No ETP entry, gfn = 0x%llx, access = 0x%x.\n", __func__, gfn, access[i]);
+
+		/* if this function is called in tdp_page_fault() or
+		 * spp_handler(), mmu_locked = true, SPP access bitmap
+		 * is being used, otherwise, it's being stored.
+		 */
+		if (!mmu_locked) {
+			wp_map = gfn_to_subpage_wp_info(slot, gfn);
+			*wp_map = access[i];
+		}
+	}
+
+	ret = i;
+out_err:
+	if (ret < 0)
+	      pr_info("SPP-Error, didn't get the gfn:" \
+		      "%llx from EPT leaf.\n"
+		      "Current we don't support SPP on" \
+		      "huge page.\n"
+		      "Please disable huge page and have" \
+		      "another try.\n", gfn);
+out_unlock:
+	if (!mmu_locked)
+	      spin_unlock(&kvm->mmu_lock);
+
+	return ret;
+}
+
 static void nonpaging_init_context(struct kvm_vcpu *vcpu,
 				   struct kvm_mmu *context)
 {
@@ -5104,6 +5259,9 @@  static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->get_cr3 = get_cr3;
 	context->get_pdptr = kvm_pdptr_read;
 	context->inject_page_fault = kvm_inject_page_fault;
+	context->get_subpages = kvm_x86_ops->get_subpages;
+	context->set_subpages = kvm_x86_ops->set_subpages;
+	context->init_spp = kvm_x86_ops->init_spp;
 
 	if (!is_paging(vcpu)) {
 		context->nx = false;
@@ -5309,6 +5467,8 @@  void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
 		uint i;
 
 		vcpu->arch.mmu->root_hpa = INVALID_PAGE;
+		if (!vcpu->kvm->arch.spp_active)
+			vcpu->arch.mmu->sppt_root = INVALID_PAGE;
 
 		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
 			vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index a0753a36155d..855d02dd94c7 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2847,11 +2847,17 @@  u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
 	return eptp;
 }
 
+static inline u64 construct_spptp(unsigned long root_hpa)
+{
+	return root_hpa & PAGE_MASK;
+}
+
 void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
 	struct kvm *kvm = vcpu->kvm;
 	unsigned long guest_cr3;
 	u64 eptp;
+	u64 spptp;
 
 	guest_cr3 = cr3;
 	if (enable_ept) {
@@ -2874,6 +2880,12 @@  void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 		ept_load_pdptrs(vcpu);
 	}
 
+	if (kvm->arch.spp_active && VALID_PAGE(vcpu->arch.mmu->sppt_root)) {
+		spptp = construct_spptp(vcpu->arch.mmu->sppt_root);
+		vmcs_write64(SPPT_POINTER, spptp);
+		vmx_flush_tlb(vcpu, true);
+	}
+
 	vmcs_writel(GUEST_CR3, guest_cr3);
 }
 
@@ -5735,6 +5747,9 @@  void dump_vmcs(void)
 		pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
 	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
 		pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
+	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_SPP))
+		pr_err("SPPT pointer = 0x%016llx\n", vmcs_read64(SPPT_POINTER));
+
 	n = vmcs_read32(CR3_TARGET_COUNT);
 	for (i = 0; i + 1 < n; i += 4)
 		pr_err("CR3 target%u=%016lx target%u=%016lx\n",
@@ -7546,6 +7561,12 @@  static __init int hardware_setup(void)
 		kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
 	}
 
+	if (!spp_supported) {
+		kvm_x86_ops->get_subpages = NULL;
+		kvm_x86_ops->set_subpages = NULL;
+		kvm_x86_ops->init_spp = NULL;
+	}
+
 	if (!cpu_has_vmx_preemption_timer())
 		kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
 
@@ -7592,6 +7613,28 @@  static __exit void hardware_unsetup(void)
 	free_kvm_area();
 }
 
+static bool vmx_get_spp_status(void)
+{
+	return spp_supported;
+}
+
+static int vmx_get_subpages(struct kvm *kvm,
+			    struct kvm_subpage *spp_info)
+{
+	return kvm_get_subpages(kvm, spp_info);
+}
+
+static int vmx_set_subpages(struct kvm *kvm,
+			    struct kvm_subpage *spp_info)
+{
+	return kvm_set_subpages(kvm, spp_info);
+}
+
+static int vmx_init_spp(struct kvm *kvm)
+{
+	return kvm_init_spp(kvm);
+}
+
 static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
@@ -7740,6 +7783,11 @@  static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.get_vmcs12_pages = NULL,
 	.nested_enable_evmcs = NULL,
 	.need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
+
+	.get_spp_status = vmx_get_spp_status,
+	.get_subpages = vmx_get_subpages,
+	.set_subpages = vmx_set_subpages,
+	.init_spp = vmx_init_spp,
 };
 
 static void vmx_cleanup_l1d_flush(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 26e4c574731c..c7cb17941344 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4589,6 +4589,44 @@  int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 	return r;
 }
 
+int kvm_get_subpages(struct kvm *kvm,
+		     struct kvm_subpage *spp_info)
+{
+	int ret;
+
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_mmu_get_subpages(kvm, spp_info, false);
+	mutex_unlock(&kvm->slots_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_get_subpages);
+
+int kvm_set_subpages(struct kvm *kvm,
+		     struct kvm_subpage *spp_info)
+{
+	int ret;
+
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_mmu_set_subpages(kvm, spp_info, false);
+	mutex_unlock(&kvm->slots_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_set_subpages);
+
+int kvm_init_spp(struct kvm *kvm)
+{
+	int ret;
+
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_mmu_init_spp(kvm);
+	mutex_unlock(&kvm->slots_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_init_spp);
+
 long kvm_arch_vm_ioctl(struct file *filp,
 		       unsigned int ioctl, unsigned long arg)
 {
@@ -9349,6 +9387,8 @@  void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 	}
 
 	kvm_page_track_free_memslot(free, dont);
+	if (kvm->arch.spp_active)
+	      kvm_subpage_free_memslot(free, dont);
 }
 
 int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7f904d4d788c..da30fcbb2727 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -849,7 +849,9 @@  int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
 
 struct kvm_mmu_page *kvm_mmu_get_spp_page(struct kvm_vcpu *vcpu,
 			gfn_t gfn, unsigned int level);
-
+int kvm_get_subpages(struct kvm *kvm, struct kvm_subpage *spp_info);
+int kvm_set_subpages(struct kvm *kvm, struct kvm_subpage *spp_info);
+int kvm_init_spp(struct kvm *kvm);
 #ifndef __KVM_HAVE_ARCH_VM_ALLOC
 /*
  * All architectures that want to use vzalloc currently also
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 6d4ea4b6c922..2c75a87ab3b5 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -102,6 +102,15 @@  struct kvm_userspace_memory_region {
 	__u64 userspace_addr; /* start of the userspace allocated memory */
 };
 
+/* for KVM_SUBPAGES_GET_ACCESS and KVM_SUBPAGES_SET_ACCESS */
+#define SUBPAGE_MAX_BITMAP   64
+struct kvm_subpage {
+	__u64 base_gfn;
+	__u64 npages;
+	 /* sub-page write-access bitmap array */
+	__u32 access_map[SUBPAGE_MAX_BITMAP];
+};
+
 /*
  * The bit 0 ~ bit 15 of kvm_memory_region::flags are visible for userspace,
  * other bits are reserved for kvm internal use which are defined in