diff mbox series

[RESEND,v10,06/10] vmx: spp: Set up SPP paging table at vmentry/vmexit

Message ID 20200102061319.10077-7-weijiang.yang@intel.com (mailing list archive)
State New, archived
Headers show
Series Enable Sub-Page Write Protection Support | expand

Commit Message

Yang, Weijiang Jan. 2, 2020, 6:13 a.m. UTC
If write to subpage is not allowed, EPT violation generates
and it's handled in fast_page_fault().

In current implementation, SPPT setup is only handled in handle_spp()
vmexit handler, it's triggered when SPP bit is set in EPT leaf
entry while SPPT entries are not ready.

A SPP specific bit(11) is added to exit_qualification and a new
exit reason(66) is introduced for SPP.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Co-developed-by: He Chen <he.chen@linux.intel.com>
Signed-off-by: He Chen <he.chen@linux.intel.com>
Co-developed-by: Zhang Yi <yi.z.zhang@linux.intel.com>
Signed-off-by: Zhang Yi <yi.z.zhang@linux.intel.com>
Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
---
 arch/x86/include/asm/vmx.h      |  9 ++++
 arch/x86/include/uapi/asm/vmx.h |  2 +
 arch/x86/kvm/mmu/mmu.c          | 69 +++++++++++++++++++++++++---
 arch/x86/kvm/mmu/spp.c          | 13 ++++++
 arch/x86/kvm/mmu/spp.h          |  2 +
 arch/x86/kvm/trace.h            | 66 +++++++++++++++++++++++++++
 arch/x86/kvm/vmx/vmx.c          | 80 ++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c              |  4 ++
 include/uapi/linux/kvm.h        |  6 +++
 9 files changed, 244 insertions(+), 7 deletions(-)

Comments

Sean Christopherson Jan. 10, 2020, 5:55 p.m. UTC | #1
On Thu, Jan 02, 2020 at 02:13:15PM +0800, Yang Weijiang wrote:
> If write to subpage is not allowed, EPT violation generates
> and it's handled in fast_page_fault().
> 
> In current implementation, SPPT setup is only handled in handle_spp()
> vmexit handler, it's triggered when SPP bit is set in EPT leaf
> entry while SPPT entries are not ready.
> 
> A SPP specific bit(11) is added to exit_qualification and a new
> exit reason(66) is introduced for SPP.

...

> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6f92b40d798c..c41791ebee65 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -6372,6 +6427,8 @@ unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
>  	return nr_mmu_pages;
>  }
>  
> +#include "spp.c"
> +

Unless there is a *very* good reason for these shenanigans, spp.c needs
to built via the Makefile like any other source.  If this is justified
for whatever reason, then that justification needs to be very clearly
stated in the changelog.

In general, the code organization of this entire series likely needs to
be overhauled.  There are gobs exports which are either completely
unnecessary or completely backswards.

E.g. exporting VMX-only functions from spp.c, which presumably are only
callbed by VMX.

	EXPORT_SYMBOL_GPL(vmx_spp_flush_sppt);
	EXPORT_SYMBOL_GPL(vmx_spp_init);

Exporting ioctl helpers from the same file, which are presumably called
only from x86.c.

	EXPORT_SYMBOL_GPL(kvm_vm_ioctl_get_subpages);
	EXPORT_SYMBOL_GPL(kvm_vm_ioctl_set_subpages);
Sean Christopherson Jan. 10, 2020, 6:04 p.m. UTC | #2
On Thu, Jan 02, 2020 at 02:13:15PM +0800, Yang Weijiang wrote:
> @@ -3585,7 +3602,30 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
>  		if ((error_code & PFERR_WRITE_MASK) &&
>  		    spte_can_locklessly_be_made_writable(spte))
>  		{
> -			new_spte |= PT_WRITABLE_MASK;
> +			/*
> +			 * Record write protect fault caused by
> +			 * Sub-page Protection, let VMI decide
> +			 * the next step.
> +			 */
> +			if (spte & PT_SPP_MASK) {
> +				int len = kvm_x86_ops->get_inst_len(vcpu);

There's got to be a better way to handle SPP exits than adding a helper
to retrieve the instruction length.

> +
> +				fault_handled = true;
> +				vcpu->run->exit_reason = KVM_EXIT_SPP;
> +				vcpu->run->spp.addr = gva;
> +				vcpu->run->spp.ins_len = len;

s/ins_len/insn_len to be consistent with other KVM nomenclature.

> +				trace_kvm_spp_induced_page_fault(vcpu,
> +								 gva,
> +								 len);
> +				break;
> +			}
> +
> +			if (was_spp_armed(new_spte)) {
> +				restore_spp_bit(&new_spte);
> +				spp_protected = true;
> +			} else {
> +				new_spte |= PT_WRITABLE_MASK;
> +			}
>  
>  			/*
>  			 * Do not fix write-permission on the large spte.  Since

...

> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 24e4e1c47f42..97d862c79124 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -200,7 +200,6 @@ static const struct {
>  	[VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
>  	[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
>  };
> -

Spurious whitepsace.

>  #define L1D_CACHE_ORDER 4
>  static void *vmx_l1d_flush_pages;
>  
> @@ -2999,6 +2998,7 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
>  	bool update_guest_cr3 = true;
>  	unsigned long guest_cr3;
>  	u64 eptp;
> +	u64 spptp;
>  
>  	guest_cr3 = cr3;
>  	if (enable_ept) {
> @@ -3027,6 +3027,12 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
>  
>  	if (update_guest_cr3)
>  		vmcs_writel(GUEST_CR3, guest_cr3);
> +
> +	if (kvm->arch.spp_active && VALID_PAGE(vcpu->kvm->arch.sppt_root)) {
> +		spptp = construct_spptp(vcpu->kvm->arch.sppt_root);
> +		vmcs_write64(SPPT_POINTER, spptp);
> +		vmx_flush_tlb(vcpu, true);

Why is SPP so special that it gets to force TLB flushes?

> +	}
>  }
>  
>  int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
> @@ -5361,6 +5367,74 @@ static int handle_monitor_trap(struct kvm_vcpu *vcpu)
>  	return 1;
>  }
>  
> +int handle_spp(struct kvm_vcpu *vcpu)

Can be static.

> +{
> +	unsigned long exit_qualification;
> +	struct kvm_memory_slot *slot;
> +	gpa_t gpa;
> +	gfn_t gfn;
> +
> +	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> +
> +	/*
> +	 * SPP VM exit happened while executing iret from NMI,
> +	 * "blocked by NMI" bit has to be set before next VM entry.
> +	 * There are errata that may cause this bit to not be set:
> +	 * AAK134, BY25.
> +	 */
> +	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
> +	    (exit_qualification & SPPT_INTR_INFO_UNBLOCK_NMI))
> +		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
> +			      GUEST_INTR_STATE_NMI);
> +
> +	vcpu->arch.exit_qualification = exit_qualification;

	if (WARN_ON(!(exit_qualification & SPPT_INDUCED_EXIT_TYPE)))
		goto out_err;

	<handle spp exit>

	return 1;

out_err:
	vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
	vcpu->run->hw.hardware_exit_reason = EXIT_REASON_SPP;
	return 0;

> +	if (exit_qualification & SPPT_INDUCED_EXIT_TYPE) {
> +		int page_num = KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL);

The compiler is probably clever enough to make these constants, but if
this logic is a fundamental property of SPP then it should be defined as
a macro somewhere.

> +		u32 *access;
> +		gfn_t gfn_max;
> +
> +		/*
> +		 * SPPT missing
> +		 * We don't set SPP write access for the corresponding
> +		 * GPA, if we haven't setup, we need to construct
> +		 * SPP table here.
> +		 */
> +		gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
> +		gfn = gpa >> PAGE_SHIFT;

gpa_to_gfn()

> +		trace_kvm_spp_induced_exit(vcpu, gpa, exit_qualification);
> +		/*
> +		 * In level 1 of SPPT, there's no PRESENT bit, all data is
> +		 * regarded as permission vector, so need to check from
> +		 * level 2 to set up the vector if target page is protected.
> +		 */
> +		spin_lock(&vcpu->kvm->mmu_lock);
> +		gfn &= ~(page_num - 1);



> +		gfn_max = gfn + page_num - 1;

s/gfn_max/gfn_end

> +		for (; gfn <= gfn_max; gfn++) {

My preference would be to do:
		gfn_end = gfn + page_num;

		for ( ; gfn < gfn_end; gfn++)

> +			slot = gfn_to_memslot(vcpu->kvm, gfn);
> +			if (!slot)
> +				continue;
> +			access = gfn_to_subpage_wp_info(slot, gfn);
> +			if (access && *access != FULL_SPP_ACCESS)
> +				kvm_spp_setup_structure(vcpu,
> +							*access,
> +							gfn);
> +		}
> +		spin_unlock(&vcpu->kvm->mmu_lock);
> +		return 1;
> +	}
> +	/*
> +	 * SPPT Misconfig
> +	 * This is probably caused by some mis-configuration in SPPT
> +	 * entries, cannot handle it here, escalate the fault to
> +	 * emulator.
> +	 */
> +	WARN_ON(1);
> +	vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
> +	vcpu->run->hw.hardware_exit_reason = EXIT_REASON_SPP;
> +	return 0;
> +}
> +
>  static int handle_monitor(struct kvm_vcpu *vcpu)
>  {
>  	printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
> @@ -5575,6 +5649,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
>  	[EXIT_REASON_INVVPID]                 = handle_vmx_instruction,
>  	[EXIT_REASON_RDRAND]                  = handle_invalid_op,
>  	[EXIT_REASON_RDSEED]                  = handle_invalid_op,
> +	[EXIT_REASON_SPP]                     = handle_spp,
>  	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
>  	[EXIT_REASON_INVPCID]                 = handle_invpcid,
>  	[EXIT_REASON_VMFUNC]		      = handle_vmx_instruction,
> @@ -5807,6 +5882,9 @@ void dump_vmcs(void)
>  		pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
>  	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
>  		pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
> +	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_SPP))
> +		pr_err("SPPT pointer = 0x%016llx\n", vmcs_read64(SPPT_POINTER));
> +
>  	n = vmcs_read32(CR3_TARGET_COUNT);
>  	for (i = 0; i + 1 < n; i += 4)
>  		pr_err("CR3 target%u=%016lx target%u=%016lx\n",
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index fb7da000ceaf..a9d7fc21dad6 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -9782,6 +9782,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
>  	}
>  
>  	kvm_page_track_free_memslot(free, dont);
> +	kvm_spp_free_memslot(free, dont);
>  }
>  
>  int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
> @@ -10406,3 +10407,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
>  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
>  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
>  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
> +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_spp_set_subpages);
> +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_spp_induced_exit);
> +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_spp_induced_page_fault);
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 09e5e8e6e6dd..c0f3162ee46a 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -244,6 +244,7 @@ struct kvm_hyperv_exit {
>  #define KVM_EXIT_IOAPIC_EOI       26
>  #define KVM_EXIT_HYPERV           27
>  #define KVM_EXIT_ARM_NISV         28
> +#define KVM_EXIT_SPP              29
>  
>  /* For KVM_EXIT_INTERNAL_ERROR */
>  /* Emulate instruction failed. */
> @@ -401,6 +402,11 @@ struct kvm_run {
>  		struct {
>  			__u8 vector;
>  		} eoi;
> +		/* KVM_EXIT_SPP */
> +		struct {
> +			__u64 addr;
> +			__u8 ins_len;
> +		} spp;
>  		/* KVM_EXIT_HYPERV */
>  		struct kvm_hyperv_exit hyperv;
>  		/* KVM_EXIT_ARM_NISV */
> -- 
> 2.17.2
>
Yang, Weijiang Jan. 13, 2020, 6:50 a.m. UTC | #3
On Fri, Jan 10, 2020 at 09:55:37AM -0800, Sean Christopherson wrote:
> On Thu, Jan 02, 2020 at 02:13:15PM +0800, Yang Weijiang wrote:
> > If write to subpage is not allowed, EPT violation generates
> > and it's handled in fast_page_fault().
> > 
> > In current implementation, SPPT setup is only handled in handle_spp()
> > vmexit handler, it's triggered when SPP bit is set in EPT leaf
> > entry while SPPT entries are not ready.
> > 
> > A SPP specific bit(11) is added to exit_qualification and a new
> > exit reason(66) is introduced for SPP.
> 
> ...
> 
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 6f92b40d798c..c41791ebee65 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -6372,6 +6427,8 @@ unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
> >  	return nr_mmu_pages;
> >  }
> >  
> > +#include "spp.c"
> > +
> 
> Unless there is a *very* good reason for these shenanigans, spp.c needs
> to built via the Makefile like any other source.  If this is justified
> for whatever reason, then that justification needs to be very clearly
> stated in the changelog.

Yes, it looks odd. When extracted the SPP code from mmu.c, I found a lot
of functions in mmu.c should be exposed so that spp.c can see them, I
took them as unnecessary modification to mmu.c, so just add the spp.c file back
to mmu.c, if you suggest change it with a seperate object file, I'll do it.

> 
> In general, the code organization of this entire series likely needs to
> be overhauled.  There are gobs exports which are either completely
> unnecessary or completely backswards.
> 
> E.g. exporting VMX-only functions from spp.c, which presumably are only
> callbed by VMX.
> 
> 	EXPORT_SYMBOL_GPL(vmx_spp_flush_sppt);
> 	EXPORT_SYMBOL_GPL(vmx_spp_init);
> 
> Exporting ioctl helpers from the same file, which are presumably called
> only from x86.c.
> 
> 	EXPORT_SYMBOL_GPL(kvm_vm_ioctl_get_subpages);
> 	EXPORT_SYMBOL_GPL(kvm_vm_ioctl_set_subpages);

Thanks for the suggestion, I'll go over the patches and or-organize them.
Yang, Weijiang Jan. 13, 2020, 8:10 a.m. UTC | #4
On Fri, Jan 10, 2020 at 10:04:59AM -0800, Sean Christopherson wrote:
> On Thu, Jan 02, 2020 at 02:13:15PM +0800, Yang Weijiang wrote:
> > @@ -3585,7 +3602,30 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
> >  		if ((error_code & PFERR_WRITE_MASK) &&
> >  		    spte_can_locklessly_be_made_writable(spte))
> >  		{
> > -			new_spte |= PT_WRITABLE_MASK;
> > +			/*
> > +			 * Record write protect fault caused by
> > +			 * Sub-page Protection, let VMI decide
> > +			 * the next step.
> > +			 */
> > +			if (spte & PT_SPP_MASK) {
> > +				int len = kvm_x86_ops->get_inst_len(vcpu);
> 
> There's got to be a better way to handle SPP exits than adding a helper
> to retrieve the instruction length.
>
The fault instruction was skipped by kvm_skip_emulated_instruction()
before, but Paolo suggested leave the re-do or skip option to user-space
to make it flexible for write protection or write tracking, so return
length to user-space.

> > +
> > +				fault_handled = true;
> > +				vcpu->run->exit_reason = KVM_EXIT_SPP;
> > +				vcpu->run->spp.addr = gva;
> > +				vcpu->run->spp.ins_len = len;
> 
> s/ins_len/insn_len to be consistent with other KVM nomenclature.
> 
OK.

> > +				trace_kvm_spp_induced_page_fault(vcpu,
> > +								 gva,
> > +								 len);
> > +				break;
> > +			}
> > +
> > +			if (was_spp_armed(new_spte)) {
> > +				restore_spp_bit(&new_spte);
> > +				spp_protected = true;
> > +			} else {
> > +				new_spte |= PT_WRITABLE_MASK;
> > +			}
> >  
> >  			/*
> >  			 * Do not fix write-permission on the large spte.  Since
> 
> ...
> 
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index 24e4e1c47f42..97d862c79124 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -200,7 +200,6 @@ static const struct {
> >  	[VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
> >  	[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
> >  };
> > -
> 
> Spurious whitepsace.
>
OK.

> >  #define L1D_CACHE_ORDER 4
> >  static void *vmx_l1d_flush_pages;
> >  
> > @@ -2999,6 +2998,7 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
> >  	bool update_guest_cr3 = true;
> >  	unsigned long guest_cr3;
> >  	u64 eptp;
> > +	u64 spptp;
> >  
> >  	guest_cr3 = cr3;
> >  	if (enable_ept) {
> > @@ -3027,6 +3027,12 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
> >  
> >  	if (update_guest_cr3)
> >  		vmcs_writel(GUEST_CR3, guest_cr3);
> > +
> > +	if (kvm->arch.spp_active && VALID_PAGE(vcpu->kvm->arch.sppt_root)) {
> > +		spptp = construct_spptp(vcpu->kvm->arch.sppt_root);
> > +		vmcs_write64(SPPT_POINTER, spptp);
> > +		vmx_flush_tlb(vcpu, true);
> 
> Why is SPP so special that it gets to force TLB flushes?
I double checked the code, there's a call to vmx_flush_tlb() in
mmu_load(), so it's unnecessary here, thank you!

> > +	}
> >  }
> >  
> >  int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
> > @@ -5361,6 +5367,74 @@ static int handle_monitor_trap(struct kvm_vcpu *vcpu)
> >  	return 1;
> >  }
> >  
> > +int handle_spp(struct kvm_vcpu *vcpu)
> 
> Can be static.
>
Thanks!

> > +{
> > +	unsigned long exit_qualification;
> > +	struct kvm_memory_slot *slot;
> > +	gpa_t gpa;
> > +	gfn_t gfn;
> > +
> > +	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> > +
> > +	/*
> > +	 * SPP VM exit happened while executing iret from NMI,
> > +	 * "blocked by NMI" bit has to be set before next VM entry.
> > +	 * There are errata that may cause this bit to not be set:
> > +	 * AAK134, BY25.
> > +	 */
> > +	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
> > +	    (exit_qualification & SPPT_INTR_INFO_UNBLOCK_NMI))
> > +		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
> > +			      GUEST_INTR_STATE_NMI);
> > +
> > +	vcpu->arch.exit_qualification = exit_qualification;
> 
> 	if (WARN_ON(!(exit_qualification & SPPT_INDUCED_EXIT_TYPE)))
> 		goto out_err;
> 
> 	<handle spp exit>
> 
> 	return 1;
> 
> out_err:
> 	vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
> 	vcpu->run->hw.hardware_exit_reason = EXIT_REASON_SPP;
> 	return 0;
>
Sure, will change it.

> > +	if (exit_qualification & SPPT_INDUCED_EXIT_TYPE) {
> > +		int page_num = KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL);
> 
> The compiler is probably clever enough to make these constants, but if
> this logic is a fundamental property of SPP then it should be defined as
> a macro somewhere.
>
OK, will change it.

> > +		u32 *access;
> > +		gfn_t gfn_max;
> > +
> > +		/*
> > +		 * SPPT missing
> > +		 * We don't set SPP write access for the corresponding
> > +		 * GPA, if we haven't setup, we need to construct
> > +		 * SPP table here.
> > +		 */
> > +		gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
> > +		gfn = gpa >> PAGE_SHIFT;
> 
> gpa_to_gfn()
>
OK.

> > +		trace_kvm_spp_induced_exit(vcpu, gpa, exit_qualification);
> > +		/*
> > +		 * In level 1 of SPPT, there's no PRESENT bit, all data is
> > +		 * regarded as permission vector, so need to check from
> > +		 * level 2 to set up the vector if target page is protected.
> > +		 */
> > +		spin_lock(&vcpu->kvm->mmu_lock);
> > +		gfn &= ~(page_num - 1);
> 
> 
> 
> > +		gfn_max = gfn + page_num - 1;
> 
> s/gfn_max/gfn_end
OK.

> 
> > +		for (; gfn <= gfn_max; gfn++) {
> 
> My preference would be to do:
> 		gfn_end = gfn + page_num;
> 
> 		for ( ; gfn < gfn_end; gfn++)
>
Thank you!
> > +			slot = gfn_to_memslot(vcpu->kvm, gfn);
Sean Christopherson Jan. 13, 2020, 5:33 p.m. UTC | #5
On Mon, Jan 13, 2020 at 04:10:50PM +0800, Yang Weijiang wrote:
> On Fri, Jan 10, 2020 at 10:04:59AM -0800, Sean Christopherson wrote:
> > On Thu, Jan 02, 2020 at 02:13:15PM +0800, Yang Weijiang wrote:
> > > @@ -3585,7 +3602,30 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
> > >  		if ((error_code & PFERR_WRITE_MASK) &&
> > >  		    spte_can_locklessly_be_made_writable(spte))
> > >  		{
> > > -			new_spte |= PT_WRITABLE_MASK;
> > > +			/*
> > > +			 * Record write protect fault caused by
> > > +			 * Sub-page Protection, let VMI decide
> > > +			 * the next step.
> > > +			 */
> > > +			if (spte & PT_SPP_MASK) {
> > > +				int len = kvm_x86_ops->get_inst_len(vcpu);
> > 
> > There's got to be a better way to handle SPP exits than adding a helper
> > to retrieve the instruction length.
> >
> The fault instruction was skipped by kvm_skip_emulated_instruction()
> before, but Paolo suggested leave the re-do or skip option to user-space
> to make it flexible for write protection or write tracking, so return
> length to user-space.

Sorry, my comment was unclear.  I have no objection to punting the fault
to userspace, it's the mechanics of how it's done that I dislike.

Specifically, (a) using run->exit_reason to propagate the SPP exit up the
stack, e.g. instead of modifying affected call stacks to play nice with
any exit to userspace, (b) assuming ->get_insn_len() will always be
accurate, e.g. see the various caveats in skip_emulated_instruction() for
both VMX and SVM, and (c) duplicating the state capture code in every
location that can encounter a SPP fault.

What I'm hoping is that it's possible to modify the call stacks to
explicitly propagate an exit to userspace and/or SPP fault, and shove all
the state capture into a common location, e.g. handle_ept_violation().

Side topic, assuming the userspace VMI is going to be instrospecting the
faulting instruction, won't it decode the instruction?  I.e. calculate
the instruction length anyways?
Adalbert Lazăr Jan. 13, 2020, 6:55 p.m. UTC | #6
On Mon, 13 Jan 2020 09:33:58 -0800, Sean Christopherson <sean.j.christopherson@intel.com> wrote:
> On Mon, Jan 13, 2020 at 04:10:50PM +0800, Yang Weijiang wrote:
> > On Fri, Jan 10, 2020 at 10:04:59AM -0800, Sean Christopherson wrote:
> > > On Thu, Jan 02, 2020 at 02:13:15PM +0800, Yang Weijiang wrote:
> > > > @@ -3585,7 +3602,30 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
> > > >  		if ((error_code & PFERR_WRITE_MASK) &&
> > > >  		    spte_can_locklessly_be_made_writable(spte))
> > > >  		{
> > > > -			new_spte |= PT_WRITABLE_MASK;
> > > > +			/*
> > > > +			 * Record write protect fault caused by
> > > > +			 * Sub-page Protection, let VMI decide
> > > > +			 * the next step.
> > > > +			 */
> > > > +			if (spte & PT_SPP_MASK) {
> > > > +				int len = kvm_x86_ops->get_inst_len(vcpu);
> > > 
> > > There's got to be a better way to handle SPP exits than adding a helper
> > > to retrieve the instruction length.
> > >
> > The fault instruction was skipped by kvm_skip_emulated_instruction()
> > before, but Paolo suggested leave the re-do or skip option to user-space
> > to make it flexible for write protection or write tracking, so return
> > length to user-space.
> 
> Sorry, my comment was unclear.  I have no objection to punting the fault
> to userspace, it's the mechanics of how it's done that I dislike.
> 
> Specifically, (a) using run->exit_reason to propagate the SPP exit up the
> stack, e.g. instead of modifying affected call stacks to play nice with
> any exit to userspace, (b) assuming ->get_insn_len() will always be
> accurate, e.g. see the various caveats in skip_emulated_instruction() for
> both VMX and SVM, and (c) duplicating the state capture code in every
> location that can encounter a SPP fault.
> 
> What I'm hoping is that it's possible to modify the call stacks to
> explicitly propagate an exit to userspace and/or SPP fault, and shove all
> the state capture into a common location, e.g. handle_ept_violation().
> 
> Side topic, assuming the userspace VMI is going to be instrospecting the
> faulting instruction, won't it decode the instruction?  I.e. calculate
> the instruction length anyways?

Indeed, we decode the instruction from userspace. I don't know if the
instruction length helps other projects. Added Tamas and Mathieu.

In our last VMI API proposal, the breakpoint event had the instruction
length sent to userspace, but I can't remember why.

https://lore.kernel.org/kvm/20190809160047.8319-62-alazar@bitdefender.com/
Sean Christopherson Jan. 13, 2020, 9:47 p.m. UTC | #7
On Mon, Jan 13, 2020 at 08:55:46PM +0200, Adalbert Lazăr wrote:
> On Mon, 13 Jan 2020 09:33:58 -0800, Sean Christopherson <sean.j.christopherson@intel.com> wrote:
> > On Mon, Jan 13, 2020 at 04:10:50PM +0800, Yang Weijiang wrote:
> > > On Fri, Jan 10, 2020 at 10:04:59AM -0800, Sean Christopherson wrote:
> > > > On Thu, Jan 02, 2020 at 02:13:15PM +0800, Yang Weijiang wrote:
> > > > > @@ -3585,7 +3602,30 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
> > > > >  		if ((error_code & PFERR_WRITE_MASK) &&
> > > > >  		    spte_can_locklessly_be_made_writable(spte))
> > > > >  		{
> > > > > -			new_spte |= PT_WRITABLE_MASK;
> > > > > +			/*
> > > > > +			 * Record write protect fault caused by
> > > > > +			 * Sub-page Protection, let VMI decide
> > > > > +			 * the next step.
> > > > > +			 */
> > > > > +			if (spte & PT_SPP_MASK) {
> > > > > +				int len = kvm_x86_ops->get_inst_len(vcpu);
> > > > 
> > > > There's got to be a better way to handle SPP exits than adding a helper
> > > > to retrieve the instruction length.
> > > >
> > > The fault instruction was skipped by kvm_skip_emulated_instruction()
> > > before, but Paolo suggested leave the re-do or skip option to user-space
> > > to make it flexible for write protection or write tracking, so return
> > > length to user-space.
> > 
> > Sorry, my comment was unclear.  I have no objection to punting the fault
> > to userspace, it's the mechanics of how it's done that I dislike.
> > 
> > Specifically, (a) using run->exit_reason to propagate the SPP exit up the
> > stack, e.g. instead of modifying affected call stacks to play nice with
> > any exit to userspace, (b) assuming ->get_insn_len() will always be
> > accurate, e.g. see the various caveats in skip_emulated_instruction() for
> > both VMX and SVM, and (c) duplicating the state capture code in every
> > location that can encounter a SPP fault.
> > 
> > What I'm hoping is that it's possible to modify the call stacks to
> > explicitly propagate an exit to userspace and/or SPP fault, and shove all
> > the state capture into a common location, e.g. handle_ept_violation().
> > 
> > Side topic, assuming the userspace VMI is going to be instrospecting the
> > faulting instruction, won't it decode the instruction?  I.e. calculate
> > the instruction length anyways?
> 
> Indeed, we decode the instruction from userspace. I don't know if the
> instruction length helps other projects. Added Tamas and Mathieu.
> 
> In our last VMI API proposal, the breakpoint event had the instruction
> length sent to userspace, but I can't remember why.

INT3 is trap-like, i.e. the VM-Exit occurs after the instruction retires.
It's impossible for software to know how far to unwind RIP without the
instruction length being provided by hardware/KVM, e.g. if the guest is
being silly and prepends ignored prefixes on the INT3.

Self-aware software has a priori knowledge of what's being patched in,
and practically speaking I don't any well-behaved sane software uses
prefixes with INT3, but from a VMM's perspective it's legal and possible.

> 
> https://lore.kernel.org/kvm/20190809160047.8319-62-alazar@bitdefender.com/
Yang, Weijiang Jan. 14, 2020, 3:08 a.m. UTC | #8
On Mon, Jan 13, 2020 at 09:33:58AM -0800, Sean Christopherson wrote:
> On Mon, Jan 13, 2020 at 04:10:50PM +0800, Yang Weijiang wrote:
> > On Fri, Jan 10, 2020 at 10:04:59AM -0800, Sean Christopherson wrote:
> > > On Thu, Jan 02, 2020 at 02:13:15PM +0800, Yang Weijiang wrote:
> > > > @@ -3585,7 +3602,30 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
> > > >  		if ((error_code & PFERR_WRITE_MASK) &&
> > > >  		    spte_can_locklessly_be_made_writable(spte))
> > > >  		{
> > > > -			new_spte |= PT_WRITABLE_MASK;
> > > > +			/*
> > > > +			 * Record write protect fault caused by
> > > > +			 * Sub-page Protection, let VMI decide
> > > > +			 * the next step.
> > > > +			 */
> > > > +			if (spte & PT_SPP_MASK) {
> > > > +				int len = kvm_x86_ops->get_inst_len(vcpu);
> > > 
> > > There's got to be a better way to handle SPP exits than adding a helper
> > > to retrieve the instruction length.
> > >
> > The fault instruction was skipped by kvm_skip_emulated_instruction()
> > before, but Paolo suggested leave the re-do or skip option to user-space
> > to make it flexible for write protection or write tracking, so return
> > length to user-space.
> 
> Sorry, my comment was unclear.  I have no objection to punting the fault
> to userspace, it's the mechanics of how it's done that I dislike.
> 
> Specifically, (a) using run->exit_reason to propagate the SPP exit up the
> stack, e.g. instead of modifying affected call stacks to play nice with
> any exit to userspace, (b) assuming ->get_insn_len() will always be
> accurate, e.g. see the various caveats in skip_emulated_instruction() for
> both VMX and SVM, and (c) duplicating the state capture code in every
> location that can encounter a SPP fault.
How about calling skip_emulated_instruction() in KVM before exit to
userspace, but still return the skipped instruction length, if userspace
would like to re-execute the instruction, it can unwind RIP or simply
rely on KVM?

> 
> What I'm hoping is that it's possible to modify the call stacks to
> explicitly propagate an exit to userspace and/or SPP fault, and shove all
> the state capture into a common location, e.g. handle_ept_violation().
>
The problem is, the state capture code in fast_page_fault() and
emulation case share different causes, the former is generic occurence
of SPP induced EPT violation, the latter is atually a "faked" one while
detecting emulation instruction is writing some SPP protected area, so I
seperated them.

> Side topic, assuming the userspace VMI is going to be instrospecting the
> faulting instruction, won't it decode the instruction?  I.e. calculate
> the instruction length anyways?
Sean Christopherson Jan. 14, 2020, 6:58 p.m. UTC | #9
On Tue, Jan 14, 2020 at 11:08:20AM +0800, Yang Weijiang wrote:
> On Mon, Jan 13, 2020 at 09:33:58AM -0800, Sean Christopherson wrote:
> > On Mon, Jan 13, 2020 at 04:10:50PM +0800, Yang Weijiang wrote:
> > > On Fri, Jan 10, 2020 at 10:04:59AM -0800, Sean Christopherson wrote:
> > > > On Thu, Jan 02, 2020 at 02:13:15PM +0800, Yang Weijiang wrote:
> > > > > @@ -3585,7 +3602,30 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
> > > > >  		if ((error_code & PFERR_WRITE_MASK) &&
> > > > >  		    spte_can_locklessly_be_made_writable(spte))
> > > > >  		{
> > > > > -			new_spte |= PT_WRITABLE_MASK;
> > > > > +			/*
> > > > > +			 * Record write protect fault caused by
> > > > > +			 * Sub-page Protection, let VMI decide
> > > > > +			 * the next step.
> > > > > +			 */
> > > > > +			if (spte & PT_SPP_MASK) {
> > > > > +				int len = kvm_x86_ops->get_inst_len(vcpu);
> > > > 
> > > > There's got to be a better way to handle SPP exits than adding a helper
> > > > to retrieve the instruction length.
> > > >
> > > The fault instruction was skipped by kvm_skip_emulated_instruction()
> > > before, but Paolo suggested leave the re-do or skip option to user-space
> > > to make it flexible for write protection or write tracking, so return
> > > length to user-space.
> > 
> > Sorry, my comment was unclear.  I have no objection to punting the fault
> > to userspace, it's the mechanics of how it's done that I dislike.
> > 
> > Specifically, (a) using run->exit_reason to propagate the SPP exit up the
> > stack, e.g. instead of modifying affected call stacks to play nice with
> > any exit to userspace, (b) assuming ->get_insn_len() will always be
> > accurate, e.g. see the various caveats in skip_emulated_instruction() for
> > both VMX and SVM, and (c) duplicating the state capture code in every
> > location that can encounter a SPP fault.
>
> How about calling skip_emulated_instruction() in KVM before exit to

I'm confused.  It sounds like KVM_EXIT_SPP provides the instruction length
because it skips an instruction before exiting to userspace.  But if KVM
is is emulating an instruction, it shouldn't be doing
{kvm_}skip_emulated_instruction(), e.g. if emulation fails due to a SPP
violation (returns KVM_EXIT_SPP) then GUEST_RIP should still point at the
exiting instruction.  Ditto for the fast_page_fault() case, RIP shouldn't
be advanced.

What am I missing?

> userspace, but still return the skipped instruction length, if userspace
> would like to re-execute the instruction, it can unwind RIP or simply
> rely on KVM?

I'm not convinced the instruction length needs to be provided to userspace
for this case.  Obviously it's not difficult to provide the info, I just
don't understand the value added by doing so.  As above, RIP shouldn't
need to be unwound, and blindly skipping an instruction seems like an odd
thing for a VMI engine to do.

> > What I'm hoping is that it's possible to modify the call stacks to
> > explicitly propagate an exit to userspace and/or SPP fault, and shove all
> > the state capture into a common location, e.g. handle_ept_violation().
> >
> The problem is, the state capture code in fast_page_fault() and
> emulation case share different causes, the former is generic occurence
> of SPP induced EPT violation, the latter is atually a "faked" one while
> detecting emulation instruction is writing some SPP protected area, so I
> seperated them.

Can we make SPP dependent on unrestricted guest so that the only entry
point to the emulator is through handle_ept_violation()?  And thus the
only path to triggering KVM_EXIT_SPP would also be through
handle_ept_violation(); (I think, might be forgetting a different emulation
path).

>
> > Side topic, assuming the userspace VMI is going to be instrospecting the
> > faulting instruction, won't it decode the instruction?  I.e. calculate
> > the instruction length anyways?
Yang, Weijiang Jan. 15, 2020, 1:36 a.m. UTC | #10
On Tue, Jan 14, 2020 at 10:58:08AM -0800, Sean Christopherson wrote:
> On Tue, Jan 14, 2020 at 11:08:20AM +0800, Yang Weijiang wrote:
> > On Mon, Jan 13, 2020 at 09:33:58AM -0800, Sean Christopherson wrote:
> > > On Mon, Jan 13, 2020 at 04:10:50PM +0800, Yang Weijiang wrote:
> > > > On Fri, Jan 10, 2020 at 10:04:59AM -0800, Sean Christopherson wrote:
> > > > > On Thu, Jan 02, 2020 at 02:13:15PM +0800, Yang Weijiang wrote:
> > > > > > @@ -3585,7 +3602,30 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
> > > > > >  		if ((error_code & PFERR_WRITE_MASK) &&
> > > > > >  		    spte_can_locklessly_be_made_writable(spte))
> > > > > >  		{
> > > > > > -			new_spte |= PT_WRITABLE_MASK;
> > > > > > +			/*
> > > > > > +			 * Record write protect fault caused by
> > > > > > +			 * Sub-page Protection, let VMI decide
> > > > > > +			 * the next step.
> > > > > > +			 */
> > > > > > +			if (spte & PT_SPP_MASK) {
> > > > > > +				int len = kvm_x86_ops->get_inst_len(vcpu);
> > > > > 
> > > > > There's got to be a better way to handle SPP exits than adding a helper
> > > > > to retrieve the instruction length.
> > > > >
> > > > The fault instruction was skipped by kvm_skip_emulated_instruction()
> > > > before, but Paolo suggested leave the re-do or skip option to user-space
> > > > to make it flexible for write protection or write tracking, so return
> > > > length to user-space.
> > > 
> > > Sorry, my comment was unclear.  I have no objection to punting the fault
> > > to userspace, it's the mechanics of how it's done that I dislike.
> > > 
> > > Specifically, (a) using run->exit_reason to propagate the SPP exit up the
> > > stack, e.g. instead of modifying affected call stacks to play nice with
> > > any exit to userspace, (b) assuming ->get_insn_len() will always be
> > > accurate, e.g. see the various caveats in skip_emulated_instruction() for
> > > both VMX and SVM, and (c) duplicating the state capture code in every
> > > location that can encounter a SPP fault.
> >
> > How about calling skip_emulated_instruction() in KVM before exit to
> 
> I'm confused.  It sounds like KVM_EXIT_SPP provides the instruction length
> because it skips an instruction before exiting to userspace.  But if KVM
> is is emulating an instruction, it shouldn't be doing
> {kvm_}skip_emulated_instruction(), e.g. if emulation fails due to a SPP
> violation (returns KVM_EXIT_SPP) then GUEST_RIP should still point at the
> exiting instruction.  Ditto for the fast_page_fault() case, RIP shouldn't
> be advanced.
There're two SPP usages, one is for write-protection the other is for
write-tracking. If the first case is being used, KVM ignores the write
, i.e., write to the memory is discarded. The second case is, if
userspace is tracking memory write through SPP, then it's notified via
KVM_EXIT_SPP but still let the write take effect by unprotecting the
subpage, i.e., like generic 4KB access-tracking.

In the first case, no necessity to re-try the faulted instruction,
the second case, a re-try is necessary, so I would skip current instruction
first, then if it's actually the second case, userspace should take action
based on the instruction lenght returned.
> 
> What am I missing?
>

> > userspace, but still return the skipped instruction length, if userspace
> > would like to re-execute the instruction, it can unwind RIP or simply
> > rely on KVM?
> 
> I'm not convinced the instruction length needs to be provided to userspace
> for this case.  Obviously it's not difficult to provide the info, I just
> don't understand the value added by doing so.  As above, RIP shouldn't
> need to be unwound, and blindly skipping an instruction seems like an odd
> thing for a VMI engine to do.
In the last review by Paolo, he mentioned SPP could be used in
access-tracing manner, it's flexible to provide instruction length to
userspace, so I removed instruction-skip in KVM but let userspace to
decide.
> 
> > > What I'm hoping is that it's possible to modify the call stacks to
> > > explicitly propagate an exit to userspace and/or SPP fault, and shove all
> > > the state capture into a common location, e.g. handle_ept_violation().
> > >
> > The problem is, the state capture code in fast_page_fault() and
> > emulation case share different causes, the former is generic occurence
> > of SPP induced EPT violation, the latter is atually a "faked" one while
> > detecting emulation instruction is writing some SPP protected area, so I
> > seperated them.
> 
> Can we make SPP dependent on unrestricted guest so that the only entry
> point to the emulator is through handle_ept_violation()?  And thus the
> only path to triggering KVM_EXIT_SPP would also be through
> handle_ept_violation(); (I think, might be forgetting a different emulation
> path).
> 
I don't got your point, from my understanding, instruction emulation is
used in several cases regarless of guest working mode. As long as any memory write happens
e.g., string ops, port ops etc, SPP write-protection
check should be applied to let the userspace capture the event.

> >
> > > Side topic, assuming the userspace VMI is going to be instrospecting the
> > > faulting instruction, won't it decode the instruction?  I.e. calculate
> > > the instruction length anyways?
Paolo Bonzini Jan. 21, 2020, 2:01 p.m. UTC | #11
On 10/01/20 18:55, Sean Christopherson wrote:
> Unless there is a *very* good reason for these shenanigans, spp.c needs
> to built via the Makefile like any other source.  If this is justified
> for whatever reason, then that justification needs to be very clearly
> stated in the changelog.

Well, this #include is the reason why I moved mmu.c to mmu/spp.c.  It
shouldn't be hard to create a mmu_internal.h header for things that have
to be shared between mmu.c and spp.c, but I'm okay with having the
#include "spp.c" for now and cleaning it up later.  The spp.c file,
albeit ugly, provides hints as to what to put in that header; without it
it's a pointless exercise.

Note that there isn't anything really VMX specific even in the few vmx_*
functions of spp.c.  My suggestion is just to rename them to kvm_mmu_*.

Paolo
Paolo Bonzini Jan. 21, 2020, 2:14 p.m. UTC | #12
On 14/01/20 19:58, Sean Christopherson wrote:
> I'm not convinced the instruction length needs to be provided to userspace
> for this case.  Obviously it's not difficult to provide the info, I just
> don't understand the value added by doing so.  As above, RIP shouldn't
> need to be unwound, and blindly skipping an instruction seems like an odd
> thing for a VMI engine to do.
> 

The reason to introduce the instruction length was so that userspace
could blindly use SPP to emulate ROM behavior.  Weijiang's earlier
patches in fact _only_ provided that behavior, and I asked him to change
it so that, by default, using SPP and not handling it will basically
cause an infinite loop of SPP violations.

One possibility to clean things up is to change "fault_handled" and
fast_page_fault's return value from bool to RET_PF* (false becomes
RET_PF_INVALID, true becomes RET_PF_RETRY).  direct_page_fault would
also have to do something like

	r = fast_page_fault(vcpu, gpa, level, error_code))
	if (r != RET_PF_INVALID)
		return r;

Then fast_page_fault can just return RET_PF_USERSPACE, and this ugly
case can go away.

+		if (vcpu->run->exit_reason == KVM_EXIT_SPP)
+			r = RET_PF_USERSPACE;
+

Thanks,

Paolo
diff mbox series

Patch

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index de79a4de0723..81faf6607a9e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -213,6 +213,8 @@  enum vmcs_field {
 	XSS_EXIT_BITMAP_HIGH            = 0x0000202D,
 	ENCLS_EXITING_BITMAP		= 0x0000202E,
 	ENCLS_EXITING_BITMAP_HIGH	= 0x0000202F,
+	SPPT_POINTER			= 0x00002030,
+	SPPT_POINTER_HIGH		= 0x00002031,
 	TSC_MULTIPLIER                  = 0x00002032,
 	TSC_MULTIPLIER_HIGH             = 0x00002033,
 	GUEST_PHYSICAL_ADDRESS          = 0x00002400,
@@ -534,6 +536,13 @@  struct vmx_msr_entry {
 #define EPT_VIOLATION_EXECUTABLE	(1 << EPT_VIOLATION_EXECUTABLE_BIT)
 #define EPT_VIOLATION_GVA_TRANSLATED	(1 << EPT_VIOLATION_GVA_TRANSLATED_BIT)
 
+/*
+ * Exit Qualifications for SPPT-Induced vmexits
+ */
+#define SPPT_INDUCED_EXIT_TYPE_BIT     11
+#define SPPT_INDUCED_EXIT_TYPE         (1 << SPPT_INDUCED_EXIT_TYPE_BIT)
+#define SPPT_INTR_INFO_UNBLOCK_NMI     INTR_INFO_UNBLOCK_NMI
+
 /*
  * VM-instruction error numbers
  */
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 3eb8411ab60e..4833a39a799b 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -86,6 +86,7 @@ 
 #define EXIT_REASON_PML_FULL            62
 #define EXIT_REASON_XSAVES              63
 #define EXIT_REASON_XRSTORS             64
+#define EXIT_REASON_SPP                 66
 #define EXIT_REASON_UMWAIT              67
 #define EXIT_REASON_TPAUSE              68
 
@@ -145,6 +146,7 @@ 
 	{ EXIT_REASON_ENCLS,                 "ENCLS" }, \
 	{ EXIT_REASON_RDSEED,                "RDSEED" }, \
 	{ EXIT_REASON_PML_FULL,              "PML_FULL" }, \
+	{ EXIT_REASON_SPP,                   "SPP" }, \
 	{ EXIT_REASON_XSAVES,                "XSAVES" }, \
 	{ EXIT_REASON_XRSTORS,               "XRSTORS" }, \
 	{ EXIT_REASON_UMWAIT,                "UMWAIT" }, \
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6f92b40d798c..c41791ebee65 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -20,6 +20,7 @@ 
 #include "x86.h"
 #include "kvm_cache_regs.h"
 #include "cpuid.h"
+#include "spp.h"
 
 #include <linux/kvm_host.h>
 #include <linux/types.h>
@@ -177,6 +178,7 @@  module_param(dbg, bool, 0644);
 /* The mask for the R/X bits in EPT PTEs */
 #define PT64_EPT_READABLE_MASK			0x1ull
 #define PT64_EPT_EXECUTABLE_MASK		0x4ull
+#define PT64_SPP_SAVED_BIT	(1ULL << (PT64_SECOND_AVAIL_BITS_SHIFT + 1))
 
 #include <trace/events/kvm.h>
 
@@ -200,6 +202,7 @@  enum {
 	RET_PF_RETRY = 0,
 	RET_PF_EMULATE = 1,
 	RET_PF_INVALID = 2,
+	RET_PF_USERSPACE = 3,
 };
 
 struct pte_list_desc {
@@ -979,6 +982,11 @@  static u64 mark_spte_for_access_track(u64 spte)
 		shadow_acc_track_saved_bits_shift;
 	spte &= ~shadow_acc_track_mask;
 
+	if (spte & PT_SPP_MASK) {
+		spte &= ~PT_SPP_MASK;
+		spte |= PT64_SPP_SAVED_BIT;
+	}
+
 	return spte;
 }
 
@@ -1677,9 +1685,15 @@  static bool spte_wrprot_for_clear_dirty(u64 *sptep)
 {
 	bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
 					       (unsigned long *)sptep);
+	bool was_spp_armed = test_and_clear_bit(PT_SPP_SHIFT,
+					       (unsigned long *)sptep);
+
 	if (was_writable && !spte_ad_enabled(*sptep))
 		kvm_set_pfn_dirty(spte_to_pfn(*sptep));
 
+	if (was_spp_armed)
+		*sptep |= PT64_SPP_SAVED_BIT;
+
 	return was_writable;
 }
 
@@ -3478,7 +3492,8 @@  static bool page_fault_can_be_fast(u32 error_code)
  */
 static bool
 fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-			u64 *sptep, u64 old_spte, u64 new_spte)
+			u64 *sptep, u64 old_spte, u64 new_spte,
+			bool spp_protected)
 {
 	gfn_t gfn;
 
@@ -3499,7 +3514,8 @@  fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
 		return false;
 
-	if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
+	if ((is_writable_pte(new_spte) && !is_writable_pte(old_spte)) ||
+	    spp_protected) {
 		/*
 		 * The gfn of direct spte is stable since it is
 		 * calculated by sp->gfn.
@@ -3534,6 +3550,7 @@  static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 	struct kvm_shadow_walk_iterator iterator;
 	struct kvm_mmu_page *sp;
 	bool fault_handled = false;
+	bool spp_protected = false;
 	u64 spte = 0ull;
 	uint retry_count = 0;
 
@@ -3585,7 +3602,30 @@  static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 		if ((error_code & PFERR_WRITE_MASK) &&
 		    spte_can_locklessly_be_made_writable(spte))
 		{
-			new_spte |= PT_WRITABLE_MASK;
+			/*
+			 * Record write protect fault caused by
+			 * Sub-page Protection, let VMI decide
+			 * the next step.
+			 */
+			if (spte & PT_SPP_MASK) {
+				int len = kvm_x86_ops->get_inst_len(vcpu);
+
+				fault_handled = true;
+				vcpu->run->exit_reason = KVM_EXIT_SPP;
+				vcpu->run->spp.addr = gva;
+				vcpu->run->spp.ins_len = len;
+				trace_kvm_spp_induced_page_fault(vcpu,
+								 gva,
+								 len);
+				break;
+			}
+
+			if (was_spp_armed(new_spte)) {
+				restore_spp_bit(&new_spte);
+				spp_protected = true;
+			} else {
+				new_spte |= PT_WRITABLE_MASK;
+			}
 
 			/*
 			 * Do not fix write-permission on the large spte.  Since
@@ -3604,7 +3644,8 @@  static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 
 		/* Verify that the fault can be handled in the fast path */
 		if (new_spte == spte ||
-		    !is_access_allowed(error_code, new_spte))
+		    (!is_access_allowed(error_code, new_spte) &&
+		    !spp_protected))
 			break;
 
 		/*
@@ -3614,7 +3655,8 @@  static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 		 */
 		fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
 							iterator.sptep, spte,
-							new_spte);
+							new_spte,
+							spp_protected);
 		if (fault_handled)
 			break;
 
@@ -3740,6 +3782,12 @@  void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 		    (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
 			mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
 					   &invalid_list);
+			if (vcpu->kvm->arch.spp_active) {
+				vcpu->kvm->arch.spp_active = false;
+				mmu_free_root_page(vcpu->kvm,
+						   &vcpu->kvm->arch.sppt_root,
+						   &invalid_list);
+			}
 		} else {
 			for (i = 0; i < 4; ++i)
 				if (mmu->pae_root[i] != 0)
@@ -5223,6 +5271,8 @@  void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
 		uint i;
 
 		vcpu->arch.mmu->root_hpa = INVALID_PAGE;
+		if (!vcpu->kvm->arch.spp_active)
+			vcpu->kvm->arch.sppt_root = INVALID_PAGE;
 
 		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
 			vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
@@ -5539,6 +5589,10 @@  int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
 		r = vcpu->arch.mmu->page_fault(vcpu, cr2,
 					       lower_32_bits(error_code),
 					       false);
+
+		if (vcpu->run->exit_reason == KVM_EXIT_SPP)
+			r = RET_PF_USERSPACE;
+
 		WARN_ON(r == RET_PF_INVALID);
 	}
 
@@ -5546,7 +5600,8 @@  int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
 		return 1;
 	if (r < 0)
 		return r;
-
+	if (r == RET_PF_USERSPACE)
+		return 0;
 	/*
 	 * Before emulating the instruction, check if the error code
 	 * was due to a RO violation while translating the guest page.
@@ -6372,6 +6427,8 @@  unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
 	return nr_mmu_pages;
 }
 
+#include "spp.c"
+
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 {
 	kvm_mmu_unload(vcpu);
diff --git a/arch/x86/kvm/mmu/spp.c b/arch/x86/kvm/mmu/spp.c
index 6f611e04e817..3ec434140967 100644
--- a/arch/x86/kvm/mmu/spp.c
+++ b/arch/x86/kvm/mmu/spp.c
@@ -17,6 +17,18 @@  static void shadow_spp_walk_init(struct kvm_shadow_walk_iterator *iterator,
 	iterator->level = PT64_ROOT_4LEVEL;
 }
 
+/* Restore an spp armed PTE */
+void restore_spp_bit(u64 *spte)
+{
+	*spte &= ~PT64_SPP_SAVED_BIT;
+	*spte |= PT_SPP_MASK;
+}
+
+bool was_spp_armed(u64 spte)
+{
+	return !!(spte & PT64_SPP_SAVED_BIT);
+}
+
 u32 *gfn_to_subpage_wp_info(struct kvm_memory_slot *slot, gfn_t gfn)
 {
 	unsigned long idx;
@@ -459,6 +471,7 @@  int kvm_spp_set_permission(struct kvm *kvm, u64 gfn, u32 npages,
 		if (!access)
 			return -EFAULT;
 		*access = access_map[i];
+		trace_kvm_spp_set_subpages(vcpu, gfn, *access);
 	}
 
 	gfn = old_gfn;
diff --git a/arch/x86/kvm/mmu/spp.h b/arch/x86/kvm/mmu/spp.h
index daa69bd274da..5ffe06d2cd6f 100644
--- a/arch/x86/kvm/mmu/spp.h
+++ b/arch/x86/kvm/mmu/spp.h
@@ -11,6 +11,8 @@  int kvm_spp_set_permission(struct kvm *kvm, u64 gfn, u32 npages,
 			   u32 *access_map);
 int kvm_spp_mark_protection(struct kvm *kvm, u64 gfn, u32 access);
 bool is_spp_spte(struct kvm_mmu_page *sp);
+void restore_spp_bit(u64 *spte);
+bool was_spp_armed(u64 spte);
 u64 construct_spptp(unsigned long root_hpa);
 int kvm_vm_ioctl_get_subpages(struct kvm *kvm,
 			      u64 gfn,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 7c741a0c5f80..293b91d516b1 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1496,6 +1496,72 @@  TRACE_EVENT(kvm_nested_vmenter_failed,
 		__print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS))
 );
 
+TRACE_EVENT(kvm_spp_set_subpages,
+	TP_PROTO(struct kvm_vcpu *vcpu, gfn_t gfn, u32 access),
+	TP_ARGS(vcpu, gfn, access),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(gfn_t, gfn)
+		__field(u32, access)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu->vcpu_id;
+		__entry->gfn = gfn;
+		__entry->access = access;
+	),
+
+	TP_printk("vcpu %d gfn %llx access %x",
+		  __entry->vcpu_id,
+		  __entry->gfn,
+		  __entry->access)
+);
+
+TRACE_EVENT(kvm_spp_induced_exit,
+	TP_PROTO(struct kvm_vcpu *vcpu, gpa_t gpa, u32 qualification),
+	TP_ARGS(vcpu, gpa, qualification),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(gpa_t, gpa)
+		__field(u32, qualification)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu->vcpu_id;
+		__entry->gpa = gpa;
+		__entry->qualification = qualification;
+	),
+
+	TP_printk("vcpu %d gpa %llx qualificaiton %x",
+		  __entry->vcpu_id,
+		  __entry->gpa,
+		  __entry->qualification)
+);
+
+TRACE_EVENT(kvm_spp_induced_page_fault,
+	TP_PROTO(struct kvm_vcpu *vcpu, gpa_t gpa, int inst_len),
+	TP_ARGS(vcpu, gpa, inst_len),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(gpa_t, gpa)
+		__field(int, inst_len)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu->vcpu_id;
+		__entry->gpa = gpa;
+		__entry->inst_len = inst_len;
+	),
+
+	TP_printk("vcpu %d gpa %llx inst_len %d",
+		  __entry->vcpu_id,
+		  __entry->gpa,
+		  __entry->inst_len)
+);
+
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 24e4e1c47f42..97d862c79124 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -200,7 +200,6 @@  static const struct {
 	[VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
 	[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
 };
-
 #define L1D_CACHE_ORDER 4
 static void *vmx_l1d_flush_pages;
 
@@ -2999,6 +2998,7 @@  void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	bool update_guest_cr3 = true;
 	unsigned long guest_cr3;
 	u64 eptp;
+	u64 spptp;
 
 	guest_cr3 = cr3;
 	if (enable_ept) {
@@ -3027,6 +3027,12 @@  void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 
 	if (update_guest_cr3)
 		vmcs_writel(GUEST_CR3, guest_cr3);
+
+	if (kvm->arch.spp_active && VALID_PAGE(vcpu->kvm->arch.sppt_root)) {
+		spptp = construct_spptp(vcpu->kvm->arch.sppt_root);
+		vmcs_write64(SPPT_POINTER, spptp);
+		vmx_flush_tlb(vcpu, true);
+	}
 }
 
 int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -5361,6 +5367,74 @@  static int handle_monitor_trap(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+int handle_spp(struct kvm_vcpu *vcpu)
+{
+	unsigned long exit_qualification;
+	struct kvm_memory_slot *slot;
+	gpa_t gpa;
+	gfn_t gfn;
+
+	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+	/*
+	 * SPP VM exit happened while executing iret from NMI,
+	 * "blocked by NMI" bit has to be set before next VM entry.
+	 * There are errata that may cause this bit to not be set:
+	 * AAK134, BY25.
+	 */
+	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+	    (exit_qualification & SPPT_INTR_INFO_UNBLOCK_NMI))
+		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+			      GUEST_INTR_STATE_NMI);
+
+	vcpu->arch.exit_qualification = exit_qualification;
+	if (exit_qualification & SPPT_INDUCED_EXIT_TYPE) {
+		int page_num = KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL);
+		u32 *access;
+		gfn_t gfn_max;
+
+		/*
+		 * SPPT missing
+		 * We don't set SPP write access for the corresponding
+		 * GPA, if we haven't setup, we need to construct
+		 * SPP table here.
+		 */
+		gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+		gfn = gpa >> PAGE_SHIFT;
+		trace_kvm_spp_induced_exit(vcpu, gpa, exit_qualification);
+		/*
+		 * In level 1 of SPPT, there's no PRESENT bit, all data is
+		 * regarded as permission vector, so need to check from
+		 * level 2 to set up the vector if target page is protected.
+		 */
+		spin_lock(&vcpu->kvm->mmu_lock);
+		gfn &= ~(page_num - 1);
+		gfn_max = gfn + page_num - 1;
+		for (; gfn <= gfn_max; gfn++) {
+			slot = gfn_to_memslot(vcpu->kvm, gfn);
+			if (!slot)
+				continue;
+			access = gfn_to_subpage_wp_info(slot, gfn);
+			if (access && *access != FULL_SPP_ACCESS)
+				kvm_spp_setup_structure(vcpu,
+							*access,
+							gfn);
+		}
+		spin_unlock(&vcpu->kvm->mmu_lock);
+		return 1;
+	}
+	/*
+	 * SPPT Misconfig
+	 * This is probably caused by some mis-configuration in SPPT
+	 * entries, cannot handle it here, escalate the fault to
+	 * emulator.
+	 */
+	WARN_ON(1);
+	vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+	vcpu->run->hw.hardware_exit_reason = EXIT_REASON_SPP;
+	return 0;
+}
+
 static int handle_monitor(struct kvm_vcpu *vcpu)
 {
 	printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
@@ -5575,6 +5649,7 @@  static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_INVVPID]                 = handle_vmx_instruction,
 	[EXIT_REASON_RDRAND]                  = handle_invalid_op,
 	[EXIT_REASON_RDSEED]                  = handle_invalid_op,
+	[EXIT_REASON_SPP]                     = handle_spp,
 	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
 	[EXIT_REASON_INVPCID]                 = handle_invpcid,
 	[EXIT_REASON_VMFUNC]		      = handle_vmx_instruction,
@@ -5807,6 +5882,9 @@  void dump_vmcs(void)
 		pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
 	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
 		pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
+	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_SPP))
+		pr_err("SPPT pointer = 0x%016llx\n", vmcs_read64(SPPT_POINTER));
+
 	n = vmcs_read32(CR3_TARGET_COUNT);
 	for (i = 0; i + 1 < n; i += 4)
 		pr_err("CR3 target%u=%016lx target%u=%016lx\n",
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fb7da000ceaf..a9d7fc21dad6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9782,6 +9782,7 @@  void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 	}
 
 	kvm_page_track_free_memslot(free, dont);
+	kvm_spp_free_memslot(free, dont);
 }
 
 int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
@@ -10406,3 +10407,6 @@  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_spp_set_subpages);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_spp_induced_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_spp_induced_page_fault);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 09e5e8e6e6dd..c0f3162ee46a 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -244,6 +244,7 @@  struct kvm_hyperv_exit {
 #define KVM_EXIT_IOAPIC_EOI       26
 #define KVM_EXIT_HYPERV           27
 #define KVM_EXIT_ARM_NISV         28
+#define KVM_EXIT_SPP              29
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -401,6 +402,11 @@  struct kvm_run {
 		struct {
 			__u8 vector;
 		} eoi;
+		/* KVM_EXIT_SPP */
+		struct {
+			__u64 addr;
+			__u8 ins_len;
+		} spp;
 		/* KVM_EXIT_HYPERV */
 		struct kvm_hyperv_exit hyperv;
 		/* KVM_EXIT_ARM_NISV */