Message ID | 20200102061319.10077-7-weijiang.yang@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Enable Sub-Page Write Protection Support | expand |
On Thu, Jan 02, 2020 at 02:13:15PM +0800, Yang Weijiang wrote: > If write to subpage is not allowed, EPT violation generates > and it's handled in fast_page_fault(). > > In current implementation, SPPT setup is only handled in handle_spp() > vmexit handler, it's triggered when SPP bit is set in EPT leaf > entry while SPPT entries are not ready. > > A SPP specific bit(11) is added to exit_qualification and a new > exit reason(66) is introduced for SPP. ... > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > index 6f92b40d798c..c41791ebee65 100644 > --- a/arch/x86/kvm/mmu/mmu.c > +++ b/arch/x86/kvm/mmu/mmu.c > @@ -6372,6 +6427,8 @@ unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm) > return nr_mmu_pages; > } > > +#include "spp.c" > + Unless there is a *very* good reason for these shenanigans, spp.c needs to be built via the Makefile like any other source. If this is justified for whatever reason, then that justification needs to be very clearly stated in the changelog. In general, the code organization of this entire series likely needs to be overhauled. There are gobs of exports which are either completely unnecessary or completely backwards. E.g. exporting VMX-only functions from spp.c, which presumably are only called by VMX. EXPORT_SYMBOL_GPL(vmx_spp_flush_sppt); EXPORT_SYMBOL_GPL(vmx_spp_init); Exporting ioctl helpers from the same file, which are presumably called only from x86.c. EXPORT_SYMBOL_GPL(kvm_vm_ioctl_get_subpages); EXPORT_SYMBOL_GPL(kvm_vm_ioctl_set_subpages);
On Thu, Jan 02, 2020 at 02:13:15PM +0800, Yang Weijiang wrote: > @@ -3585,7 +3602,30 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, > if ((error_code & PFERR_WRITE_MASK) && > spte_can_locklessly_be_made_writable(spte)) > { > - new_spte |= PT_WRITABLE_MASK; > + /* > + * Record write protect fault caused by > + * Sub-page Protection, let VMI decide > + * the next step. > + */ > + if (spte & PT_SPP_MASK) { > + int len = kvm_x86_ops->get_inst_len(vcpu); There's got to be a better way to handle SPP exits than adding a helper to retrieve the instruction length. > + > + fault_handled = true; > + vcpu->run->exit_reason = KVM_EXIT_SPP; > + vcpu->run->spp.addr = gva; > + vcpu->run->spp.ins_len = len; s/ins_len/insn_len to be consistent with other KVM nomenclature. > + trace_kvm_spp_induced_page_fault(vcpu, > + gva, > + len); > + break; > + } > + > + if (was_spp_armed(new_spte)) { > + restore_spp_bit(&new_spte); > + spp_protected = true; > + } else { > + new_spte |= PT_WRITABLE_MASK; > + } > > /* > * Do not fix write-permission on the large spte. Since ... > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c > index 24e4e1c47f42..97d862c79124 100644 > --- a/arch/x86/kvm/vmx/vmx.c > +++ b/arch/x86/kvm/vmx/vmx.c > @@ -200,7 +200,6 @@ static const struct { > [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, > [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, > }; > - Spurious whitespace.
> #define L1D_CACHE_ORDER 4 > static void *vmx_l1d_flush_pages; > > @@ -2999,6 +2998,7 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) > bool update_guest_cr3 = true; > unsigned long guest_cr3; > u64 eptp; > + u64 spptp; > > guest_cr3 = cr3; > if (enable_ept) { > @@ -3027,6 +3027,12 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) > > if (update_guest_cr3) > vmcs_writel(GUEST_CR3, guest_cr3); > + > + if (kvm->arch.spp_active && VALID_PAGE(vcpu->kvm->arch.sppt_root)) { > + spptp = construct_spptp(vcpu->kvm->arch.sppt_root); > + vmcs_write64(SPPT_POINTER, spptp); > + vmx_flush_tlb(vcpu, true); Why is SPP so special that it gets to force TLB flushes? > + } > } > > int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) > @@ -5361,6 +5367,74 @@ static int handle_monitor_trap(struct kvm_vcpu *vcpu) > return 1; > } > > +int handle_spp(struct kvm_vcpu *vcpu) Can be static. > +{ > + unsigned long exit_qualification; > + struct kvm_memory_slot *slot; > + gpa_t gpa; > + gfn_t gfn; > + > + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); > + > + /* > + * SPP VM exit happened while executing iret from NMI, > + * "blocked by NMI" bit has to be set before next VM entry. > + * There are errata that may cause this bit to not be set: > + * AAK134, BY25. 
> + */ > + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && > + (exit_qualification & SPPT_INTR_INFO_UNBLOCK_NMI)) > + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, > + GUEST_INTR_STATE_NMI); > + > + vcpu->arch.exit_qualification = exit_qualification; if (WARN_ON(!(exit_qualification & SPPT_INDUCED_EXIT_TYPE))) goto out_err; <handle spp exit> return 1; out_err: vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; vcpu->run->hw.hardware_exit_reason = EXIT_REASON_SPP; return 0; > + if (exit_qualification & SPPT_INDUCED_EXIT_TYPE) { > + int page_num = KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL); The compiler is probably clever enough to make these constants, but if this logic is a fundamental property of SPP then it should be defined as a macro somewhere. > + u32 *access; > + gfn_t gfn_max; > + > + /* > + * SPPT missing > + * We don't set SPP write access for the corresponding > + * GPA, if we haven't setup, we need to construct > + * SPP table here. > + */ > + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); > + gfn = gpa >> PAGE_SHIFT; gpa_to_gfn() > + trace_kvm_spp_induced_exit(vcpu, gpa, exit_qualification); > + /* > + * In level 1 of SPPT, there's no PRESENT bit, all data is > + * regarded as permission vector, so need to check from > + * level 2 to set up the vector if target page is protected. 
> + */ > + spin_lock(&vcpu->kvm->mmu_lock); > + gfn &= ~(page_num - 1); > + gfn_max = gfn + page_num - 1; s/gfn_max/gfn_end > + for (; gfn <= gfn_max; gfn++) { My preference would be to do: gfn_end = gfn + page_num; for ( ; gfn < gfn_end; gfn++) > + slot = gfn_to_memslot(vcpu->kvm, gfn); > + if (!slot) > + continue; > + access = gfn_to_subpage_wp_info(slot, gfn); > + if (access && *access != FULL_SPP_ACCESS) > + kvm_spp_setup_structure(vcpu, > + *access, > + gfn); > + } > + spin_unlock(&vcpu->kvm->mmu_lock); > + return 1; > + } > + /* > + * SPPT Misconfig > + * This is probably caused by some mis-configuration in SPPT > + * entries, cannot handle it here, escalate the fault to > + * emulator. > + */ > + WARN_ON(1); > + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; > + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_SPP; > + return 0; > +} > + > static int handle_monitor(struct kvm_vcpu *vcpu) > { > printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); > @@ -5575,6 +5649,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { > [EXIT_REASON_INVVPID] = handle_vmx_instruction, > [EXIT_REASON_RDRAND] = handle_invalid_op, > [EXIT_REASON_RDSEED] = handle_invalid_op, > + [EXIT_REASON_SPP] = handle_spp, > [EXIT_REASON_PML_FULL] = handle_pml_full, > [EXIT_REASON_INVPCID] = handle_invpcid, > [EXIT_REASON_VMFUNC] = handle_vmx_instruction, > @@ -5807,6 +5882,9 @@ void dump_vmcs(void) > pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); > if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) > pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); > + if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_SPP)) > + pr_err("SPPT pointer = 0x%016llx\n", vmcs_read64(SPPT_POINTER)); > + > n = vmcs_read32(CR3_TARGET_COUNT); > for (i = 0; i + 1 < n; i += 4) > pr_err("CR3 target%u=%016lx target%u=%016lx\n", > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index fb7da000ceaf..a9d7fc21dad6 100644 > --- a/arch/x86/kvm/x86.c > 
+++ b/arch/x86/kvm/x86.c > @@ -9782,6 +9782,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, > } > > kvm_page_track_free_memslot(free, dont); > + kvm_spp_free_memslot(free, dont); > } > > int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, > @@ -10406,3 +10407,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); > EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); > EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); > EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); > +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_spp_set_subpages); > +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_spp_induced_exit); > +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_spp_induced_page_fault); > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h > index 09e5e8e6e6dd..c0f3162ee46a 100644 > --- a/include/uapi/linux/kvm.h > +++ b/include/uapi/linux/kvm.h > @@ -244,6 +244,7 @@ struct kvm_hyperv_exit { > #define KVM_EXIT_IOAPIC_EOI 26 > #define KVM_EXIT_HYPERV 27 > #define KVM_EXIT_ARM_NISV 28 > +#define KVM_EXIT_SPP 29 > > /* For KVM_EXIT_INTERNAL_ERROR */ > /* Emulate instruction failed. */ > @@ -401,6 +402,11 @@ struct kvm_run { > struct { > __u8 vector; > } eoi; > + /* KVM_EXIT_SPP */ > + struct { > + __u64 addr; > + __u8 ins_len; > + } spp; > /* KVM_EXIT_HYPERV */ > struct kvm_hyperv_exit hyperv; > /* KVM_EXIT_ARM_NISV */ > -- > 2.17.2 >
On Fri, Jan 10, 2020 at 09:55:37AM -0800, Sean Christopherson wrote: > On Thu, Jan 02, 2020 at 02:13:15PM +0800, Yang Weijiang wrote: > > If write to subpage is not allowed, EPT violation generates > > and it's handled in fast_page_fault(). > > > > In current implementation, SPPT setup is only handled in handle_spp() > > vmexit handler, it's triggered when SPP bit is set in EPT leaf > > entry while SPPT entries are not ready. > > > > A SPP specific bit(11) is added to exit_qualification and a new > > exit reason(66) is introduced for SPP. > > ... > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > > index 6f92b40d798c..c41791ebee65 100644 > > --- a/arch/x86/kvm/mmu/mmu.c > > +++ b/arch/x86/kvm/mmu/mmu.c > > @@ -6372,6 +6427,8 @@ unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm) > > return nr_mmu_pages; > > } > > > > +#include "spp.c" > > + > > Unless there is a *very* good reason for these shenanigans, spp.c needs > to built via the Makefile like any other source. If this is justified > for whatever reason, then that justification needs to be very clearly > stated in the changelog. Yes, it looks odd. When extracted the SPP code from mmu.c, I found a lot of functions in mmu.c should be exposed so that spp.c can see them, I took them as unnecessary modification to mmu.c, so just add the spp.c file back to mmu.c, if you suggest change it with a seperate object file, I'll do it. > > In general, the code organization of this entire series likely needs to > be overhauled. There are gobs exports which are either completely > unnecessary or completely backswards. > > E.g. exporting VMX-only functions from spp.c, which presumably are only > callbed by VMX. > > EXPORT_SYMBOL_GPL(vmx_spp_flush_sppt); > EXPORT_SYMBOL_GPL(vmx_spp_init); > > Exporting ioctl helpers from the same file, which are presumably called > only from x86.c. 
> > EXPORT_SYMBOL_GPL(kvm_vm_ioctl_get_subpages); > EXPORT_SYMBOL_GPL(kvm_vm_ioctl_set_subpages); Thanks for the suggestion, I'll go over the patches and re-organize them.
On Fri, Jan 10, 2020 at 10:04:59AM -0800, Sean Christopherson wrote: > On Thu, Jan 02, 2020 at 02:13:15PM +0800, Yang Weijiang wrote: > > @@ -3585,7 +3602,30 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, > > if ((error_code & PFERR_WRITE_MASK) && > > spte_can_locklessly_be_made_writable(spte)) > > { > > - new_spte |= PT_WRITABLE_MASK; > > + /* > > + * Record write protect fault caused by > > + * Sub-page Protection, let VMI decide > > + * the next step. > > + */ > > + if (spte & PT_SPP_MASK) { > > + int len = kvm_x86_ops->get_inst_len(vcpu); > > There's got to be a better way to handle SPP exits than adding a helper > to retrieve the instruction length. > The fault instruction was skipped by kvm_skip_emulated_instruction() before, but Paolo suggested leave the re-do or skip option to user-space to make it flexible for write protection or write tracking, so return length to user-space. > > + > > + fault_handled = true; > > + vcpu->run->exit_reason = KVM_EXIT_SPP; > > + vcpu->run->spp.addr = gva; > > + vcpu->run->spp.ins_len = len; > > s/ins_len/insn_len to be consistent with other KVM nomenclature. > OK. > > + trace_kvm_spp_induced_page_fault(vcpu, > > + gva, > > + len); > > + break; > > + } > > + > > + if (was_spp_armed(new_spte)) { > > + restore_spp_bit(&new_spte); > > + spp_protected = true; > > + } else { > > + new_spte |= PT_WRITABLE_MASK; > > + } > > > > /* > > * Do not fix write-permission on the large spte. Since > > ... > > > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c > > index 24e4e1c47f42..97d862c79124 100644 > > --- a/arch/x86/kvm/vmx/vmx.c > > +++ b/arch/x86/kvm/vmx/vmx.c > > @@ -200,7 +200,6 @@ static const struct { > > [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, > > [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, > > }; > > - > > Spurious whitepsace. > OK. 
> > #define L1D_CACHE_ORDER 4 > > static void *vmx_l1d_flush_pages; > > > > @@ -2999,6 +2998,7 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) > > bool update_guest_cr3 = true; > > unsigned long guest_cr3; > > u64 eptp; > > + u64 spptp; > > > > guest_cr3 = cr3; > > if (enable_ept) { > > @@ -3027,6 +3027,12 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) > > > > if (update_guest_cr3) > > vmcs_writel(GUEST_CR3, guest_cr3); > > + > > + if (kvm->arch.spp_active && VALID_PAGE(vcpu->kvm->arch.sppt_root)) { > > + spptp = construct_spptp(vcpu->kvm->arch.sppt_root); > > + vmcs_write64(SPPT_POINTER, spptp); > > + vmx_flush_tlb(vcpu, true); > > Why is SPP so special that it gets to force TLB flushes? I double checked the code, there's a call to vmx_flush_tlb() in mmu_load(), so it's unnecessary here, thank you! > > + } > > } > > > > int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) > > @@ -5361,6 +5367,74 @@ static int handle_monitor_trap(struct kvm_vcpu *vcpu) > > return 1; > > } > > > > +int handle_spp(struct kvm_vcpu *vcpu) > > Can be static. > Thanks! > > +{ > > + unsigned long exit_qualification; > > + struct kvm_memory_slot *slot; > > + gpa_t gpa; > > + gfn_t gfn; > > + > > + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); > > + > > + /* > > + * SPP VM exit happened while executing iret from NMI, > > + * "blocked by NMI" bit has to be set before next VM entry. > > + * There are errata that may cause this bit to not be set: > > + * AAK134, BY25. 
> > + */ > > + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && > > + (exit_qualification & SPPT_INTR_INFO_UNBLOCK_NMI)) > > + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, > > + GUEST_INTR_STATE_NMI); > > + > > + vcpu->arch.exit_qualification = exit_qualification; > > if (WARN_ON(!(exit_qualification & SPPT_INDUCED_EXIT_TYPE))) > goto out_err; > > <handle spp exit> > > return 1; > > out_err: > vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; > vcpu->run->hw.hardware_exit_reason = EXIT_REASON_SPP; > return 0; > Sure, will change it. > > + if (exit_qualification & SPPT_INDUCED_EXIT_TYPE) { > > + int page_num = KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL); > > The compiler is probably clever enough to make these constants, but if > this logic is a fundamental property of SPP then it should be defined as > a macro somewhere. > OK, will change it. > > + u32 *access; > > + gfn_t gfn_max; > > + > > + /* > > + * SPPT missing > > + * We don't set SPP write access for the corresponding > > + * GPA, if we haven't setup, we need to construct > > + * SPP table here. > > + */ > > + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); > > + gfn = gpa >> PAGE_SHIFT; > > gpa_to_gfn() > OK. > > + trace_kvm_spp_induced_exit(vcpu, gpa, exit_qualification); > > + /* > > + * In level 1 of SPPT, there's no PRESENT bit, all data is > > + * regarded as permission vector, so need to check from > > + * level 2 to set up the vector if target page is protected. > > + */ > > + spin_lock(&vcpu->kvm->mmu_lock); > > + gfn &= ~(page_num - 1); > > > > > + gfn_max = gfn + page_num - 1; > > s/gfn_max/gfn_end OK. > > > + for (; gfn <= gfn_max; gfn++) { > > My preference would be to do: > gfn_end = gfn + page_num; > > for ( ; gfn < gfn_end; gfn++) > Thank you! > > + slot = gfn_to_memslot(vcpu->kvm, gfn);
On Mon, Jan 13, 2020 at 04:10:50PM +0800, Yang Weijiang wrote: > On Fri, Jan 10, 2020 at 10:04:59AM -0800, Sean Christopherson wrote: > > On Thu, Jan 02, 2020 at 02:13:15PM +0800, Yang Weijiang wrote: > > > @@ -3585,7 +3602,30 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, > > > if ((error_code & PFERR_WRITE_MASK) && > > > spte_can_locklessly_be_made_writable(spte)) > > > { > > > - new_spte |= PT_WRITABLE_MASK; > > > + /* > > > + * Record write protect fault caused by > > > + * Sub-page Protection, let VMI decide > > > + * the next step. > > > + */ > > > + if (spte & PT_SPP_MASK) { > > > + int len = kvm_x86_ops->get_inst_len(vcpu); > > > > There's got to be a better way to handle SPP exits than adding a helper > > to retrieve the instruction length. > > > The fault instruction was skipped by kvm_skip_emulated_instruction() > before, but Paolo suggested leave the re-do or skip option to user-space > to make it flexible for write protection or write tracking, so return > length to user-space. Sorry, my comment was unclear. I have no objection to punting the fault to userspace, it's the mechanics of how it's done that I dislike. Specifically, (a) using run->exit_reason to propagate the SPP exit up the stack, e.g. instead of modifying affected call stacks to play nice with any exit to userspace, (b) assuming ->get_insn_len() will always be accurate, e.g. see the various caveats in skip_emulated_instruction() for both VMX and SVM, and (c) duplicating the state capture code in every location that can encounter a SPP fault. What I'm hoping is that it's possible to modify the call stacks to explicitly propagate an exit to userspace and/or SPP fault, and shove all the state capture into a common location, e.g. handle_ept_violation(). Side topic, assuming the userspace VMI is going to be introspecting the faulting instruction, won't it decode the instruction? I.e. calculate the instruction length anyways?
On Mon, 13 Jan 2020 09:33:58 -0800, Sean Christopherson <sean.j.christopherson@intel.com> wrote: > On Mon, Jan 13, 2020 at 04:10:50PM +0800, Yang Weijiang wrote: > > On Fri, Jan 10, 2020 at 10:04:59AM -0800, Sean Christopherson wrote: > > > On Thu, Jan 02, 2020 at 02:13:15PM +0800, Yang Weijiang wrote: > > > > @@ -3585,7 +3602,30 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, > > > > if ((error_code & PFERR_WRITE_MASK) && > > > > spte_can_locklessly_be_made_writable(spte)) > > > > { > > > > - new_spte |= PT_WRITABLE_MASK; > > > > + /* > > > > + * Record write protect fault caused by > > > > + * Sub-page Protection, let VMI decide > > > > + * the next step. > > > > + */ > > > > + if (spte & PT_SPP_MASK) { > > > > + int len = kvm_x86_ops->get_inst_len(vcpu); > > > > > > There's got to be a better way to handle SPP exits than adding a helper > > > to retrieve the instruction length. > > > > > The fault instruction was skipped by kvm_skip_emulated_instruction() > > before, but Paolo suggested leave the re-do or skip option to user-space > > to make it flexible for write protection or write tracking, so return > > length to user-space. > > Sorry, my comment was unclear. I have no objection to punting the fault > to userspace, it's the mechanics of how it's done that I dislike. > > Specifically, (a) using run->exit_reason to propagate the SPP exit up the > stack, e.g. instead of modifying affected call stacks to play nice with > any exit to userspace, (b) assuming ->get_insn_len() will always be > accurate, e.g. see the various caveats in skip_emulated_instruction() for > both VMX and SVM, and (c) duplicating the state capture code in every > location that can encounter a SPP fault. > > What I'm hoping is that it's possible to modify the call stacks to > explicitly propagate an exit to userspace and/or SPP fault, and shove all > the state capture into a common location, e.g. handle_ept_violation(). 
> > Side topic, assuming the userspace VMI is going to be instrospecting the > faulting instruction, won't it decode the instruction? I.e. calculate > the instruction length anyways? Indeed, we decode the instruction from userspace. I don't know if the instruction length helps other projects. Added Tamas and Mathieu. In our last VMI API proposal, the breakpoint event had the instruction length sent to userspace, but I can't remember why. https://lore.kernel.org/kvm/20190809160047.8319-62-alazar@bitdefender.com/
On Mon, Jan 13, 2020 at 08:55:46PM +0200, Adalbert Lazăr wrote: > On Mon, 13 Jan 2020 09:33:58 -0800, Sean Christopherson <sean.j.christopherson@intel.com> wrote: > > On Mon, Jan 13, 2020 at 04:10:50PM +0800, Yang Weijiang wrote: > > > On Fri, Jan 10, 2020 at 10:04:59AM -0800, Sean Christopherson wrote: > > > > On Thu, Jan 02, 2020 at 02:13:15PM +0800, Yang Weijiang wrote: > > > > > @@ -3585,7 +3602,30 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, > > > > > if ((error_code & PFERR_WRITE_MASK) && > > > > > spte_can_locklessly_be_made_writable(spte)) > > > > > { > > > > > - new_spte |= PT_WRITABLE_MASK; > > > > > + /* > > > > > + * Record write protect fault caused by > > > > > + * Sub-page Protection, let VMI decide > > > > > + * the next step. > > > > > + */ > > > > > + if (spte & PT_SPP_MASK) { > > > > > + int len = kvm_x86_ops->get_inst_len(vcpu); > > > > > > > > There's got to be a better way to handle SPP exits than adding a helper > > > > to retrieve the instruction length. > > > > > > > The fault instruction was skipped by kvm_skip_emulated_instruction() > > > before, but Paolo suggested leave the re-do or skip option to user-space > > > to make it flexible for write protection or write tracking, so return > > > length to user-space. > > > > Sorry, my comment was unclear. I have no objection to punting the fault > > to userspace, it's the mechanics of how it's done that I dislike. > > > > Specifically, (a) using run->exit_reason to propagate the SPP exit up the > > stack, e.g. instead of modifying affected call stacks to play nice with > > any exit to userspace, (b) assuming ->get_insn_len() will always be > > accurate, e.g. see the various caveats in skip_emulated_instruction() for > > both VMX and SVM, and (c) duplicating the state capture code in every > > location that can encounter a SPP fault. 
> > > > What I'm hoping is that it's possible to modify the call stacks to > > explicitly propagate an exit to userspace and/or SPP fault, and shove all > > the state capture into a common location, e.g. handle_ept_violation(). > > > > Side topic, assuming the userspace VMI is going to be instrospecting the > > faulting instruction, won't it decode the instruction? I.e. calculate > > the instruction length anyways? > > Indeed, we decode the instruction from userspace. I don't know if the > instruction length helps other projects. Added Tamas and Mathieu. > > In our last VMI API proposal, the breakpoint event had the instruction > length sent to userspace, but I can't remember why. INT3 is trap-like, i.e. the VM-Exit occurs after the instruction retires. It's impossible for software to know how far to unwind RIP without the instruction length being provided by hardware/KVM, e.g. if the guest is being silly and prepends ignored prefixes on the INT3. Self-aware software has a priori knowledge of what's being patched in, and practically speaking I don't think any well-behaved sane software uses prefixes with INT3, but from a VMM's perspective it's legal and possible.
On Mon, Jan 13, 2020 at 09:33:58AM -0800, Sean Christopherson wrote: > On Mon, Jan 13, 2020 at 04:10:50PM +0800, Yang Weijiang wrote: > > On Fri, Jan 10, 2020 at 10:04:59AM -0800, Sean Christopherson wrote: > > > On Thu, Jan 02, 2020 at 02:13:15PM +0800, Yang Weijiang wrote: > > > > @@ -3585,7 +3602,30 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, > > > > if ((error_code & PFERR_WRITE_MASK) && > > > > spte_can_locklessly_be_made_writable(spte)) > > > > { > > > > - new_spte |= PT_WRITABLE_MASK; > > > > + /* > > > > + * Record write protect fault caused by > > > > + * Sub-page Protection, let VMI decide > > > > + * the next step. > > > > + */ > > > > + if (spte & PT_SPP_MASK) { > > > > + int len = kvm_x86_ops->get_inst_len(vcpu); > > > > > > There's got to be a better way to handle SPP exits than adding a helper > > > to retrieve the instruction length. > > > > > The fault instruction was skipped by kvm_skip_emulated_instruction() > > before, but Paolo suggested leave the re-do or skip option to user-space > > to make it flexible for write protection or write tracking, so return > > length to user-space. > > Sorry, my comment was unclear. I have no objection to punting the fault > to userspace, it's the mechanics of how it's done that I dislike. > > Specifically, (a) using run->exit_reason to propagate the SPP exit up the > stack, e.g. instead of modifying affected call stacks to play nice with > any exit to userspace, (b) assuming ->get_insn_len() will always be > accurate, e.g. see the various caveats in skip_emulated_instruction() for > both VMX and SVM, and (c) duplicating the state capture code in every > location that can encounter a SPP fault. How about calling skip_emulated_instruction() in KVM before exit to userspace, but still return the skipped instruction length, if userspace would like to re-execute the instruction, it can unwind RIP or simply rely on KVM? 
> > What I'm hoping is that it's possible to modify the call stacks to > explicitly propagate an exit to userspace and/or SPP fault, and shove all > the state capture into a common location, e.g. handle_ept_violation(). > The problem is, the state capture code in fast_page_fault() and emulation case share different causes, the former is generic occurrence of SPP induced EPT violation, the latter is actually a "faked" one while detecting emulation instruction is writing some SPP protected area, so I separated them. > Side topic, assuming the userspace VMI is going to be instrospecting the > faulting instruction, won't it decode the instruction? I.e. calculate > the instruction length anyways?
On Tue, Jan 14, 2020 at 11:08:20AM +0800, Yang Weijiang wrote: > On Mon, Jan 13, 2020 at 09:33:58AM -0800, Sean Christopherson wrote: > > On Mon, Jan 13, 2020 at 04:10:50PM +0800, Yang Weijiang wrote: > > > On Fri, Jan 10, 2020 at 10:04:59AM -0800, Sean Christopherson wrote: > > > > On Thu, Jan 02, 2020 at 02:13:15PM +0800, Yang Weijiang wrote: > > > > > @@ -3585,7 +3602,30 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, > > > > > if ((error_code & PFERR_WRITE_MASK) && > > > > > spte_can_locklessly_be_made_writable(spte)) > > > > > { > > > > > - new_spte |= PT_WRITABLE_MASK; > > > > > + /* > > > > > + * Record write protect fault caused by > > > > > + * Sub-page Protection, let VMI decide > > > > > + * the next step. > > > > > + */ > > > > > + if (spte & PT_SPP_MASK) { > > > > > + int len = kvm_x86_ops->get_inst_len(vcpu); > > > > > > > > There's got to be a better way to handle SPP exits than adding a helper > > > > to retrieve the instruction length. > > > > > > > The fault instruction was skipped by kvm_skip_emulated_instruction() > > > before, but Paolo suggested leave the re-do or skip option to user-space > > > to make it flexible for write protection or write tracking, so return > > > length to user-space. > > > > Sorry, my comment was unclear. I have no objection to punting the fault > > to userspace, it's the mechanics of how it's done that I dislike. > > > > Specifically, (a) using run->exit_reason to propagate the SPP exit up the > > stack, e.g. instead of modifying affected call stacks to play nice with > > any exit to userspace, (b) assuming ->get_insn_len() will always be > > accurate, e.g. see the various caveats in skip_emulated_instruction() for > > both VMX and SVM, and (c) duplicating the state capture code in every > > location that can encounter a SPP fault. > > How about calling skip_emulated_instruction() in KVM before exit to I'm confused. 
It sounds like KVM_EXIT_SPP provides the instruction length because it skips an instruction before exiting to userspace. But if KVM is emulating an instruction, it shouldn't be doing {kvm_}skip_emulated_instruction(), e.g. if emulation fails due to a SPP violation (returns KVM_EXIT_SPP) then GUEST_RIP should still point at the exiting instruction. Ditto for the fast_page_fault() case, RIP shouldn't be advanced. What am I missing? > userspace, but still return the skipped instruction length, if userspace > would like to re-execute the instruction, it can unwind RIP or simply > rely on KVM? I'm not convinced the instruction length needs to be provided to userspace for this case. Obviously it's not difficult to provide the info, I just don't understand the value added by doing so. As above, RIP shouldn't need to be unwound, and blindly skipping an instruction seems like an odd thing for a VMI engine to do. > > What I'm hoping is that it's possible to modify the call stacks to > > explicitly propagate an exit to userspace and/or SPP fault, and shove all > > the state capture into a common location, e.g. handle_ept_violation(). > > > The problem is, the state capture code in fast_page_fault() and > emulation case share different causes, the former is generic occurence > of SPP induced EPT violation, the latter is atually a "faked" one while > detecting emulation instruction is writing some SPP protected area, so I > seperated them. Can we make SPP dependent on unrestricted guest so that the only entry point to the emulator is through handle_ept_violation()? And thus the only path to triggering KVM_EXIT_SPP would also be through handle_ept_violation(); (I think, might be forgetting a different emulation path). > > > Side topic, assuming the userspace VMI is going to be instrospecting the > > faulting instruction, won't it decode the instruction? I.e. calculate > > the instruction length anyways?
On Tue, Jan 14, 2020 at 10:58:08AM -0800, Sean Christopherson wrote: > On Tue, Jan 14, 2020 at 11:08:20AM +0800, Yang Weijiang wrote: > > On Mon, Jan 13, 2020 at 09:33:58AM -0800, Sean Christopherson wrote: > > > On Mon, Jan 13, 2020 at 04:10:50PM +0800, Yang Weijiang wrote: > > > > On Fri, Jan 10, 2020 at 10:04:59AM -0800, Sean Christopherson wrote: > > > > > On Thu, Jan 02, 2020 at 02:13:15PM +0800, Yang Weijiang wrote: > > > > > > @@ -3585,7 +3602,30 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, > > > > > > if ((error_code & PFERR_WRITE_MASK) && > > > > > > spte_can_locklessly_be_made_writable(spte)) > > > > > > { > > > > > > - new_spte |= PT_WRITABLE_MASK; > > > > > > + /* > > > > > > + * Record write protect fault caused by > > > > > > + * Sub-page Protection, let VMI decide > > > > > > + * the next step. > > > > > > + */ > > > > > > + if (spte & PT_SPP_MASK) { > > > > > > + int len = kvm_x86_ops->get_inst_len(vcpu); > > > > > > > > > > There's got to be a better way to handle SPP exits than adding a helper > > > > > to retrieve the instruction length. > > > > > > > > > The fault instruction was skipped by kvm_skip_emulated_instruction() > > > > before, but Paolo suggested leave the re-do or skip option to user-space > > > > to make it flexible for write protection or write tracking, so return > > > > length to user-space. > > > > > > Sorry, my comment was unclear. I have no objection to punting the fault > > > to userspace, it's the mechanics of how it's done that I dislike. > > > > > > Specifically, (a) using run->exit_reason to propagate the SPP exit up the > > > stack, e.g. instead of modifying affected call stacks to play nice with > > > any exit to userspace, (b) assuming ->get_insn_len() will always be > > > accurate, e.g. see the various caveats in skip_emulated_instruction() for > > > both VMX and SVM, and (c) duplicating the state capture code in every > > > location that can encounter a SPP fault. 
> > > > How about calling skip_emulated_instruction() in KVM before exit to > > I'm confused. It sounds like KVM_EXIT_SPP provides the instruction length > because it skips an instruction before exiting to userspace. But if KVM > is is emulating an instruction, it shouldn't be doing > {kvm_}skip_emulated_instruction(), e.g. if emulation fails due to a SPP > violation (returns KVM_EXIT_SPP) then GUEST_RIP should still point at the > exiting instruction. Ditto for the fast_page_fault() case, RIP shouldn't > be advanced. There're two SPP usages, one is for write-protection the other is for write-tracking. If the first case is being used, KVM ignores the write, i.e., write to the memory is discarded. The second case is, if userspace is tracking memory write through SPP, then it's notified via KVM_EXIT_SPP but still let the write take effect by unprotecting the subpage, i.e., like generic 4KB access-tracking. In the first case, no necessity to re-try the faulted instruction, the second case, a re-try is necessary, so I would skip current instruction first, then if it's actually the second case, userspace should take action based on the instruction length returned. > > What am I missing? > > > userspace, but still return the skipped instruction length, if userspace > > would like to re-execute the instruction, it can unwind RIP or simply > > rely on KVM? > > I'm not convinced the instruction length needs to be provided to userspace > for this case. Obviously it's not difficult to provide the info, I just > don't understand the value added by doing so. As above, RIP shouldn't > need to be unwound, and blindly skipping an instruction seems like an odd > thing for a VMI engine to do. In the last review by Paolo, he mentioned SPP could be used in access-tracing manner, it's flexible to provide instruction length to userspace, so I removed instruction-skip in KVM but let userspace decide. 
> > > > What I'm hoping is that it's possible to modify the call stacks to > > explicitly propagate an exit to userspace and/or SPP fault, and shove all > > the state capture into a common location, e.g. handle_ept_violation(). > > > > > The problem is, the state capture code in fast_page_fault() and > > emulation case share different causes, the former is generic occurence > > of SPP induced EPT violation, the latter is atually a "faked" one while > > detecting emulation instruction is writing some SPP protected area, so I > > seperated them. > > Can we make SPP dependent on unrestricted guest so that the only entry > point to the emulator is through handle_ept_violation()? And thus the > only path to triggering KVM_EXIT_SPP would also be through > handle_ept_violation(); (I think, might be forgetting a different emulation > path). > I don't get your point; from my understanding, instruction emulation is used in several cases regardless of guest working mode. As long as any memory write happens, e.g., string ops, port ops etc, SPP write-protection check should be applied to let the userspace capture the event. > > > > > Side topic, assuming the userspace VMI is going to be instrospecting the > > > faulting instruction, won't it decode the instruction? I.e. calculate > > > the instruction length anyways?
On 10/01/20 18:55, Sean Christopherson wrote: > Unless there is a *very* good reason for these shenanigans, spp.c needs > to built via the Makefile like any other source. If this is justified > for whatever reason, then that justification needs to be very clearly > stated in the changelog. Well, this #include is the reason why I moved mmu.c to mmu/spp.c. It shouldn't be hard to create a mmu_internal.h header for things that have to be shared between mmu.c and spp.c, but I'm okay with having the #include "spp.c" for now and cleaning it up later. The spp.c file, albeit ugly, provides hints as to what to put in that header; without it it's a pointless exercise. Note that there isn't anything really VMX specific even in the few vmx_* functions of spp.c. My suggestion is just to rename them to kvm_mmu_*. Paolo
On 14/01/20 19:58, Sean Christopherson wrote: > I'm not convinced the instruction length needs to be provided to userspace > for this case. Obviously it's not difficult to provide the info, I just > don't understand the value added by doing so. As above, RIP shouldn't > need to be unwound, and blindly skipping an instruction seems like an odd > thing for a VMI engine to do. > The reason to introduce the instruction length was so that userspace could blindly use SPP to emulate ROM behavior. Weijiang's earlier patches in fact _only_ provided that behavior, and I asked him to change it so that, by default, using SPP and not handling it will basically cause an infinite loop of SPP violations. One possibility to clean things up is to change "fault_handled" and fast_page_fault's return value from bool to RET_PF* (false becomes RET_PF_INVALID, true becomes RET_PF_RETRY). direct_page_fault would also have to do something like r = fast_page_fault(vcpu, gpa, level, error_code)) if (r != RET_PF_INVALID) return r; Then fast_page_fault can just return RET_PF_USERSPACE, and this ugly case can go away. + if (vcpu->run->exit_reason == KVM_EXIT_SPP) + r = RET_PF_USERSPACE; + Thanks, Paolo
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index de79a4de0723..81faf6607a9e 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -213,6 +213,8 @@ enum vmcs_field { XSS_EXIT_BITMAP_HIGH = 0x0000202D, ENCLS_EXITING_BITMAP = 0x0000202E, ENCLS_EXITING_BITMAP_HIGH = 0x0000202F, + SPPT_POINTER = 0x00002030, + SPPT_POINTER_HIGH = 0x00002031, TSC_MULTIPLIER = 0x00002032, TSC_MULTIPLIER_HIGH = 0x00002033, GUEST_PHYSICAL_ADDRESS = 0x00002400, @@ -534,6 +536,13 @@ struct vmx_msr_entry { #define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT) #define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT) +/* + * Exit Qualifications for SPPT-Induced vmexits + */ +#define SPPT_INDUCED_EXIT_TYPE_BIT 11 +#define SPPT_INDUCED_EXIT_TYPE (1 << SPPT_INDUCED_EXIT_TYPE_BIT) +#define SPPT_INTR_INFO_UNBLOCK_NMI INTR_INFO_UNBLOCK_NMI + /* * VM-instruction error numbers */ diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 3eb8411ab60e..4833a39a799b 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -86,6 +86,7 @@ #define EXIT_REASON_PML_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 +#define EXIT_REASON_SPP 66 #define EXIT_REASON_UMWAIT 67 #define EXIT_REASON_TPAUSE 68 @@ -145,6 +146,7 @@ { EXIT_REASON_ENCLS, "ENCLS" }, \ { EXIT_REASON_RDSEED, "RDSEED" }, \ { EXIT_REASON_PML_FULL, "PML_FULL" }, \ + { EXIT_REASON_SPP, "SPP" }, \ { EXIT_REASON_XSAVES, "XSAVES" }, \ { EXIT_REASON_XRSTORS, "XRSTORS" }, \ { EXIT_REASON_UMWAIT, "UMWAIT" }, \ diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6f92b40d798c..c41791ebee65 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -20,6 +20,7 @@ #include "x86.h" #include "kvm_cache_regs.h" #include "cpuid.h" +#include "spp.h" #include <linux/kvm_host.h> #include <linux/types.h> @@ -177,6 +178,7 @@ module_param(dbg, bool, 0644); /* The mask for the R/X bits in 
EPT PTEs */ #define PT64_EPT_READABLE_MASK 0x1ull #define PT64_EPT_EXECUTABLE_MASK 0x4ull +#define PT64_SPP_SAVED_BIT (1ULL << (PT64_SECOND_AVAIL_BITS_SHIFT + 1)) #include <trace/events/kvm.h> @@ -200,6 +202,7 @@ enum { RET_PF_RETRY = 0, RET_PF_EMULATE = 1, RET_PF_INVALID = 2, + RET_PF_USERSPACE = 3, }; struct pte_list_desc { @@ -979,6 +982,11 @@ static u64 mark_spte_for_access_track(u64 spte) shadow_acc_track_saved_bits_shift; spte &= ~shadow_acc_track_mask; + if (spte & PT_SPP_MASK) { + spte &= ~PT_SPP_MASK; + spte |= PT64_SPP_SAVED_BIT; + } + return spte; } @@ -1677,9 +1685,15 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep) { bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, (unsigned long *)sptep); + bool was_spp_armed = test_and_clear_bit(PT_SPP_SHIFT, + (unsigned long *)sptep); + if (was_writable && !spte_ad_enabled(*sptep)) kvm_set_pfn_dirty(spte_to_pfn(*sptep)); + if (was_spp_armed) + *sptep |= PT64_SPP_SAVED_BIT; + return was_writable; } @@ -3478,7 +3492,8 @@ static bool page_fault_can_be_fast(u32 error_code) */ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *sptep, u64 old_spte, u64 new_spte) + u64 *sptep, u64 old_spte, u64 new_spte, + bool spp_protected) { gfn_t gfn; @@ -3499,7 +3514,8 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) return false; - if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) { + if ((is_writable_pte(new_spte) && !is_writable_pte(old_spte)) || + spp_protected) { /* * The gfn of direct spte is stable since it is * calculated by sp->gfn. 
@@ -3534,6 +3550,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; bool fault_handled = false; + bool spp_protected = false; u64 spte = 0ull; uint retry_count = 0; @@ -3585,7 +3602,30 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, if ((error_code & PFERR_WRITE_MASK) && spte_can_locklessly_be_made_writable(spte)) { - new_spte |= PT_WRITABLE_MASK; + /* + * Record write protect fault caused by + * Sub-page Protection, let VMI decide + * the next step. + */ + if (spte & PT_SPP_MASK) { + int len = kvm_x86_ops->get_inst_len(vcpu); + + fault_handled = true; + vcpu->run->exit_reason = KVM_EXIT_SPP; + vcpu->run->spp.addr = gva; + vcpu->run->spp.ins_len = len; + trace_kvm_spp_induced_page_fault(vcpu, + gva, + len); + break; + } + + if (was_spp_armed(new_spte)) { + restore_spp_bit(&new_spte); + spp_protected = true; + } else { + new_spte |= PT_WRITABLE_MASK; + } /* * Do not fix write-permission on the large spte. 
Since @@ -3604,7 +3644,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, /* Verify that the fault can be handled in the fast path */ if (new_spte == spte || - !is_access_allowed(error_code, new_spte)) + (!is_access_allowed(error_code, new_spte) && + !spp_protected)) break; /* @@ -3614,7 +3655,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, */ fault_handled = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, - new_spte); + new_spte, + spp_protected); if (fault_handled) break; @@ -3740,6 +3782,12 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, &invalid_list); + if (vcpu->kvm->arch.spp_active) { + vcpu->kvm->arch.spp_active = false; + mmu_free_root_page(vcpu->kvm, + &vcpu->kvm->arch.sppt_root, + &invalid_list); + } } else { for (i = 0; i < 4; ++i) if (mmu->pae_root[i] != 0) @@ -5223,6 +5271,8 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots) uint i; vcpu->arch.mmu->root_hpa = INVALID_PAGE; + if (!vcpu->kvm->arch.spp_active) + vcpu->kvm->arch.sppt_root = INVALID_PAGE; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; @@ -5539,6 +5589,10 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, r = vcpu->arch.mmu->page_fault(vcpu, cr2, lower_32_bits(error_code), false); + + if (vcpu->run->exit_reason == KVM_EXIT_SPP) + r = RET_PF_USERSPACE; + WARN_ON(r == RET_PF_INVALID); } @@ -5546,7 +5600,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, return 1; if (r < 0) return r; - + if (r == RET_PF_USERSPACE) + return 0; /* * Before emulating the instruction, check if the error code * was due to a RO violation while translating the guest page. 
@@ -6372,6 +6427,8 @@ unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm) return nr_mmu_pages; } +#include "spp.c" + void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); diff --git a/arch/x86/kvm/mmu/spp.c b/arch/x86/kvm/mmu/spp.c index 6f611e04e817..3ec434140967 100644 --- a/arch/x86/kvm/mmu/spp.c +++ b/arch/x86/kvm/mmu/spp.c @@ -17,6 +17,18 @@ static void shadow_spp_walk_init(struct kvm_shadow_walk_iterator *iterator, iterator->level = PT64_ROOT_4LEVEL; } +/* Restore an spp armed PTE */ +void restore_spp_bit(u64 *spte) +{ + *spte &= ~PT64_SPP_SAVED_BIT; + *spte |= PT_SPP_MASK; +} + +bool was_spp_armed(u64 spte) +{ + return !!(spte & PT64_SPP_SAVED_BIT); +} + u32 *gfn_to_subpage_wp_info(struct kvm_memory_slot *slot, gfn_t gfn) { unsigned long idx; @@ -459,6 +471,7 @@ int kvm_spp_set_permission(struct kvm *kvm, u64 gfn, u32 npages, if (!access) return -EFAULT; *access = access_map[i]; + trace_kvm_spp_set_subpages(vcpu, gfn, *access); } gfn = old_gfn; diff --git a/arch/x86/kvm/mmu/spp.h b/arch/x86/kvm/mmu/spp.h index daa69bd274da..5ffe06d2cd6f 100644 --- a/arch/x86/kvm/mmu/spp.h +++ b/arch/x86/kvm/mmu/spp.h @@ -11,6 +11,8 @@ int kvm_spp_set_permission(struct kvm *kvm, u64 gfn, u32 npages, u32 *access_map); int kvm_spp_mark_protection(struct kvm *kvm, u64 gfn, u32 access); bool is_spp_spte(struct kvm_mmu_page *sp); +void restore_spp_bit(u64 *spte); +bool was_spp_armed(u64 spte); u64 construct_spptp(unsigned long root_hpa); int kvm_vm_ioctl_get_subpages(struct kvm *kvm, u64 gfn, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 7c741a0c5f80..293b91d516b1 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1496,6 +1496,72 @@ TRACE_EVENT(kvm_nested_vmenter_failed, __print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS)) ); +TRACE_EVENT(kvm_spp_set_subpages, + TP_PROTO(struct kvm_vcpu *vcpu, gfn_t gfn, u32 access), + TP_ARGS(vcpu, gfn, access), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(gfn_t, 
gfn) + __field(u32, access) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->gfn = gfn; + __entry->access = access; + ), + + TP_printk("vcpu %d gfn %llx access %x", + __entry->vcpu_id, + __entry->gfn, + __entry->access) +); + +TRACE_EVENT(kvm_spp_induced_exit, + TP_PROTO(struct kvm_vcpu *vcpu, gpa_t gpa, u32 qualification), + TP_ARGS(vcpu, gpa, qualification), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(gpa_t, gpa) + __field(u32, qualification) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->gpa = gpa; + __entry->qualification = qualification; + ), + + TP_printk("vcpu %d gpa %llx qualificaiton %x", + __entry->vcpu_id, + __entry->gpa, + __entry->qualification) +); + +TRACE_EVENT(kvm_spp_induced_page_fault, + TP_PROTO(struct kvm_vcpu *vcpu, gpa_t gpa, int inst_len), + TP_ARGS(vcpu, gpa, inst_len), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(gpa_t, gpa) + __field(int, inst_len) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->gpa = gpa; + __entry->inst_len = inst_len; + ), + + TP_printk("vcpu %d gpa %llx inst_len %d", + __entry->vcpu_id, + __entry->gpa, + __entry->inst_len) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 24e4e1c47f42..97d862c79124 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -200,7 +200,6 @@ static const struct { [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, }; - #define L1D_CACHE_ORDER 4 static void *vmx_l1d_flush_pages; @@ -2999,6 +2998,7 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) bool update_guest_cr3 = true; unsigned long guest_cr3; u64 eptp; + u64 spptp; guest_cr3 = cr3; if (enable_ept) { @@ -3027,6 +3027,12 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (update_guest_cr3) vmcs_writel(GUEST_CR3, guest_cr3); + + if (kvm->arch.spp_active && 
VALID_PAGE(vcpu->kvm->arch.sppt_root)) { + spptp = construct_spptp(vcpu->kvm->arch.sppt_root); + vmcs_write64(SPPT_POINTER, spptp); + vmx_flush_tlb(vcpu, true); + } } int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -5361,6 +5367,74 @@ static int handle_monitor_trap(struct kvm_vcpu *vcpu) return 1; } +int handle_spp(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + struct kvm_memory_slot *slot; + gpa_t gpa; + gfn_t gfn; + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + /* + * SPP VM exit happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + * There are errata that may cause this bit to not be set: + * AAK134, BY25. + */ + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + (exit_qualification & SPPT_INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + + vcpu->arch.exit_qualification = exit_qualification; + if (exit_qualification & SPPT_INDUCED_EXIT_TYPE) { + int page_num = KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL); + u32 *access; + gfn_t gfn_max; + + /* + * SPPT missing + * We don't set SPP write access for the corresponding + * GPA, if we haven't setup, we need to construct + * SPP table here. + */ + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + gfn = gpa >> PAGE_SHIFT; + trace_kvm_spp_induced_exit(vcpu, gpa, exit_qualification); + /* + * In level 1 of SPPT, there's no PRESENT bit, all data is + * regarded as permission vector, so need to check from + * level 2 to set up the vector if target page is protected. 
+ */ + spin_lock(&vcpu->kvm->mmu_lock); + gfn &= ~(page_num - 1); + gfn_max = gfn + page_num - 1; + for (; gfn <= gfn_max; gfn++) { + slot = gfn_to_memslot(vcpu->kvm, gfn); + if (!slot) + continue; + access = gfn_to_subpage_wp_info(slot, gfn); + if (access && *access != FULL_SPP_ACCESS) + kvm_spp_setup_structure(vcpu, + *access, + gfn); + } + spin_unlock(&vcpu->kvm->mmu_lock); + return 1; + } + /* + * SPPT Misconfig + * This is probably caused by some mis-configuration in SPPT + * entries, cannot handle it here, escalate the fault to + * emulator. + */ + WARN_ON(1); + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_SPP; + return 0; +} + static int handle_monitor(struct kvm_vcpu *vcpu) { printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); @@ -5575,6 +5649,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_INVVPID] = handle_vmx_instruction, [EXIT_REASON_RDRAND] = handle_invalid_op, [EXIT_REASON_RDSEED] = handle_invalid_op, + [EXIT_REASON_SPP] = handle_spp, [EXIT_REASON_PML_FULL] = handle_pml_full, [EXIT_REASON_INVPCID] = handle_invpcid, [EXIT_REASON_VMFUNC] = handle_vmx_instruction, @@ -5807,6 +5882,9 @@ void dump_vmcs(void) pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); + if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_SPP)) + pr_err("SPPT pointer = 0x%016llx\n", vmcs_read64(SPPT_POINTER)); + n = vmcs_read32(CR3_TARGET_COUNT); for (i = 0; i + 1 < n; i += 4) pr_err("CR3 target%u=%016lx target%u=%016lx\n", diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fb7da000ceaf..a9d7fc21dad6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9782,6 +9782,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, } kvm_page_track_free_memslot(free, dont); + kvm_spp_free_memslot(free, dont); } int 
kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, @@ -10406,3 +10407,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_spp_set_subpages); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_spp_induced_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_spp_induced_page_fault); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 09e5e8e6e6dd..c0f3162ee46a 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -244,6 +244,7 @@ struct kvm_hyperv_exit { #define KVM_EXIT_IOAPIC_EOI 26 #define KVM_EXIT_HYPERV 27 #define KVM_EXIT_ARM_NISV 28 +#define KVM_EXIT_SPP 29 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -401,6 +402,11 @@ struct kvm_run { struct { __u8 vector; } eoi; + /* KVM_EXIT_SPP */ + struct { + __u64 addr; + __u8 ins_len; + } spp; /* KVM_EXIT_HYPERV */ struct kvm_hyperv_exit hyperv; /* KVM_EXIT_ARM_NISV */