Message ID | 20210820155918.7518-38-brijesh.singh@amd.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Add AMD Secure Nested Paging (SEV-SNP) Hypervisor Support | expand |
* Brijesh Singh (brijesh.singh@amd.com) wrote: > SEV-SNP VMs can ask the hypervisor to change the page state in the RMP > table to be private or shared using the Page State Change MSR protocol > as defined in the GHCB specification. > > Before changing the page state in the RMP entry, lookup the page in the > NPT to make sure that there is a valid mapping for it. If the mapping > exist then try to find a workable page level between the NPT and RMP for > the page. If the page is not mapped in the NPT, then create a fault such > that it gets mapped before we change the page state in the RMP entry. > > Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> > --- > arch/x86/include/asm/sev-common.h | 9 ++ > arch/x86/kvm/svm/sev.c | 197 ++++++++++++++++++++++++++++++ > arch/x86/kvm/trace.h | 34 ++++++ > arch/x86/kvm/x86.c | 1 + > 4 files changed, 241 insertions(+) > > diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h > index 91089967ab09..4980f77aa1d5 100644 > --- a/arch/x86/include/asm/sev-common.h > +++ b/arch/x86/include/asm/sev-common.h > @@ -89,6 +89,10 @@ enum psc_op { > }; > > #define GHCB_MSR_PSC_REQ 0x014 > +#define GHCB_MSR_PSC_GFN_POS 12 > +#define GHCB_MSR_PSC_GFN_MASK GENMASK_ULL(39, 0) > +#define GHCB_MSR_PSC_OP_POS 52 > +#define GHCB_MSR_PSC_OP_MASK 0xf > #define GHCB_MSR_PSC_REQ_GFN(gfn, op) \ > /* GHCBData[55:52] */ \ > (((u64)((op) & 0xf) << 52) | \ > @@ -98,6 +102,11 @@ enum psc_op { > GHCB_MSR_PSC_REQ) > > #define GHCB_MSR_PSC_RESP 0x015 > +#define GHCB_MSR_PSC_ERROR_POS 32 > +#define GHCB_MSR_PSC_ERROR_MASK GENMASK_ULL(31, 0) > +#define GHCB_MSR_PSC_ERROR GENMASK_ULL(31, 0) > +#define GHCB_MSR_PSC_RSVD_POS 12 > +#define GHCB_MSR_PSC_RSVD_MASK GENMASK_ULL(19, 0) > #define GHCB_MSR_PSC_RESP_VAL(val) \ > /* GHCBData[63:32] */ \ > (((u64)(val) & GENMASK_ULL(63, 32)) >> 32) > diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c > index 991b8c996fc1..6d9483ec91ab 100644 > --- a/arch/x86/kvm/svm/sev.c > +++ b/arch/x86/kvm/svm/sev.c > @@ -31,6 +31,7 @@ > #include "svm_ops.h" > #include "cpuid.h" > #include "trace.h" > +#include "mmu.h" > > #define __ex(x) __kvm_handle_fault_on_reboot(x) > > @@ -2905,6 +2906,181 @@ static void set_ghcb_msr(struct vcpu_svm *svm, u64 value) > svm->vmcb->control.ghcb_gpa = value; > } > > +static int snp_rmptable_psmash(struct kvm *kvm, kvm_pfn_t pfn) > +{ > + pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1); > + > + return psmash(pfn); > +} > + > +static int snp_make_page_shared(struct kvm *kvm, gpa_t gpa, kvm_pfn_t pfn, int level) .... > + > + /* > + * Mark the userspace range unmerable before adding the pages ^^^^^^^^^ typo > + * in the RMP table. > + */ > + mmap_write_lock(kvm->mm); > + rc = snp_mark_unmergable(kvm, hva, page_level_size(level)); > + mmap_write_unlock(kvm->mm); > + if (rc) > + return -EINVAL; > + } > + > + write_lock(&kvm->mmu_lock); > + > + rc = kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn, &npt_level); > + if (!rc) { > + /* > + * This may happen if another vCPU unmapped the page > + * before we acquire the lock. Retry the PSC. > + */ > + write_unlock(&kvm->mmu_lock); > + return 0; > + } > + > + /* > + * Adjust the level so that we don't go higher than the backing > + * page level. > + */ > + level = min_t(size_t, level, npt_level); > + > + trace_kvm_snp_psc(vcpu->vcpu_id, pfn, gpa, op, level); > + > + switch (op) { > + case SNP_PAGE_STATE_SHARED: > + rc = snp_make_page_shared(kvm, gpa, pfn, level); > + break; > + case SNP_PAGE_STATE_PRIVATE: > + rc = rmp_make_private(pfn, gpa, level, sev->asid, false); Minor nit; it seems a shame that snp_make_page_shared and rmp_make_private both take gpa, pfn, level - in different orders. Dave > + break; > + default: > + rc = -EINVAL; > + break; > + } > + > + write_unlock(&kvm->mmu_lock); > + > + if (rc) { > + pr_err_ratelimited("Error op %d gpa %llx pfn %llx level %d rc %d\n", > + op, gpa, pfn, level, rc); > + return rc; > + } > + > + gpa = gpa + page_level_size(level); > + } > + > + return 0; > +} > + > static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) > { > struct vmcb_control_area *control = &svm->vmcb->control; > @@ -3005,6 +3181,27 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) > GHCB_MSR_INFO_POS); > break; > } > + case GHCB_MSR_PSC_REQ: { > + gfn_t gfn; > + int ret; > + enum psc_op op; > + > + gfn = get_ghcb_msr_bits(svm, GHCB_MSR_PSC_GFN_MASK, GHCB_MSR_PSC_GFN_POS); > + op = get_ghcb_msr_bits(svm, GHCB_MSR_PSC_OP_MASK, GHCB_MSR_PSC_OP_POS); > + > + ret = __snp_handle_page_state_change(vcpu, op, gfn_to_gpa(gfn), PG_LEVEL_4K); > + > + if (ret) > + set_ghcb_msr_bits(svm, GHCB_MSR_PSC_ERROR, > + GHCB_MSR_PSC_ERROR_MASK, GHCB_MSR_PSC_ERROR_POS); > + else > + set_ghcb_msr_bits(svm, 0, > + GHCB_MSR_PSC_ERROR_MASK, GHCB_MSR_PSC_ERROR_POS); > + > + set_ghcb_msr_bits(svm, 0, GHCB_MSR_PSC_RSVD_MASK, GHCB_MSR_PSC_RSVD_POS); > + set_ghcb_msr_bits(svm, GHCB_MSR_PSC_RESP, GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS); > + break; > + } > case GHCB_MSR_TERM_REQ: { > u64 reason_set, reason_code; > > diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h > index 1c360e07856f..35ca1cf8440a 100644 > --- a/arch/x86/kvm/trace.h > +++ b/arch/x86/kvm/trace.h > @@ -7,6 +7,7 @@ > #include <asm/svm.h> > #include <asm/clocksource.h> > #include <asm/pvclock-abi.h> > +#include <asm/sev-common.h> > > #undef TRACE_SYSTEM > #define TRACE_SYSTEM kvm > @@ -1711,6 +1712,39 @@ TRACE_EVENT(kvm_vmgexit_msr_protocol_exit, > __entry->vcpu_id, __entry->ghcb_gpa, __entry->result) > ); > > +/* > + * Tracepoint for the SEV-SNP page state change processing > + */ > +#define psc_operation \ > + {SNP_PAGE_STATE_PRIVATE, "private"}, \ > + {SNP_PAGE_STATE_SHARED, "shared"} \ > + > +TRACE_EVENT(kvm_snp_psc, > + TP_PROTO(unsigned int vcpu_id, u64 pfn, u64 gpa, u8 op, int level), > + TP_ARGS(vcpu_id, pfn, gpa, op, level), > + > + TP_STRUCT__entry( > + __field(int, vcpu_id) > + __field(u64, pfn) > + __field(u64, gpa) > + __field(u8, op) > + __field(int, level) > + ), > + > + TP_fast_assign( > + __entry->vcpu_id = vcpu_id; > + __entry->pfn = pfn; > + __entry->gpa = gpa; > + __entry->op = op; > + __entry->level = level; > + ), > + > + TP_printk("vcpu %u, pfn %llx, gpa %llx, op %s, level %d", > + __entry->vcpu_id, __entry->pfn, __entry->gpa, > + __print_symbolic(__entry->op, psc_operation), > + __entry->level) > +); > + > #endif /* _TRACE_KVM_H */ > > #undef TRACE_INCLUDE_PATH > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index e5d5c5ed7dd4..afcdc75a99f2 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -12371,3 +12371,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter); > EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); > EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter); > EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit); > +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_snp_psc); > -- > 2.17.1 > >
On Fri, Aug 20, 2021, Brijesh Singh wrote: > diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c > index 991b8c996fc1..6d9483ec91ab 100644 > --- a/arch/x86/kvm/svm/sev.c > +++ b/arch/x86/kvm/svm/sev.c > @@ -31,6 +31,7 @@ > #include "svm_ops.h" > #include "cpuid.h" > #include "trace.h" > +#include "mmu.h" > > #define __ex(x) __kvm_handle_fault_on_reboot(x) > > @@ -2905,6 +2906,181 @@ static void set_ghcb_msr(struct vcpu_svm *svm, u64 value) > svm->vmcb->control.ghcb_gpa = value; > } > > +static int snp_rmptable_psmash(struct kvm *kvm, kvm_pfn_t pfn) > +{ > + pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1); > + > + return psmash(pfn); > +} > + > +static int snp_make_page_shared(struct kvm *kvm, gpa_t gpa, kvm_pfn_t pfn, int level) > +{ > + int rc, rmp_level; > + > + rc = snp_lookup_rmpentry(pfn, &rmp_level); > + if (rc < 0) > + return -EINVAL; > + > + /* If page is not assigned then do nothing */ > + if (!rc) > + return 0; > + > + /* > + * Is the page part of an existing 2MB RMP entry ? Split the 2MB into > + * multiple of 4K-page before making the memory shared. > + */ > + if (level == PG_LEVEL_4K && rmp_level == PG_LEVEL_2M) { > + rc = snp_rmptable_psmash(kvm, pfn); > + if (rc) > + return rc; > + } > + > + return rmp_make_shared(pfn, level); > +} > + > +static int snp_check_and_build_npt(struct kvm_vcpu *vcpu, gpa_t gpa, int level) > +{ > + struct kvm *kvm = vcpu->kvm; > + int rc, npt_level; > + kvm_pfn_t pfn; > + > + /* > + * Get the pfn and level for the gpa from the nested page table. > + * > + * If the tdp walk fails, then its safe to say that there is no > + * valid mapping for this gpa. Create a fault to build the map. > + */ > + write_lock(&kvm->mmu_lock); SEV (or any vendor code for that matter) should not be taking mmu_lock. All of KVM has somewhat fungible borders between the various components, but IMO this crosses firmly into "this belongs in the MMU" territory. For example, I highly doubt this actually need to take mmu_lock for write. More below. > + rc = kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn, &npt_level); > + write_unlock(&kvm->mmu_lock); What's the point of this walk? As soon as mmu_lock is dropped, all bets are off. At best this is a strong hint. It doesn't hurt anything per se, it's just a waste of cycles. > + if (!rc) { > + pfn = kvm_mmu_map_tdp_page(vcpu, gpa, PFERR_USER_MASK, level); Same here. > + if (is_error_noslot_pfn(pfn)) > + return -EINVAL; > + } > + > + return 0; > +} > + > +static int snp_gpa_to_hva(struct kvm *kvm, gpa_t gpa, hva_t *hva) > +{ > + struct kvm_memory_slot *slot; > + gfn_t gfn = gpa_to_gfn(gpa); > + int idx; > + > + idx = srcu_read_lock(&kvm->srcu); > + slot = gfn_to_memslot(kvm, gfn); > + if (!slot) { > + srcu_read_unlock(&kvm->srcu, idx); > + return -EINVAL; > + } > + > + /* > + * Note, using the __gfn_to_hva_memslot() is not solely for performance, > + * it's also necessary to avoid the "writable" check in __gfn_to_hva_many(), > + * which will always fail on read-only memslots due to gfn_to_hva() assuming > + * writes. > + */ > + *hva = __gfn_to_hva_memslot(slot, gfn); > + srcu_read_unlock(&kvm->srcu, idx); *hva is effectively invalidated the instance kvm->srcu is unlocked, e.g. a pending memslot update can complete immediately after and delete/move the backing memslot. > + > + return 0; > +} > + > +static int __snp_handle_page_state_change(struct kvm_vcpu *vcpu, enum psc_op op, gpa_t gpa, > + int level) > +{ > + struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; > + struct kvm *kvm = vcpu->kvm; > + int rc, npt_level; > + kvm_pfn_t pfn; > + gpa_t gpa_end; > + > + gpa_end = gpa + page_level_size(level); > + > + while (gpa < gpa_end) { > + /* > + * If the gpa is not present in the NPT then build the NPT. > + */ > + rc = snp_check_and_build_npt(vcpu, gpa, level); > + if (rc) > + return -EINVAL; > + > + if (op == SNP_PAGE_STATE_PRIVATE) { > + hva_t hva; > + > + if (snp_gpa_to_hva(kvm, gpa, &hva)) > + return -EINVAL; > + > + /* > + * Verify that the hva range is registered. This enforcement is > + * required to avoid the cases where a page is marked private > + * in the RMP table but never gets cleanup during the VM > + * termination path. > + */ > + mutex_lock(&kvm->lock); > + rc = is_hva_registered(kvm, hva, page_level_size(level)); This will get a false negative if a hva+size spans two contiguous regions. Also, storing a boolean return in a variable that is an int _and_ was already used for the kernel's standard > + mutex_unlock(&kvm->lock); This is also subject to races, e.g. userspace unregisters the hva immediately after this check, before KVM makes whatever conversion it makes below. A linear walk through a list to find a range is also a bad idea, e.g. pathological worst case scenario is that userspace has created tens of thousands of individual regions. There is no restriction on the number of regions, just the number of pages that can be pinned. I dislike the svm_(un)register_enc_region() scheme in general, but at least for SEV and SEV-ES the code is isolated, e.g. KVM is little more than a dump pipe to let userspace pin pages. I would like to go the opposite direction and work towards eliminating regions_list (or at least making it optional), not build more stuff on top. The more I look at this, the more strongly I feel that private <=> shared conversions belong in the MMU, and that KVM's SPTEs should be the single source of truth for shared vs. private. E.g. add a SPTE_TDP_PRIVATE_MASK in the software available bits. I believe the only hiccup is the snafu where not zapping _all_ SPTEs on memslot deletion breaks QEMU+VFIO+GPU, i.e. KVM would lose its canonical info on unrelated memslot deletion. But that is a solvable problem. Ideally the bug, wherever it is, would be root caused and fixed. I believe Peter (and Marc?) is going to work on reproducing the bug. If we are unable to root cause and fix the bug, I think a viable workaround would be to clear the hardware present bit in unrelated SPTEs, but keep the SPTEs themselves. The idea mostly the same as the ZAPPED_PRIVATE concept from the initial TDX RFC. MMU notifier invalidations, memslot removal, RMP restoration, etc... would all continue to work since the SPTEs is still there, and KVM's page fault handler could audit any "blocked" SPTE when it's refaulted (I'm pretty sure it'd be impossible for the PFN to change, since any PFN change would require a memslot update or mmu_notifier invalidation). The downside to that approach is that it would require walking all SPTEs to do a memslot deletion, i.e. we'd lose the "fast zap" behavior. If that's a performance issue, the behavior could be opt-in (but not for SNP/TDX). > + if (!rc) > + return -EINVAL; > + > + /* > + * Mark the userspace range unmerable before adding the pages > + * in the RMP table. > + */ > + mmap_write_lock(kvm->mm); > + rc = snp_mark_unmergable(kvm, hva, page_level_size(level)); > + mmap_write_unlock(kvm->mm); As mentioned in an earlier patch, this simply cannot work. > + if (rc) > + return -EINVAL; > + } > + > + write_lock(&kvm->mmu_lock); > + > + rc = kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn, &npt_level); Same comment about the bool into int. Though in this case I'd say have kvm_mmu_get_tdp_walk() return 0/-errno, not a bool. Boolean returns for helpers without "is_", "test_", etc... are generally confusing. > + if (!rc) { > + /* > + * This may happen if another vCPU unmapped the page > + * before we acquire the lock. Retry the PSC. > + */ > + write_unlock(&kvm->mmu_lock); > + return 0; How will the caller (guest?) know to retry the PSC if KVM returns "success"? > + } > + > + /* > + * Adjust the level so that we don't go higher than the backing > + * page level. > + */ > + level = min_t(size_t, level, npt_level); > + > + trace_kvm_snp_psc(vcpu->vcpu_id, pfn, gpa, op, level); > + > + switch (op) { > + case SNP_PAGE_STATE_SHARED: > + rc = snp_make_page_shared(kvm, gpa, pfn, level); > + break; > + case SNP_PAGE_STATE_PRIVATE: > + rc = rmp_make_private(pfn, gpa, level, sev->asid, false); > + break; > + default: > + rc = -EINVAL; Not that it really matters, because I don't think the MADV_* approach is viable, but this neglects to undo snp_mark_unmergable() on failure. > + break; > + } > + > + write_unlock(&kvm->mmu_lock); > + > + if (rc) { > + pr_err_ratelimited("Error op %d gpa %llx pfn %llx level %d rc %d\n", > + op, gpa, pfn, level, rc); > + return rc; > + } > + > + gpa = gpa + page_level_size(level); > + } > + > + return 0; > +} > + > static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) > { > struct vmcb_control_area *control = &svm->vmcb->control;
On Tue, Oct 12, 2021, Sean Christopherson wrote: > If we are unable to root cause and fix the bug, I think a viable workaround would > be to clear the hardware present bit in unrelated SPTEs, but keep the SPTEs > themselves. The idea mostly the same as the ZAPPED_PRIVATE concept from the initial > TDX RFC. MMU notifier invalidations, memslot removal, RMP restoration, etc... would > all continue to work since the SPTEs is still there, and KVM's page fault handler > could audit any "blocked" SPTE when it's refaulted (I'm pretty sure it'd be > impossible for the PFN to change, since any PFN change would require a memslot > update or mmu_notifier invalidation). > > The downside to that approach is that it would require walking all SPTEs to do a > memslot deletion, i.e. we'd lose the "fast zap" behavior. If that's a performance > issue, the behavior could be opt-in (but not for SNP/TDX). Another option if we introduce private memslots is to preserve private memslots on unrelated deletions. The argument being that (a) private memslots are a new feature so there's no prior uABI to break, and (b) if not zapping private memslot SPTEs in response to the guest remapping a BAR somehow breaks GPU pass-through, then the bug is all but guaranteed to be somewhere besides KVM's memslot logic.
On 10/12/21 2:48 PM, Sean Christopherson wrote: > On Fri, Aug 20, 2021, Brijesh Singh wrote: >> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c >> index 991b8c996fc1..6d9483ec91ab 100644 >> --- a/arch/x86/kvm/svm/sev.c >> +++ b/arch/x86/kvm/svm/sev.c >> @@ -31,6 +31,7 @@ >> #include "svm_ops.h" >> #include "cpuid.h" >> #include "trace.h" >> +#include "mmu.h" >> >> #define __ex(x) __kvm_handle_fault_on_reboot(x) >> >> @@ -2905,6 +2906,181 @@ static void set_ghcb_msr(struct vcpu_svm *svm, u64 value) >> svm->vmcb->control.ghcb_gpa = value; >> } >> >> +static int snp_rmptable_psmash(struct kvm *kvm, kvm_pfn_t pfn) >> +{ >> + pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1); >> + >> + return psmash(pfn); >> +} >> + >> +static int snp_make_page_shared(struct kvm *kvm, gpa_t gpa, kvm_pfn_t pfn, int level) >> +{ >> + int rc, rmp_level; >> + >> + rc = snp_lookup_rmpentry(pfn, &rmp_level); >> + if (rc < 0) >> + return -EINVAL; >> + >> + /* If page is not assigned then do nothing */ >> + if (!rc) >> + return 0; >> + >> + /* >> + * Is the page part of an existing 2MB RMP entry ? Split the 2MB into >> + * multiple of 4K-page before making the memory shared. >> + */ >> + if (level == PG_LEVEL_4K && rmp_level == PG_LEVEL_2M) { >> + rc = snp_rmptable_psmash(kvm, pfn); >> + if (rc) >> + return rc; >> + } >> + >> + return rmp_make_shared(pfn, level); >> +} >> + >> +static int snp_check_and_build_npt(struct kvm_vcpu *vcpu, gpa_t gpa, int level) >> +{ >> + struct kvm *kvm = vcpu->kvm; >> + int rc, npt_level; >> + kvm_pfn_t pfn; >> + >> + /* >> + * Get the pfn and level for the gpa from the nested page table. >> + * >> + * If the tdp walk fails, then its safe to say that there is no >> + * valid mapping for this gpa. Create a fault to build the map. >> + */ >> + write_lock(&kvm->mmu_lock); > SEV (or any vendor code for that matter) should not be taking mmu_lock. All of > KVM has somewhat fungible borders between the various components, but IMO this > crosses firmly into "this belongs in the MMU" territory. > > For example, I highly doubt this actually need to take mmu_lock for write. More > below. > >> + rc = kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn, &npt_level); >> + write_unlock(&kvm->mmu_lock); > What's the point of this walk? As soon as mmu_lock is dropped, all bets are off. > At best this is a strong hint. It doesn't hurt anything per se, it's just a waste > of cycles. I can avoid the walk because the kvm_mmu_map_tdp_page() will return a pfn if the NPT is already built. >> + if (!rc) { >> + pfn = kvm_mmu_map_tdp_page(vcpu, gpa, PFERR_USER_MASK, level); > Same here. > >> + if (is_error_noslot_pfn(pfn)) >> + return -EINVAL; >> + } >> + >> + return 0; >> +} >> + >> +static int snp_gpa_to_hva(struct kvm *kvm, gpa_t gpa, hva_t *hva) >> +{ >> + struct kvm_memory_slot *slot; >> + gfn_t gfn = gpa_to_gfn(gpa); >> + int idx; >> + >> + idx = srcu_read_lock(&kvm->srcu); >> + slot = gfn_to_memslot(kvm, gfn); >> + if (!slot) { >> + srcu_read_unlock(&kvm->srcu, idx); >> + return -EINVAL; >> + } >> + >> + /* >> + * Note, using the __gfn_to_hva_memslot() is not solely for performance, >> + * it's also necessary to avoid the "writable" check in __gfn_to_hva_many(), >> + * which will always fail on read-only memslots due to gfn_to_hva() assuming >> + * writes. >> + */ >> + *hva = __gfn_to_hva_memslot(slot, gfn); >> + srcu_read_unlock(&kvm->srcu, idx); > *hva is effectively invalidated the instance kvm->srcu is unlocked, e.g. a pending > memslot update can complete immediately after and delete/move the backing memslot. Fair point, I can rework to do all the hva related updates while we keep the kvm->srcu. More on this below. >> + >> + return 0; >> +} >> + >> +static int __snp_handle_page_state_change(struct kvm_vcpu *vcpu, enum psc_op op, gpa_t gpa, >> + int level) >> +{ >> + struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; >> + struct kvm *kvm = vcpu->kvm; >> + int rc, npt_level; >> + kvm_pfn_t pfn; >> + gpa_t gpa_end; >> + >> + gpa_end = gpa + page_level_size(level); >> + >> + while (gpa < gpa_end) { >> + /* >> + * If the gpa is not present in the NPT then build the NPT. >> + */ >> + rc = snp_check_and_build_npt(vcpu, gpa, level); >> + if (rc) >> + return -EINVAL; >> + >> + if (op == SNP_PAGE_STATE_PRIVATE) { >> + hva_t hva; >> + >> + if (snp_gpa_to_hva(kvm, gpa, &hva)) >> + return -EINVAL; >> + >> + /* >> + * Verify that the hva range is registered. This enforcement is >> + * required to avoid the cases where a page is marked private >> + * in the RMP table but never gets cleanup during the VM >> + * termination path. >> + */ >> + mutex_lock(&kvm->lock); >> + rc = is_hva_registered(kvm, hva, page_level_size(level)); > This will get a false negative if a hva+size spans two contiguous regions. > > Also, storing a boolean return in a variable that is an int _and_ was already used > for the kernel's standard > >> + mutex_unlock(&kvm->lock); > This is also subject to races, e.g. userspace unregisters the hva immediately > after this check, before KVM makes whatever conversion it makes below. > > A linear walk through a list to find a range is also a bad idea, e.g. pathological > worst case scenario is that userspace has created tens of thousands of individual > regions. There is no restriction on the number of regions, just the number of > pages that can be pinned. > > I dislike the svm_(un)register_enc_region() scheme in general, but at least for > SEV and SEV-ES the code is isolated, e.g. KVM is little more than a dump pipe to > let userspace pin pages. I would like to go the opposite direction and work towards > eliminating regions_list (or at least making it optional), not build more stuff > on top. If we don't nuke the page from the direct map and region_list removed then we no longer need to have all the above complexity in the PSC. PSC can be as simple as 1) map the page in NPF and 2) update the RMP table. > The more I look at this, the more strongly I feel that private <=> shared conversions > belong in the MMU, and that KVM's SPTEs should be the single source of truth for > shared vs. private. E.g. add a SPTE_TDP_PRIVATE_MASK in the software available bits. > I believe the only hiccup is the snafu where not zapping _all_ SPTEs on memslot > deletion breaks QEMU+VFIO+GPU, i.e. KVM would lose its canonical info on unrelated > memslot deletion. > > But that is a solvable problem. Ideally the bug, wherever it is, would be root > caused and fixed. I believe Peter (and Marc?) is going to work on reproducing > the bug. We have been also setting up VM with Qemu + VFIO + GPU usecase to repro the bug on AMD HW and so far we no luck in reproducing it. Will continue stressing the system to recreate it. Lets hope that Peter (and Marc) can easily recreate on Intel HW so that we can work towards fixing it. > > If we are unable to root cause and fix the bug, I think a viable workaround would > be to clear the hardware present bit in unrelated SPTEs, but keep the SPTEs > themselves. The idea mostly the same as the ZAPPED_PRIVATE concept from the initial > TDX RFC. MMU notifier invalidations, memslot removal, RMP restoration, etc... would > all continue to work since the SPTEs is still there, and KVM's page fault handler > could audit any "blocked" SPTE when it's refaulted (I'm pretty sure it'd be > impossible for the PFN to change, since any PFN change would require a memslot > update or mmu_notifier invalidation). > > The downside to that approach is that it would require walking all SPTEs to do a > memslot deletion, i.e. we'd lose the "fast zap" behavior. If that's a performance > issue, the behavior could be opt-in (but not for SNP/TDX). > >> + if (!rc) >> + return -EINVAL; >> + >> + /* >> + * Mark the userspace range unmerable before adding the pages >> + * in the RMP table. >> + */ >> + mmap_write_lock(kvm->mm); >> + rc = snp_mark_unmergable(kvm, hva, page_level_size(level)); >> + mmap_write_unlock(kvm->mm); > As mentioned in an earlier patch, this simply cannot work. As discussed in the previous patches, will drop the support for nuking the page from direct map; this will keep ksm happy and no need to mark vma unmergable. > >> + if (rc) >> + return -EINVAL; >> + } >> + >> + write_lock(&kvm->mmu_lock); >> + >> + rc = kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn, &npt_level); > Same comment about the bool into int. Though in this case I'd say have > kvm_mmu_get_tdp_walk() return 0/-errno, not a bool. Boolean returns for helpers > without "is_", "test_", etc... are generally confusing. > >> + if (!rc) { >> + /* >> + * This may happen if another vCPU unmapped the page >> + * before we acquire the lock. Retry the PSC. >> + */ >> + write_unlock(&kvm->mmu_lock); >> + return 0; > How will the caller (guest?) know to retry the PSC if KVM returns "success"? If a guest is adhering to the GHCB spec then it will see that hypervisor has not processed all the entry and it should retry the PSC. >> + } >> + >> + /* >> + * Adjust the level so that we don't go higher than the backing >> + * page level. >> + */ >> + level = min_t(size_t, level, npt_level); >> + >> + trace_kvm_snp_psc(vcpu->vcpu_id, pfn, gpa, op, level); >> + >> + switch (op) { >> + case SNP_PAGE_STATE_SHARED: >> + rc = snp_make_page_shared(kvm, gpa, pfn, level); >> + break; >> + case SNP_PAGE_STATE_PRIVATE: >> + rc = rmp_make_private(pfn, gpa, level, sev->asid, false); >> + break; >> + default: >> + rc = -EINVAL; > Not that it really matters, because I don't think the MADV_* approach is viable, > but this neglects to undo snp_mark_unmergable() on failure. > >> + break; >> + } >> + >> + write_unlock(&kvm->mmu_lock); >> + >> + if (rc) { >> + pr_err_ratelimited("Error op %d gpa %llx pfn %llx level %d rc %d\n", >> + op, gpa, pfn, level, rc); >> + return rc; >> + } >> + >> + gpa = gpa + page_level_size(level); >> + } >> + >> + return 0; >> +} >> + >> static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) >> { >> struct vmcb_control_area *control = &svm->vmcb->control;
On Wed, Oct 13, 2021, Brijesh Singh wrote: > > The more I look at this, the more strongly I feel that private <=> shared conversions > > belong in the MMU, and that KVM's SPTEs should be the single source of truth for > > shared vs. private. E.g. add a SPTE_TDP_PRIVATE_MASK in the software available bits. > > I believe the only hiccup is the snafu where not zapping _all_ SPTEs on memslot > > deletion breaks QEMU+VFIO+GPU, i.e. KVM would lose its canonical info on unrelated > > memslot deletion. > > > > But that is a solvable problem. Ideally the bug, wherever it is, would be root > > caused and fixed. I believe Peter (and Marc?) is going to work on reproducing > > the bug. > We have been also setting up VM with Qemu + VFIO + GPU usecase to repro > the bug on AMD HW and so far we no luck in reproducing it. Will continue > stressing the system to recreate it. Lets hope that Peter (and Marc) can > easily recreate on Intel HW so that we can work towards fixing it. Are you trying on a modern kernel? If so, double check that nx_huge_pages is off, turning that on caused the bug to disappear. It should be off for AMD systems, but it's worth checking. > >> + if (!rc) { > >> + /* > >> + * This may happen if another vCPU unmapped the page > >> + * before we acquire the lock. Retry the PSC. > >> + */ > >> + write_unlock(&kvm->mmu_lock); > >> + return 0; > > How will the caller (guest?) know to retry the PSC if KVM returns "success"? > > If a guest is adhering to the GHCB spec then it will see that hypervisor > has not processed all the entry and it should retry the PSC. But AFAICT that information isn't passed to the guest. Even in this single-page MSR-based case, the caller will say "all good" on a return of 0. The "full" path is more obvious, as the caller clearly continues to process entries unless there's an actual failure. + for (; cur <= end; cur++) { + entry = &info->entries[cur]; + gpa = gfn_to_gpa(entry->gfn); + level = RMP_TO_X86_PG_LEVEL(entry->pagesize); + op = entry->operation; + + if (!IS_ALIGNED(gpa, page_level_size(level))) { + rc = PSC_INVALID_ENTRY; + goto out; + } + + rc = __snp_handle_page_state_change(vcpu, op, gpa, level); + if (rc) + goto out; + }
On 10/13/21 10:24 AM, Sean Christopherson wrote: > On Wed, Oct 13, 2021, Brijesh Singh wrote: >>> The more I look at this, the more strongly I feel that private <=> shared conversions >>> belong in the MMU, and that KVM's SPTEs should be the single source of truth for >>> shared vs. private. E.g. add a SPTE_TDP_PRIVATE_MASK in the software available bits. >>> I believe the only hiccup is the snafu where not zapping _all_ SPTEs on memslot >>> deletion breaks QEMU+VFIO+GPU, i.e. KVM would lose its canonical info on unrelated >>> memslot deletion. >>> >>> But that is a solvable problem. Ideally the bug, wherever it is, would be root >>> caused and fixed. I believe Peter (and Marc?) is going to work on reproducing >>> the bug. >> We have been also setting up VM with Qemu + VFIO + GPU usecase to repro >> the bug on AMD HW and so far we no luck in reproducing it. Will continue >> stressing the system to recreate it. Lets hope that Peter (and Marc) can >> easily recreate on Intel HW so that we can work towards fixing it. > Are you trying on a modern kernel? If so, double check that nx_huge_pages is off, > turning that on caused the bug to disappear. It should be off for AMD systems, > but it's worth checking. Yes, this is a recent kernel. I will double check the nx_huge_pages is off. >>>> + if (!rc) { >>>> + /* >>>> + * This may happen if another vCPU unmapped the page >>>> + * before we acquire the lock. Retry the PSC. >>>> + */ >>>> + write_unlock(&kvm->mmu_lock); >>>> + return 0; >>> How will the caller (guest?) know to retry the PSC if KVM returns "success"? >> If a guest is adhering to the GHCB spec then it will see that hypervisor >> has not processed all the entry and it should retry the PSC. > But AFAICT that information isn't passed to the guest. Even in this single-page > MSR-based case, the caller will say "all good" on a return of 0. > > The "full" path is more obvious, as the caller clearly continues to process > entries unless there's an actual failure. > > + for (; cur <= end; cur++) { > + entry = &info->entries[cur]; > + gpa = gfn_to_gpa(entry->gfn); > + level = RMP_TO_X86_PG_LEVEL(entry->pagesize); > + op = entry->operation; > + > + if (!IS_ALIGNED(gpa, page_level_size(level))) { > + rc = PSC_INVALID_ENTRY; > + goto out; > + } > + > + rc = __snp_handle_page_state_change(vcpu, op, gpa, level); > + if (rc) > + goto out; > + } > Please see the guest kernel patch #19 [1]. In spec there is no special code for the retry. The guest will look at the PSC hdr to determine how many entries were processed by the hypervisor (in this particular case a 0). And at time the guest can do whatever it wants. In the case of Linux guest, we retry the PSC. [1] https://lore.kernel.org/linux-mm/20211008180453.462291-20-brijesh.singh@amd.com/ thanks
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 91089967ab09..4980f77aa1d5 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -89,6 +89,10 @@ enum psc_op { }; #define GHCB_MSR_PSC_REQ 0x014 +#define GHCB_MSR_PSC_GFN_POS 12 +#define GHCB_MSR_PSC_GFN_MASK GENMASK_ULL(39, 0) +#define GHCB_MSR_PSC_OP_POS 52 +#define GHCB_MSR_PSC_OP_MASK 0xf #define GHCB_MSR_PSC_REQ_GFN(gfn, op) \ /* GHCBData[55:52] */ \ (((u64)((op) & 0xf) << 52) | \ @@ -98,6 +102,11 @@ enum psc_op { GHCB_MSR_PSC_REQ) #define GHCB_MSR_PSC_RESP 0x015 +#define GHCB_MSR_PSC_ERROR_POS 32 +#define GHCB_MSR_PSC_ERROR_MASK GENMASK_ULL(31, 0) +#define GHCB_MSR_PSC_ERROR GENMASK_ULL(31, 0) +#define GHCB_MSR_PSC_RSVD_POS 12 +#define GHCB_MSR_PSC_RSVD_MASK GENMASK_ULL(19, 0) #define GHCB_MSR_PSC_RESP_VAL(val) \ /* GHCBData[63:32] */ \ (((u64)(val) & GENMASK_ULL(63, 32)) >> 32) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 991b8c996fc1..6d9483ec91ab 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -31,6 +31,7 @@ #include "svm_ops.h" #include "cpuid.h" #include "trace.h" +#include "mmu.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) @@ -2905,6 +2906,181 @@ static void set_ghcb_msr(struct vcpu_svm *svm, u64 value) svm->vmcb->control.ghcb_gpa = value; } +static int snp_rmptable_psmash(struct kvm *kvm, kvm_pfn_t pfn) +{ + pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1); + + return psmash(pfn); +} + +static int snp_make_page_shared(struct kvm *kvm, gpa_t gpa, kvm_pfn_t pfn, int level) +{ + int rc, rmp_level; + + rc = snp_lookup_rmpentry(pfn, &rmp_level); + if (rc < 0) + return -EINVAL; + + /* If page is not assigned then do nothing */ + if (!rc) + return 0; + + /* + * Is the page part of an existing 2MB RMP entry ? Split the 2MB into + * multiple of 4K-page before making the memory shared. + */ + if (level == PG_LEVEL_4K && rmp_level == PG_LEVEL_2M) { + rc = snp_rmptable_psmash(kvm, pfn); + if (rc) + return rc; + } + + return rmp_make_shared(pfn, level); +} + +static int snp_check_and_build_npt(struct kvm_vcpu *vcpu, gpa_t gpa, int level) +{ + struct kvm *kvm = vcpu->kvm; + int rc, npt_level; + kvm_pfn_t pfn; + + /* + * Get the pfn and level for the gpa from the nested page table. + * + * If the tdp walk fails, then its safe to say that there is no + * valid mapping for this gpa. Create a fault to build the map. + */ + write_lock(&kvm->mmu_lock); + rc = kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn, &npt_level); + write_unlock(&kvm->mmu_lock); + if (!rc) { + pfn = kvm_mmu_map_tdp_page(vcpu, gpa, PFERR_USER_MASK, level); + if (is_error_noslot_pfn(pfn)) + return -EINVAL; + } + + return 0; +} + +static int snp_gpa_to_hva(struct kvm *kvm, gpa_t gpa, hva_t *hva) +{ + struct kvm_memory_slot *slot; + gfn_t gfn = gpa_to_gfn(gpa); + int idx; + + idx = srcu_read_lock(&kvm->srcu); + slot = gfn_to_memslot(kvm, gfn); + if (!slot) { + srcu_read_unlock(&kvm->srcu, idx); + return -EINVAL; + } + + /* + * Note, using the __gfn_to_hva_memslot() is not solely for performance, + * it's also necessary to avoid the "writable" check in __gfn_to_hva_many(), + * which will always fail on read-only memslots due to gfn_to_hva() assuming + * writes. + */ + *hva = __gfn_to_hva_memslot(slot, gfn); + srcu_read_unlock(&kvm->srcu, idx); + + return 0; +} + +static int __snp_handle_page_state_change(struct kvm_vcpu *vcpu, enum psc_op op, gpa_t gpa, + int level) +{ + struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; + struct kvm *kvm = vcpu->kvm; + int rc, npt_level; + kvm_pfn_t pfn; + gpa_t gpa_end; + + gpa_end = gpa + page_level_size(level); + + while (gpa < gpa_end) { + /* + * If the gpa is not present in the NPT then build the NPT. + */ + rc = snp_check_and_build_npt(vcpu, gpa, level); + if (rc) + return -EINVAL; + + if (op == SNP_PAGE_STATE_PRIVATE) { + hva_t hva; + + if (snp_gpa_to_hva(kvm, gpa, &hva)) + return -EINVAL; + + /* + * Verify that the hva range is registered. This enforcement is + * required to avoid the cases where a page is marked private + * in the RMP table but never gets cleanup during the VM + * termination path. + */ + mutex_lock(&kvm->lock); + rc = is_hva_registered(kvm, hva, page_level_size(level)); + mutex_unlock(&kvm->lock); + if (!rc) + return -EINVAL; + + /* + * Mark the userspace range unmerable before adding the pages + * in the RMP table. + */ + mmap_write_lock(kvm->mm); + rc = snp_mark_unmergable(kvm, hva, page_level_size(level)); + mmap_write_unlock(kvm->mm); + if (rc) + return -EINVAL; + } + + write_lock(&kvm->mmu_lock); + + rc = kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn, &npt_level); + if (!rc) { + /* + * This may happen if another vCPU unmapped the page + * before we acquire the lock. Retry the PSC. + */ + write_unlock(&kvm->mmu_lock); + return 0; + } + + /* + * Adjust the level so that we don't go higher than the backing + * page level. + */ + level = min_t(size_t, level, npt_level); + + trace_kvm_snp_psc(vcpu->vcpu_id, pfn, gpa, op, level); + + switch (op) { + case SNP_PAGE_STATE_SHARED: + rc = snp_make_page_shared(kvm, gpa, pfn, level); + break; + case SNP_PAGE_STATE_PRIVATE: + rc = rmp_make_private(pfn, gpa, level, sev->asid, false); + break; + default: + rc = -EINVAL; + break; + } + + write_unlock(&kvm->mmu_lock); + + if (rc) { + pr_err_ratelimited("Error op %d gpa %llx pfn %llx level %d rc %d\n", + op, gpa, pfn, level, rc); + return rc; + } + + gpa = gpa + page_level_size(level); + } + + return 0; +} + static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -3005,6 +3181,27 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) GHCB_MSR_INFO_POS); break; } + case GHCB_MSR_PSC_REQ: { + gfn_t gfn; + int ret; + enum psc_op op; + + gfn = get_ghcb_msr_bits(svm, GHCB_MSR_PSC_GFN_MASK, GHCB_MSR_PSC_GFN_POS); + op = get_ghcb_msr_bits(svm, GHCB_MSR_PSC_OP_MASK, GHCB_MSR_PSC_OP_POS); + + ret = __snp_handle_page_state_change(vcpu, op, gfn_to_gpa(gfn), PG_LEVEL_4K); + + if (ret) + set_ghcb_msr_bits(svm, GHCB_MSR_PSC_ERROR, + GHCB_MSR_PSC_ERROR_MASK, GHCB_MSR_PSC_ERROR_POS); + else + set_ghcb_msr_bits(svm, 0, + GHCB_MSR_PSC_ERROR_MASK, GHCB_MSR_PSC_ERROR_POS); + + set_ghcb_msr_bits(svm, 0, GHCB_MSR_PSC_RSVD_MASK, GHCB_MSR_PSC_RSVD_POS); + set_ghcb_msr_bits(svm, GHCB_MSR_PSC_RESP, GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS); + break; + } case GHCB_MSR_TERM_REQ: { u64 reason_set, reason_code; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 1c360e07856f..35ca1cf8440a 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -7,6 +7,7 @@ #include <asm/svm.h> #include <asm/clocksource.h> #include <asm/pvclock-abi.h> +#include <asm/sev-common.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm @@ -1711,6 +1712,39 @@ TRACE_EVENT(kvm_vmgexit_msr_protocol_exit, __entry->vcpu_id, __entry->ghcb_gpa, __entry->result) ); +/* + * Tracepoint for the SEV-SNP page state change processing + */ +#define psc_operation \ + {SNP_PAGE_STATE_PRIVATE, "private"}, \ + {SNP_PAGE_STATE_SHARED, "shared"} \ + +TRACE_EVENT(kvm_snp_psc, + TP_PROTO(unsigned int vcpu_id, u64 pfn, u64 gpa, u8 op, int level), + TP_ARGS(vcpu_id, pfn, gpa, op, level), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(u64, pfn) + __field(u64, gpa) + __field(u8, op) + __field(int, level) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->pfn = pfn; + __entry->gpa = gpa; + __entry->op = op; + __entry->level = level; + ), + + TP_printk("vcpu %u, pfn %llx, gpa %llx, op %s, level %d", + __entry->vcpu_id, __entry->pfn, __entry->gpa, + __print_symbolic(__entry->op, psc_operation), + __entry->level) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e5d5c5ed7dd4..afcdc75a99f2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12371,3 +12371,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_snp_psc);
SEV-SNP VMs can ask the hypervisor to change the page state in the RMP table to be private or shared using the Page State Change MSR protocol as defined in the GHCB specification. Before changing the page state in the RMP entry, lookup the page in the NPT to make sure that there is a valid mapping for it. If the mapping exist then try to find a workable page level between the NPT and RMP for the page. If the page is not mapped in the NPT, then create a fault such that it gets mapped before we change the page state in the RMP entry. Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> --- arch/x86/include/asm/sev-common.h | 9 ++ arch/x86/kvm/svm/sev.c | 197 ++++++++++++++++++++++++++++++ arch/x86/kvm/trace.h | 34 ++++++ arch/x86/kvm/x86.c | 1 + 4 files changed, 241 insertions(+)