diff mbox series

[v12,22/29] KVM: SEV: Implement gmem hook for invalidating private pages

Message ID 20240329225835.400662-23-michael.roth@amd.com (mailing list archive)
State New, archived
Headers show
Series Add AMD Secure Nested Paging (SEV-SNP) Hypervisor Support | expand

Commit Message

Michael Roth March 29, 2024, 10:58 p.m. UTC
Implement a platform hook to do the work of restoring the direct map
entries of gmem-managed pages and transitioning the corresponding RMP
table entries back to the default shared/hypervisor-owned state.

Signed-off-by: Michael Roth <michael.roth@amd.com>
---
 arch/x86/kvm/Kconfig   |  1 +
 arch/x86/kvm/svm/sev.c | 63 ++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/svm/svm.c |  1 +
 arch/x86/kvm/svm/svm.h |  2 ++
 4 files changed, 67 insertions(+)

Comments

Paolo Bonzini March 30, 2024, 9:31 p.m. UTC | #1
On 3/29/24 23:58, Michael Roth wrote:
> +		/*
> +		 * If an unaligned PFN corresponds to a 2M region assigned as a
> +		 * large page in he RMP table, PSMASH the region into individual
> +		 * 4K RMP entries before attempting to convert a 4K sub-page.
> +		 */
> +		if (!use_2m_update && rmp_level > PG_LEVEL_4K) {
> +			rc = snp_rmptable_psmash(pfn);
> +			if (rc)
> +				pr_err_ratelimited("SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n",
> +						   pfn, rc);
> +		}

Ignoring the PSMASH failure is pretty scary...  At this point 
.free_folio cannot fail, should the psmash part of this patch be done in 
kvm_gmem_invalidate_begin() before kvm_mmu_unmap_gfn_range()?

Also, can you get PSMASH_FAIL_INUSE and if so what's the best way to 
address it?  Should fallocate() return -EBUSY?

Thanks,

Paolo
Michael Roth April 18, 2024, 7:57 p.m. UTC | #2
On Sat, Mar 30, 2024 at 10:31:47PM +0100, Paolo Bonzini wrote:
> On 3/29/24 23:58, Michael Roth wrote:
> > +		/*
> > +		 * If an unaligned PFN corresponds to a 2M region assigned as a
> > +		 * large page in he RMP table, PSMASH the region into individual
> > +		 * 4K RMP entries before attempting to convert a 4K sub-page.
> > +		 */
> > +		if (!use_2m_update && rmp_level > PG_LEVEL_4K) {
> > +			rc = snp_rmptable_psmash(pfn);
> > +			if (rc)
> > +				pr_err_ratelimited("SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n",
> > +						   pfn, rc);
> > +		}
> 
> Ignoring the PSMASH failure is pretty scary...  At this point .free_folio
> cannot fail, should the psmash part of this patch be done in
> kvm_gmem_invalidate_begin() before kvm_mmu_unmap_gfn_range()?
> 
> Also, can you get PSMASH_FAIL_INUSE and if so what's the best way to address
> it?  Should fallocate() return -EBUSY?

FAIL_INUSE shouldn't occur since at this point the pages have been unmapped
from NPT and only the task doing the cleanup should be attempting to
access/PSMASH this particular 2M HPA range at this point.

However, since FAIL_INUSE is transient, there isn't a good reason why we
shouldn't retry until it clears itself up rather than risk hosing the
system if some unexpected case ever did pop up, so I've updated
snp_rmptable_psmash() to handle that case automatically and simplify the
handling in sev_handle_rmp_fault() as well. (in the case of #NPF RMP
faults there is actually potential for PSMASH errors other than
FAIL_INUSE due to races with other vCPU threads which can interleave and
put the RMP entry in an unexpected state, so there's additional
handling/reporting to deal with those cases, but here they are not expected
and will trigger WARN_*ONCE()'s now)

I used this hacked up version of Sean's original patch to re-enable 2MB
hugepage support in gmem for the purposes of re-testing this:

  https://github.com/mdroth/linux/commit/15aa4f81811485997953130fc184e829ba4399d2

-Mike

> 
> Thanks,
> 
> Paolo
> 
>
diff mbox series

Patch

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 286b40d0b07c..32a5c37cbf88 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -125,6 +125,7 @@  config KVM_AMD_SEV
 	select ARCH_HAS_CC_PLATFORM
 	select KVM_GENERIC_PRIVATE_MEM
 	select HAVE_KVM_GMEM_PREPARE
+	select HAVE_KVM_GMEM_INVALIDATE
 	help
 	  Provides support for launching Encrypted VMs (SEV) and Encrypted VMs
 	  with Encrypted State (SEV-ES) on AMD processors.
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index e1f8be1df219..87d621d013a4 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -4380,3 +4380,66 @@  int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order)
 
 	return 0;
 }
+
+void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
+{
+	kvm_pfn_t pfn;
+
+	pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end);
+
+	for (pfn = start; pfn < end;) {
+		bool use_2m_update = false;
+		int rc, rmp_level;
+		bool assigned;
+
+		rc = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
+		if (rc) {
+			pr_debug_ratelimited("SEV: Failed to retrieve RMP entry for PFN 0x%llx error %d\n",
+					     pfn, rc);
+			goto next_pfn;
+		}
+
+		if (!assigned)
+			goto next_pfn;
+
+		use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) &&
+				end >= (pfn + PTRS_PER_PMD) &&
+				rmp_level > PG_LEVEL_4K;
+
+		/*
+		 * If an unaligned PFN corresponds to a 2M region assigned as a
+		 * large page in he RMP table, PSMASH the region into individual
+		 * 4K RMP entries before attempting to convert a 4K sub-page.
+		 */
+		if (!use_2m_update && rmp_level > PG_LEVEL_4K) {
+			rc = snp_rmptable_psmash(pfn);
+			if (rc)
+				pr_err_ratelimited("SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n",
+						   pfn, rc);
+		}
+
+		rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K);
+		if (WARN_ON_ONCE(rc)) {
+			pr_err_ratelimited("SEV: Failed to update RMP entry for PFN 0x%llx error %d\n",
+					   pfn, rc);
+			goto next_pfn;
+		}
+
+		/*
+		 * SEV-ES avoids host/guest cache coherency issues through
+		 * WBINVD hooks issued via MMU notifiers during run-time, and
+		 * KVM's VM destroy path at shutdown. Those MMU notifier events
+		 * don't cover gmem since there is no requirement to map pages
+		 * to a HVA in order to use them for a running guest. While the
+		 * shutdown path would still likely cover things for SNP guests,
+		 * userspace may also free gmem pages during run-time via
+		 * hole-punching operations on the guest_memfd, so flush the
+		 * cache entries for these pages before free'ing them back to
+		 * the host.
+		 */
+		clflush_cache_range(__va(pfn_to_hpa(pfn)),
+				    use_2m_update ? PMD_SIZE : PAGE_SIZE);
+next_pfn:
+		pfn += use_2m_update ? PTRS_PER_PMD : 1;
+	}
+}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index c099154e326a..b456906f2670 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -5080,6 +5080,7 @@  static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.alloc_apic_backing_page = svm_alloc_apic_backing_page,
 
 	.gmem_prepare = sev_gmem_prepare,
+	.gmem_invalidate = sev_gmem_invalidate,
 };
 
 /*
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 53618cfc2b89..3f1f6d3d3ade 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -731,6 +731,7 @@  void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code);
 void sev_vcpu_unblocking(struct kvm_vcpu *vcpu);
 void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu);
 int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
+void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
 #else
 static inline struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) {
 	return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
@@ -751,6 +752,7 @@  static inline int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, in
 {
 	return 0;
 }
+static inline void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) {}
 
 #endif