
[09/11] KVM: guest_memfd: Add interface for populating gmem pages with user data

Message ID 20240404185034.3184582-10-pbonzini@redhat.com (mailing list archive)
State New, archived
Series KVM: guest_memfd: New hooks and functionality for SEV-SNP and TDX

Commit Message

Paolo Bonzini April 4, 2024, 6:50 p.m. UTC
During guest run-time, kvm_arch_gmem_prepare() is issued as needed to
prepare newly-allocated gmem pages prior to mapping them into the guest.
In the case of SEV-SNP, this mainly involves setting the pages to
private in the RMP table.

However, for the GPA ranges comprising the initial guest payload, which
are encrypted/measured prior to starting the guest, the gmem pages need
to be accessed prior to setting them to private in the RMP table so they
can be initialized with the userspace-provided data. Additionally, an
SNP firmware call is needed afterward to encrypt them in-place and
measure the contents into the guest's launch digest.

While it is possible to bypass the kvm_arch_gmem_prepare() hooks so that
this handling can be done in an open-coded/vendor-specific manner, this
may expose more gmem-internal state/dependencies to external callers
than necessary. Try to avoid this by implementing an interface that
tries to handle as much of the common functionality inside gmem as
possible, while also making it generic enough to potentially be
usable/extensible for TDX as well.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Michael Roth <michael.roth@amd.com>
Co-developed-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 26 ++++++++++++++
 virt/kvm/guest_memfd.c   | 78 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 104 insertions(+)
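
To illustrate the intended usage, here is a rough sketch of how a vendor
launch-update path could drive the interface; the SNP-flavored callback and
the caller-side names below are illustrative assumptions, not part of this
patch:

static int snp_launch_update_post_populate(struct kvm *kvm, gfn_t gfn,
					   kvm_pfn_t pfn, void __user *src,
					   int order, void *opaque)
{
	/* Copy the initial payload into the freshly allocated gmem page. */
	if (copy_from_user(pfn_to_kaddr(pfn), src, PAGE_SIZE << order))
		return -EFAULT;

	/*
	 * Make the page private in the RMP, then have firmware encrypt it
	 * in place and extend the launch digest (omitted here).
	 */
	return 0;
}

	/* Caller side, e.g. an SNP launch-update ioctl handler: */
	mutex_lock(&kvm->slots_lock);
	ret = kvm_gmem_populate(kvm, params.gfn_start,
				u64_to_user_ptr(params.uaddr),
				params.len / PAGE_SIZE,
				snp_launch_update_post_populate, &params);
	mutex_unlock(&kvm->slots_lock);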

Comments

Xu Yilun April 22, 2024, 2:44 p.m. UTC | #1
On Thu, Apr 04, 2024 at 02:50:31PM -0400, Paolo Bonzini wrote:
> During guest run-time, kvm_arch_gmem_prepare() is issued as needed to
> prepare newly-allocated gmem pages prior to mapping them into the guest.
> In the case of SEV-SNP, this mainly involves setting the pages to
> private in the RMP table.
> 
> However, for the GPA ranges comprising the initial guest payload, which
> are encrypted/measured prior to starting the guest, the gmem pages need
> to be accessed prior to setting them to private in the RMP table so they
> can be initialized with the userspace-provided data. Additionally, an
> SNP firmware call is needed afterward to encrypt them in-place and
> measure the contents into the guest's launch digest.
> 
> While it is possible to bypass the kvm_arch_gmem_prepare() hooks so that
> this handling can be done in an open-coded/vendor-specific manner, this
> may expose more gmem-internal state/dependencies to external callers
> than necessary. Try to avoid this by implementing an interface that
> tries to handle as much of the common functionality inside gmem as
> possible, while also making it generic enough to potentially be
> usable/extensible for TDX as well.
> 
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Michael Roth <michael.roth@amd.com>
> Co-developed-by: Michael Roth <michael.roth@amd.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  include/linux/kvm_host.h | 26 ++++++++++++++
>  virt/kvm/guest_memfd.c   | 78 ++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 104 insertions(+)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 33ed3b884a6b..97d57ec59789 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2450,4 +2450,30 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord
>  bool kvm_arch_gmem_prepare_needed(struct kvm *kvm);
>  #endif
>  
> +/**
> + * kvm_gmem_populate() - Populate/prepare a GPA range with guest data
> + *
> + * @kvm: KVM instance
> + * @gfn: starting GFN to be populated
> + * @src: userspace-provided buffer containing data to copy into GFN range
> + *       (passed to @post_populate, and incremented on each iteration
> + *       if not NULL)
> + * @npages: number of pages to copy from userspace-buffer
> + * @post_populate: callback to issue for each gmem page that backs the GPA
> + *                 range
> + * @opaque: opaque data to pass to @post_populate callback
> + *
> + * This is primarily intended for cases where a gmem-backed GPA range needs
> + * to be initialized with userspace-provided data prior to being mapped into
> + * the guest as a private page. This should be called with the slots->lock
> + * held so that caller-enforced invariants regarding the expected memory
> + * attributes of the GPA range do not race with KVM_SET_MEMORY_ATTRIBUTES.
> + *
> + * Returns the number of pages that were populated.
> + */
> +long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
> +		       int (*post_populate)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
> +					    void __user *src, int order, void *opaque),
> +		       void *opaque);
> +
>  #endif
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 51c99667690a..e7de97382a67 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -602,3 +602,81 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>  	return r;
>  }
>  EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
> +
> +static int kvm_gmem_undo_get_pfn(struct file *file, struct kvm_memory_slot *slot,
> +				 gfn_t gfn, int order)
> +{
> +	pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
> +	struct kvm_gmem *gmem = file->private_data;
> +
> +	/*
> +	 * Races with kvm_gmem_unbind() must have been detected by
> +	 * __kvm_gmem_get_gfn(), because the invalidate_lock is
> +	 * taken between __kvm_gmem_get_gfn() and kvm_gmem_undo_get_pfn().
> +	 */
> +	if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot))
> +		return -EIO;
> +
> +	return __kvm_gmem_punch_hole(file_inode(file), index << PAGE_SHIFT, PAGE_SIZE << order);
> +}
> +
> +long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
> +		       int (*post_populate)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
> +					    void __user *src, int order, void *opaque),
> +		       void *opaque)
> +{
> +	struct file *file;
> +	struct kvm_memory_slot *slot;
> +
> +	int ret = 0, max_order;
> +	long i;
> +
> +	lockdep_assert_held(&kvm->slots_lock);
> +	if (npages < 0)
> +		return -EINVAL;
> +
> +	slot = gfn_to_memslot(kvm, gfn);
> +	if (!kvm_slot_can_be_private(slot))
> +		return -EINVAL;
> +
> +	file = kvm_gmem_get_file(slot);
> +	if (!file)
> +		return -EFAULT;
> +
> +	filemap_invalidate_lock(file->f_mapping);
> +
> +	npages = min_t(ulong, slot->npages - (gfn - slot->base_gfn), npages);
> +	for (i = 0; i < npages; i += (1 << max_order)) {
> +		gfn_t this_gfn = gfn + i;
> +		kvm_pfn_t pfn;
> +
> +		ret = __kvm_gmem_get_pfn(file, slot, this_gfn, &pfn, &max_order, false);
> +		if (ret)
> +			break;
> +
> +		if (!IS_ALIGNED(this_gfn, (1 << max_order)) ||
> +		    (npages - i) < (1 << max_order))
> +			max_order = 0;
> +
> +		if (post_populate) {
> +			void __user *p = src ? src + i * PAGE_SIZE : NULL;
> +			ret = post_populate(kvm, this_gfn, pfn, p, max_order, opaque);

I don't see the main difference between gmem_prepare() and post_populate()
from gmem's point of view.  They are both vendor callbacks invoked after
__filemap_get_folio().  Could gmem choose to call gmem_prepare() or
post_populate() outside __kvm_gmem_get_pfn()?  Or even pass all the
parameters to a single gmem_prepare() and let the vendor code decide what
to do?
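
For reference, the two hooks being compared are declared as follows in this
series (the kvm_arch_gmem_prepare() line is cut off in the hunk header above,
so its last parameter name is an assumption):

int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
			  int max_order);

int (*post_populate)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
		     void __user *src, int order, void *opaque);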

> +		}
> +
> +		put_page(pfn_to_page(pfn));
> +		if (ret) {
> +			/*
> +			 * Punch a hole so that FGP_CREAT_ONLY can succeed
> +			 * again.
> +			 */
> +			kvm_gmem_undo_get_pfn(file, slot, this_gfn, max_order);
> +			break;
> +		}
> +	}
> +
> +	filemap_invalidate_unlock(file->f_mapping);
> +
> +	fput(file);
> +	return ret && !i ? ret : i;
> +}
> +EXPORT_SYMBOL_GPL(kvm_gmem_populate);
> -- 
> 2.43.0
> 
> 
>
Isaku Yamahata April 23, 2024, 11:50 p.m. UTC | #2
On Thu, Apr 04, 2024 at 02:50:31PM -0400,
Paolo Bonzini <pbonzini@redhat.com> wrote:

> During guest run-time, kvm_arch_gmem_prepare() is issued as needed to
> prepare newly-allocated gmem pages prior to mapping them into the guest.
> In the case of SEV-SNP, this mainly involves setting the pages to
> private in the RMP table.
> 
> However, for the GPA ranges comprising the initial guest payload, which
> are encrypted/measured prior to starting the guest, the gmem pages need
> to be accessed prior to setting them to private in the RMP table so they
> can be initialized with the userspace-provided data. Additionally, an
> SNP firmware call is needed afterward to encrypt them in-place and
> measure the contents into the guest's launch digest.
> 
> While it is possible to bypass the kvm_arch_gmem_prepare() hooks so that
> this handling can be done in an open-coded/vendor-specific manner, this
> may expose more gmem-internal state/dependencies to external callers
> than necessary. Try to avoid this by implementing an interface that
> tries to handle as much of the common functionality inside gmem as
> possible, while also making it generic enough to potentially be
> usable/extensible for TDX as well.

I explored how TDX would use this hook.  However, it ended up not using this
hook, and instead used kvm_tdp_mmu_get_walk() with a twist.  The patch is below.

Because SEV-SNP manages the RMP, which is not tied to the NPT directly, SEV-SNP
can ignore the TDP MMU page tables when updating the RMP.
On the other hand, TDX essentially updates the Secure-EPT when it adds a page to
the guest via TDH.MEM.PAGE.ADD().  It needs to protect the KVM TDP MMU page
tables with mmu_lock, not the guest_memfd file mapping with invalidate_lock.
The hook doesn't apply well to TDX.  The resulting KVM_TDX_INIT_MEM_REGION
logic is as follows.

  get_user_pages_fast(source addr)
  read_lock(mmu_lock)
  kvm_tdp_mmu_get_walk_private_pfn(vcpu, gpa, &pfn);
  if the page table doesn't map gpa, error.
  TDH.MEM.PAGE.ADD()
  TDH.MR.EXTEND()
  read_unlock(mmu_lock)
  put_page()
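
Spelled out as code, the body of that handler would look roughly like the
sketch below; it assumes the kvm_tdp_mmu_get_walk_private_pfn() helper added
by the patch that follows, plus the tdh_mem_page_add()/tdh_mr_extend()
SEAMCALL wrappers from the TDX series:

	struct page *src_page;
	kvm_pfn_t pfn;
	int ret;
	u64 err;

	/* Pin the userspace source page holding the initial image. */
	ret = get_user_pages_fast(src_addr, 1, 0, &src_page);
	if (ret != 1)
		return ret < 0 ? ret : -ENOMEM;

	read_lock(&kvm->mmu_lock);

	/* Error out if the mirrored S-EPT does not already map the GPA. */
	ret = kvm_tdp_mmu_get_walk_private_pfn(vcpu, gpa, &pfn);
	if (ret < 0)
		goto out_unlock;

	err = tdh_mem_page_add(kvm_tdx, gpa, pfn_to_hpa(pfn),
			       pfn_to_hpa(page_to_pfn(src_page)), NULL);
	if (!err)
		err = tdh_mr_extend(kvm_tdx, gpa, NULL);
	ret = err ? -EIO : 0;

out_unlock:
	read_unlock(&kvm->mmu_lock);
	put_page(src_page);
	return ret;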


From 7d4024049b51969a2431805c2117992fc7ec0981 Mon Sep 17 00:00:00 2001
Message-ID: <7d4024049b51969a2431805c2117992fc7ec0981.1713913379.git.isaku.yamahata@intel.com>
In-Reply-To: <cover.1713913379.git.isaku.yamahata@intel.com>
References: <cover.1713913379.git.isaku.yamahata@intel.com>
From: Isaku Yamahata <isaku.yamahata@intel.com>
Date: Tue, 23 Apr 2024 11:33:44 -0700
Subject: [PATCH] KVM: x86/tdp_mmu: Add a helper function to walk down the TDP MMU

KVM_TDX_INIT_MEM_REGION needs to check if the given GFN is already
populated.  Add wrapping logic to kvm_tdp_mmu_get_walk() to export it.

Alternatives are as follows.  Choose the approach of this patch as the
least intrusive change.
- Refactor kvm page fault handler.  Populating part and unlock function.
  The page fault handler to populate with keeping lock, TDH.MEM.PAGE.ADD(),
  unlock.
- Add a callback function to struct kvm_page_fault and call it
  after the page fault handler before unlocking mmu_lock and releasing PFN.

Based on the feedback of
https://lore.kernel.org/kvm/ZfBkle1eZFfjPI8l@google.com/
https://lore.kernel.org/kvm/Zh8DHbb8FzoVErgX@google.com/

Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 arch/x86/kvm/mmu.h         |  3 +++
 arch/x86/kvm/mmu/tdp_mmu.c | 44 ++++++++++++++++++++++++++++++++------
 2 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 712e9408f634..4f61f4b9fd64 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -287,6 +287,9 @@ extern bool tdp_mmu_enabled;
 #define tdp_mmu_enabled false
 #endif
 
+int kvm_tdp_mmu_get_walk_private_pfn(struct kvm_vcpu *vcpu, u64 gpa,
+				     kvm_pfn_t *pfn);
+
 static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
 {
 	return !tdp_mmu_enabled || kvm_shadow_root_allocated(kvm);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 3592ae4e485f..bafcd8aeb3b3 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -2035,14 +2035,25 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
  *
  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
  */
-int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
-			 int *root_level)
+static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+				  bool is_private)
 {
 	struct tdp_iter iter;
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
 	gfn_t gfn = addr >> PAGE_SHIFT;
 	int leaf = -1;
 
+	tdp_mmu_for_each_pte(iter, mmu, is_private, gfn, gfn + 1) {
+		leaf = iter.level;
+		sptes[leaf] = iter.old_spte;
+	}
+
+	return leaf;
+}
+
+int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+			 int *root_level)
+{
 	*root_level = vcpu->arch.mmu->root_role.level;
 
 	/*
@@ -2050,15 +2061,34 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
 	 * instructions in protected guest memory can't be parsed by VMM.
 	 */
 	if (WARN_ON_ONCE(kvm_gfn_shared_mask(vcpu->kvm)))
-		return leaf;
+		return -1;
 
-	tdp_mmu_for_each_pte(iter, mmu, false, gfn, gfn + 1) {
-		leaf = iter.level;
-		sptes[leaf] = iter.old_spte;
+	return __kvm_tdp_mmu_get_walk(vcpu, addr, sptes, false);
+}
+
+int kvm_tdp_mmu_get_walk_private_pfn(struct kvm_vcpu *vcpu, u64 gpa,
+				     kvm_pfn_t *pfn)
+{
+	u64 sptes[PT64_ROOT_MAX_LEVEL + 1], spte;
+	int leaf;
+
+	lockdep_assert_held(&vcpu->kvm->mmu_lock);
+
+	kvm_tdp_mmu_walk_lockless_begin();
+	leaf = __kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, true);
+	kvm_tdp_mmu_walk_lockless_end();
+	if (leaf < 0)
+		return -ENOENT;
+
+	spte = sptes[leaf];
+	if (is_shadow_present_pte(spte) && is_last_spte(spte, leaf)) {
+		*pfn = spte_to_pfn(spte);
+		return leaf;
 	}
 
-	return leaf;
+	return -ENOENT;
 }
+EXPORT_SYMBOL_GPL(kvm_tdp_mmu_get_walk_private_pfn);
 
 /*
  * Returns the last level spte pointer of the shadow page walk for the given
Sean Christopherson April 24, 2024, 10:24 p.m. UTC | #3
On Tue, Apr 23, 2024, Isaku Yamahata wrote:
> On Thu, Apr 04, 2024 at 02:50:31PM -0400,
> Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> > During guest run-time, kvm_arch_gmem_prepare() is issued as needed to
> > prepare newly-allocated gmem pages prior to mapping them into the guest.
> > In the case of SEV-SNP, this mainly involves setting the pages to
> > private in the RMP table.
> > 
> > However, for the GPA ranges comprising the initial guest payload, which
> > are encrypted/measured prior to starting the guest, the gmem pages need
> > to be accessed prior to setting them to private in the RMP table so they
> > can be initialized with the userspace-provided data. Additionally, an
> > SNP firmware call is needed afterward to encrypt them in-place and
> > measure the contents into the guest's launch digest.
> > 
> > While it is possible to bypass the kvm_arch_gmem_prepare() hooks so that
> > this handling can be done in an open-coded/vendor-specific manner, this
> > may expose more gmem-internal state/dependencies to external callers
> > than necessary. Try to avoid this by implementing an interface that
> > tries to handle as much of the common functionality inside gmem as
> > possible, while also making it generic enough to potentially be
> > usable/extensible for TDX as well.
> 
> I explored how TDX would use this hook.  However, it ended up not using this
> hook, and instead used kvm_tdp_mmu_get_walk() with a twist.  The patch is below.
> 
> Because SEV-SNP manages the RMP, which is not tied to the NPT directly, SEV-SNP
> can ignore the TDP MMU page tables when updating the RMP.
> On the other hand, TDX essentially updates the Secure-EPT when it adds a page to
> the guest via TDH.MEM.PAGE.ADD().  It needs to protect the KVM TDP MMU page
> tables with mmu_lock, not the guest_memfd file mapping with invalidate_lock.
> The hook doesn't apply well to TDX.  The resulting KVM_TDX_INIT_MEM_REGION
> logic is as follows.
> 
>   get_user_pages_fast(source addr)
>   read_lock(mmu_lock)
>   kvm_tdp_mmu_get_walk_private_pfn(vcpu, gpa, &pfn);
>   if the page table doesn't map gpa, error.
>   TDH.MEM.PAGE.ADD()
>   TDH.MR.EXTEND()
>   read_unlock(mmu_lock)
>   put_page()

Hmm, KVM doesn't _need_ to use invalidate_lock to protect against guest_memfd
invalidation, but I also don't see why it would cause problems.  I.e. why not
take mmu_lock() in TDX's post_populate() implementation?  That would allow having
a sanity check that the PFN that guest_memfd() has is indeed the PFN that KVM's
S-EPT mirror has, i.e. the PFN that KVM is going to PAGE.ADD.
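
Concretely, the TDX post_populate() callback could do that along these lines
(a sketch only, assuming the kvm_tdp_mmu_get_walk_private_pfn() helper from
the patch above and that the vCPU is passed through @opaque):

static int tdx_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
			     void __user *src, int order, void *opaque)
{
	struct kvm_vcpu *vcpu = opaque;
	kvm_pfn_t mmu_pfn;
	int ret;

	read_lock(&kvm->mmu_lock);

	/* The S-EPT mirror must map the same PFN that guest_memfd handed us. */
	ret = kvm_tdp_mmu_get_walk_private_pfn(vcpu, gfn_to_gpa(gfn), &mmu_pfn);
	if (ret < 0 || mmu_pfn != pfn) {
		ret = ret < 0 ? ret : -EIO;
		goto out;
	}

	/* TDH.MEM.PAGE.ADD / TDH.MR.EXTEND against @pfn would go here. */
	ret = 0;
out:
	read_unlock(&kvm->mmu_lock);
	return ret;
}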

> >From 7d4024049b51969a2431805c2117992fc7ec0981 Mon Sep 17 00:00:00 2001
> Message-ID: <7d4024049b51969a2431805c2117992fc7ec0981.1713913379.git.isaku.yamahata@intel.com>
> In-Reply-To: <cover.1713913379.git.isaku.yamahata@intel.com>
> References: <cover.1713913379.git.isaku.yamahata@intel.com>
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> Date: Tue, 23 Apr 2024 11:33:44 -0700
> Subject: [PATCH] KVM: x86/tdp_mmu: Add a helper function to walk down the TDP MMU
> 
> KVM_TDX_INIT_MEM_REGION needs to check if the given GFN is already
> populated.  Add wrapping logic to kvm_tdp_mmu_get_walk() to export it.

> +int kvm_tdp_mmu_get_walk_private_pfn(struct kvm_vcpu *vcpu, u64 gpa,
> +				     kvm_pfn_t *pfn)
> +{
> +	u64 sptes[PT64_ROOT_MAX_LEVEL + 1], spte;
> +	int leaf;
> +
> +	lockdep_assert_held(&vcpu->kvm->mmu_lock);
> +
> +	kvm_tdp_mmu_walk_lockless_begin();

This is obviously not a lockless walk.

> +	leaf = __kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, true);
> +	kvm_tdp_mmu_walk_lockless_end();
> +	if (leaf < 0)
> +		return -ENOENT;
> +
> +	spte = sptes[leaf];
> +	if (is_shadow_present_pte(spte) && is_last_spte(spte, leaf)) {
> +		*pfn = spte_to_pfn(spte);
> +		return leaf;
>  	}
>  
> -	return leaf;
> +	return -ENOENT;
>  }
> +EXPORT_SYMBOL_GPL(kvm_tdp_mmu_get_walk_private_pfn);
>  
>  /*
>   * Returns the last level spte pointer of the shadow page walk for the given
> -- 
> 2.43.2
> 
> -- 
> Isaku Yamahata <isaku.yamahata@intel.com>
Sean Christopherson April 24, 2024, 10:32 p.m. UTC | #4
On Thu, Apr 04, 2024, Paolo Bonzini wrote:
> +long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
> +		       int (*post_populate)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
> +					    void __user *src, int order, void *opaque),

Add a typedef for callback?  If only to make this prototype readable.
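
For illustration, something along these lines (the typedef name is made up
here, not part of the patch):

typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
				    void __user *src, int order, void *opaque);

long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
		       kvm_gmem_populate_cb post_populate, void *opaque);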

> +long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
> +		       int (*post_populate)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
> +					    void __user *src, int order, void *opaque),
> +		       void *opaque)
> +{
> +	struct file *file;
> +	struct kvm_memory_slot *slot;
> +
> +	int ret = 0, max_order;
> +	long i;
> +
> +	lockdep_assert_held(&kvm->slots_lock);
> +	if (npages < 0)
> +		return -EINVAL;
> +
> +	slot = gfn_to_memslot(kvm, gfn);
> +	if (!kvm_slot_can_be_private(slot))
> +		return -EINVAL;
> +
> +	file = kvm_gmem_get_file(slot);
> +	if (!file)
> +		return -EFAULT;
> +
> +	filemap_invalidate_lock(file->f_mapping);
> +
> +	npages = min_t(ulong, slot->npages - (gfn - slot->base_gfn), npages);
> +	for (i = 0; i < npages; i += (1 << max_order)) {
> +		gfn_t this_gfn = gfn + i;

KVM usually does something like "start_gfn" or "base_gfn", and then uses "gfn"
for the one gfn that's being processed.  And IMO that's much better because the
prototype for kvm_gmem_populate() does not make it obvious that @gfn is the base
of a range, not a singular gfn to process.


> +		kvm_pfn_t pfn;
> +
> +		ret = __kvm_gmem_get_pfn(file, slot, this_gfn, &pfn, &max_order, false);
> +		if (ret)
> +			break;
> +
> +		if (!IS_ALIGNED(this_gfn, (1 << max_order)) ||
> +		    (npages - i) < (1 << max_order))
> +			max_order = 0;
> +
> +		if (post_populate) {

Is there any use for this without @post_populate?  I.e. why make this optional?

> +			void __user *p = src ? src + i * PAGE_SIZE : NULL;

Eh, I would vote to define "p" at the top of the loop.

> +			ret = post_populate(kvm, this_gfn, pfn, p, max_order, opaque);
> +		}
> +
> +		put_page(pfn_to_page(pfn));
> +		if (ret) {
> +			/*
> +			 * Punch a hole so that FGP_CREAT_ONLY can succeed
> +			 * again.
> +			 */
> +			kvm_gmem_undo_get_pfn(file, slot, this_gfn, max_order);
> +			break;
> +		}
> +	}
> +
> +	filemap_invalidate_unlock(file->f_mapping);
> +
> +	fput(file);
> +	return ret && !i ? ret : i;
> +}
> +EXPORT_SYMBOL_GPL(kvm_gmem_populate);
> -- 
> 2.43.0
> 
>
Isaku Yamahata April 25, 2024, 1:12 a.m. UTC | #5
On Wed, Apr 24, 2024 at 03:24:58PM -0700,
Sean Christopherson <seanjc@google.com> wrote:

> On Tue, Apr 23, 2024, Isaku Yamahata wrote:
> > On Thu, Apr 04, 2024 at 02:50:31PM -0400,
> > Paolo Bonzini <pbonzini@redhat.com> wrote:
> > 
> > > During guest run-time, kvm_arch_gmem_prepare() is issued as needed to
> > > prepare newly-allocated gmem pages prior to mapping them into the guest.
> > > In the case of SEV-SNP, this mainly involves setting the pages to
> > > private in the RMP table.
> > > 
> > > However, for the GPA ranges comprising the initial guest payload, which
> > > are encrypted/measured prior to starting the guest, the gmem pages need
> > > to be accessed prior to setting them to private in the RMP table so they
> > > can be initialized with the userspace-provided data. Additionally, an
> > > SNP firmware call is needed afterward to encrypt them in-place and
> > > measure the contents into the guest's launch digest.
> > > 
> > > While it is possible to bypass the kvm_arch_gmem_prepare() hooks so that
> > > this handling can be done in an open-coded/vendor-specific manner, this
> > > may expose more gmem-internal state/dependencies to external callers
> > > than necessary. Try to avoid this by implementing an interface that
> > > tries to handle as much of the common functionality inside gmem as
> > > possible, while also making it generic enough to potentially be
> > > usable/extensible for TDX as well.
> > 
> > I explored how TDX would use this hook.  However, it ended up not using this
> > hook, and instead used kvm_tdp_mmu_get_walk() with a twist.  The patch is below.
> > 
> > Because SEV-SNP manages the RMP, which is not tied to the NPT directly, SEV-SNP
> > can ignore the TDP MMU page tables when updating the RMP.
> > On the other hand, TDX essentially updates the Secure-EPT when it adds a page to
> > the guest via TDH.MEM.PAGE.ADD().  It needs to protect the KVM TDP MMU page
> > tables with mmu_lock, not the guest_memfd file mapping with invalidate_lock.
> > The hook doesn't apply well to TDX.  The resulting KVM_TDX_INIT_MEM_REGION
> > logic is as follows.
> > 
> >   get_user_pages_fast(source addr)
> >   read_lock(mmu_lock)
> >   kvm_tdp_mmu_get_walk_private_pfn(vcpu, gpa, &pfn);
> >   if the page table doesn't map gpa, error.
> >   TDH.MEM.PAGE.ADD()
> >   TDH.MR.EXTEND()
> >   read_unlock(mmu_lock)
> >   put_page()
> 
> Hmm, KVM doesn't _need_ to use invalidate_lock to protect against guest_memfd
> invalidation, but I also don't see why it would cause problems.  I.e. why not
> take mmu_lock() in TDX's post_populate() implementation?

We can take the lock.  Because we have already populated the GFN of guest_memfd,
we need to make kvm_gmem_populate() not pass FGP_CREAT_ONLY.  Otherwise we'll
get -EEXIST.


> That would allow having
> a sanity check that the PFN that guest_memfd() has is indeed the PFN that KVM's
> S-EPT mirror has, i.e. the PFN that KVM is going to PAGE.ADD.

Because we have the PFN from the mirrored EPT, I thought it would be redundant
to get the PFN again via guest_memfd.  We can check whether the two PFNs match.
Paolo Bonzini April 25, 2024, 5:56 a.m. UTC | #6
On Thu, Apr 25, 2024 at 12:32 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Apr 04, 2024, Paolo Bonzini wrote:
> > +long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
> > +                    int (*post_populate)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
> > +                                         void __user *src, int order, void *opaque),
>
> Add a typedef for callback?  If only to make this prototype readable.
>
> > +long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
> > +                    int (*post_populate)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
> > +                                         void __user *src, int order, void *opaque),
> > +                    void *opaque)
> > +{
> > +     struct file *file;
> > +     struct kvm_memory_slot *slot;
> > +
> > +     int ret = 0, max_order;
> > +     long i;
> > +
> > +     lockdep_assert_held(&kvm->slots_lock);
> > +     if (npages < 0)
> > +             return -EINVAL;
> > +
> > +     slot = gfn_to_memslot(kvm, gfn);
> > +     if (!kvm_slot_can_be_private(slot))
> > +             return -EINVAL;
> > +
> > +     file = kvm_gmem_get_file(slot);
> > +     if (!file)
> > +             return -EFAULT;
> > +
> > +     filemap_invalidate_lock(file->f_mapping);
> > +
> > +     npages = min_t(ulong, slot->npages - (gfn - slot->base_gfn), npages);
> > +     for (i = 0; i < npages; i += (1 << max_order)) {
> > +             gfn_t this_gfn = gfn + i;
>
> KVM usually does something like "start_gfn" or "base_gfn", and then uses "gfn"
> for the one gfn that's being processed.  And IMO that's much better because the
> prototype for kvm_gmem_populate() does not make it obvious that @gfn is the base
> of a range, not a singular gfn to process.
>
>
> > +             kvm_pfn_t pfn;
> > +
> > +             ret = __kvm_gmem_get_pfn(file, slot, this_gfn, &pfn, &max_order, false);
> > +             if (ret)
> > +                     break;
> > +
> > +             if (!IS_ALIGNED(this_gfn, (1 << max_order)) ||
> > +                 (npages - i) < (1 << max_order))
> > +                     max_order = 0;
> > +
> > +             if (post_populate) {
>
> Is there any use for this without @post_populate?  I.e. why make this optional?

Yeah, it probably does not need to be optional (before, the
copy_from_user was optionally done from kvm_gmem_populate, but not
anymore).

Paolo
Paolo Bonzini April 25, 2024, 6:01 a.m. UTC | #7
On Thu, Apr 25, 2024 at 3:12 AM Isaku Yamahata <isaku.yamahata@intel.com> wrote:
> > >   get_user_pages_fast(source addr)
> > >   read_lock(mmu_lock)
> > >   kvm_tdp_mmu_get_walk_private_pfn(vcpu, gpa, &pfn);
> > >   if the page table doesn't map gpa, error.
> > >   TDH.MEM.PAGE.ADD()
> > >   TDH.MR.EXTEND()
> > >   read_unlock(mmu_lock)
> > >   put_page()
> >
> > Hmm, KVM doesn't _need_ to use invalidate_lock to protect against guest_memfd
> > invalidation, but I also don't see why it would cause problems.

The invalidate_lock is only needed to operate on the guest_memfd, but
it's a rwsem so there are no risks of lock inversion.

> > I.e. why not
> > take mmu_lock() in TDX's post_populate() implementation?
>
> We can take the lock.  Because we have already populated the GFN of guest_memfd,
> we need to make kvm_gmem_populate() not pass FGP_CREAT_ONLY.  Otherwise we'll
> get -EEXIST.

I don't understand why TDH.MEM.PAGE.ADD() cannot be called from the
post-populate hook. Can the code for TDH.MEM.PAGE.ADD be shared
between the memory initialization ioctl and the page fault hook in
kvm_x86_ops?

Paolo

>
> > That would allow having
> > a sanity check that the PFN that guest_memfd() has is indeed the PFN that KVM's
> > S-EPT mirror has, i.e. the PFN that KVM is going to PAGE.ADD.
>
> Because we have the PFN from the mirrored EPT, I thought it would be redundant
> to get the PFN again via guest_memfd.  We can check whether the two PFNs match.
> --
> Isaku Yamahata <isaku.yamahata@intel.com>
>
Sean Christopherson April 25, 2024, 4 p.m. UTC | #8
On Thu, Apr 25, 2024, Paolo Bonzini wrote:
> On Thu, Apr 25, 2024 at 3:12 AM Isaku Yamahata <isaku.yamahata@intel.com> wrote:
> > > >   get_user_pages_fast(source addr)
> > > >   read_lock(mmu_lock)
> > > >   kvm_tdp_mmu_get_walk_private_pfn(vcpu, gpa, &pfn);
> > > >   if the page table doesn't map gpa, error.
> > > >   TDH.MEM.PAGE.ADD()
> > > >   TDH.MR.EXTEND()
> > > >   read_unlock(mmu_lock)
> > > >   put_page()
> > >
> > > Hmm, KVM doesn't _need_ to use invalidate_lock to protect against guest_memfd
> > > invalidation, but I also don't see why it would cause problems.
> 
> The invalidate_lock is only needed to operate on the guest_memfd, but
> it's a rwsem so there are no risks of lock inversion.
> 
> > > I.e. why not
> > > take mmu_lock() in TDX's post_populate() implementation?
> >
> > We can take the lock.  Because we have already populated the GFN of guest_memfd,
> > we need to make kvm_gmem_populate() not pass FGP_CREAT_ONLY.  Otherwise we'll
> > get -EEXIST.
> 
> I don't understand why TDH.MEM.PAGE.ADD() cannot be called from the
> post-populate hook. Can the code for TDH.MEM.PAGE.ADD be shared
> between the memory initialization ioctl and the page fault hook in
> kvm_x86_ops?

Ah, because TDX is required to pre-fault the memory to establish the S-EPT walk,
and pre-faulting means guest_memfd() 

Requiring that guest_memfd not have a page when initializing the guest image
seems wrong, i.e. I don't think we want FGP_CREAT_ONLY.  And not just because I
am a fan of pre-faulting, I think the semantics are bad.

E.g. IIUC, doing fallocate() to ensure memory is available would cause LAUNCH_UPDATE
to fail.  That's weird and has nothing to do with KVM_PRE_FAULT.

I don't understand why we want FGP_CREAT_ONLY semantics.  Who cares if there's a
page allocated?  KVM already checks that the page is unassigned in the RMP, so
why does guest_memfd care whether or not the page was _just_ allocated?

AFAIK, unwinding on failure is completely uninteresting, and arguably undesirable,
because undoing LAUNCH_UPDATE or PAGE.ADD will affect the measurement, i.e. there
is no scenario where deleting pages from guest_memfd would allow a restart/resume
of the build process to truly succeed.
Isaku Yamahata April 25, 2024, 4:51 p.m. UTC | #9
On Thu, Apr 25, 2024 at 09:00:38AM -0700,
Sean Christopherson <seanjc@google.com> wrote:

> On Thu, Apr 25, 2024, Paolo Bonzini wrote:
> > On Thu, Apr 25, 2024 at 3:12 AM Isaku Yamahata <isaku.yamahata@intel.com> wrote:
> > > > >   get_user_pages_fast(source addr)
> > > > >   read_lock(mmu_lock)
> > > > >   kvm_tdp_mmu_get_walk_private_pfn(vcpu, gpa, &pfn);
> > > > >   if the page table doesn't map gpa, error.
> > > > >   TDH.MEM.PAGE.ADD()
> > > > >   TDH.MR.EXTEND()
> > > > >   read_unlock(mmu_lock)
> > > > >   put_page()
> > > >
> > > > Hmm, KVM doesn't _need_ to use invalidate_lock to protect against guest_memfd
> > > > invalidation, but I also don't see why it would cause problems.
> > 
> > The invalidate_lock is only needed to operate on the guest_memfd, but
> > it's a rwsem so there are no risks of lock inversion.
> > 
> > > > I.e. why not
> > > > take mmu_lock() in TDX's post_populate() implementation?
> > >
> > > We can take the lock.  Because we have already populated the GFN of guest_memfd,
> > > we need to make kvm_gmem_populate() not pass FGP_CREAT_ONLY.  Otherwise we'll
> > > get -EEXIST.
> > 
> > I don't understand why TDH.MEM.PAGE.ADD() cannot be called from the
> > post-populate hook. Can the code for TDH.MEM.PAGE.ADD be shared
> > between the memory initialization ioctl and the page fault hook in
> > kvm_x86_ops?
> 
> Ah, because TDX is required to pre-fault the memory to establish the S-EPT walk,
> and pre-faulting means guest_memfd() 
> 
> Requiring that guest_memfd not have a page when initializing the guest image
> seems wrong, i.e. I don't think we want FGP_CREAT_ONLY.  And not just because I
> am a fan of pre-faulting, I think the semantics are bad.
> 
> E.g. IIUC, doing fallocate() to ensure memory is available would cause LAUNCH_UPDATE
> to fail.  That's weird and has nothing to do with KVM_PRE_FAULT.
> 
> I don't understand why we want FGP_CREAT_ONLY semantics.  Who cares if there's a
> page allocated?  KVM already checks that the page is unassigned in the RMP, so
> why does guest_memfd care whether or not the page was _just_ allocated?
> 
> AFAIK, unwinding on failure is completely uninteresting, and arguably undesirable,
> because undoing LAUNCH_UPDATE or PAGE.ADD will affect the measurement, i.e. there
> is no scenario where deleting pages from guest_memfd would allow a restart/resume
> of the build process to truly succeed.


Just for the record: with the following twist to kvm_gmem_populate(),
KVM_TDX_INIT_MEM_REGION can use kvm_gmem_populate().  For those who are curious,
I also append the callback implementation at the end.

--

 include/linux/kvm_host.h | 2 ++
 virt/kvm/guest_memfd.c   | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index df957c9f9115..7c86b77f8895 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2460,6 +2460,7 @@ bool kvm_arch_gmem_prepare_needed(struct kvm *kvm);
  *       (passed to @post_populate, and incremented on each iteration
  *       if not NULL)
  * @npages: number of pages to copy from userspace-buffer
+ * @prepare: Allow page allocation to invoke gmem_prepare hook
  * @post_populate: callback to issue for each gmem page that backs the GPA
  *                 range
  * @opaque: opaque data to pass to @post_populate callback
@@ -2473,6 +2474,7 @@ bool kvm_arch_gmem_prepare_needed(struct kvm *kvm);
  * Returns the number of pages that were populated.
  */
 long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
+		       bool prepare,
 		       int (*post_populate)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 					    void __user *src, int order, void *opaque),
 		       void *opaque);
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 3195ceefe915..18809e6dea8a 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -638,6 +638,7 @@ static int kvm_gmem_undo_get_pfn(struct file *file, struct kvm_memory_slot *slot
 }
 
 long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
+		       bool prepare,
 		       int (*post_populate)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 					    void __user *src, int order, void *opaque),
 		       void *opaque)
@@ -667,7 +668,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages
 		gfn_t this_gfn = gfn + i;
 		kvm_pfn_t pfn;
 
-		ret = __kvm_gmem_get_pfn(file, slot, this_gfn, &pfn, &max_order, false);
+		ret = __kvm_gmem_get_pfn(file, slot, this_gfn, &pfn, &max_order, prepare);
 		if (ret)
 			break;
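
A hypothetical call site for this variant, using the tdx_gmem_post_populate()
callback appended at the end (the region fields here are assumptions):

	struct tdx_gmem_post_populate_arg arg = {
		.vcpu = vcpu,
		.flags = region.flags,
	};
	long ret;

	/* With kvm->slots_lock held and the srcu read-side lock taken. */
	ret = kvm_gmem_populate(kvm, gfn, u64_to_user_ptr(region.source_addr),
				region.nr_pages, false /* prepare */,
				tdx_gmem_post_populate, &arg);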
Paolo Bonzini April 26, 2024, 5:41 a.m. UTC | #10
On Thu, Apr 25, 2024 at 6:00 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Apr 25, 2024, Paolo Bonzini wrote:
> > On Thu, Apr 25, 2024 at 3:12 AM Isaku Yamahata <isaku.yamahata@intel.com> wrote:
> > > > >   get_user_pages_fast(source addr)
> > > > >   read_lock(mmu_lock)
> > > > >   kvm_tdp_mmu_get_walk_private_pfn(vcpu, gpa, &pfn);
> > > > >   if the page table doesn't map gpa, error.
> > > > >   TDH.MEM.PAGE.ADD()
> > > > >   TDH.MR.EXTEND()
> > > > >   read_unlock(mmu_lock)
> > > > >   put_page()
> > > >
> > > > Hmm, KVM doesn't _need_ to use invalidate_lock to protect against guest_memfd
> > > > invalidation, but I also don't see why it would cause problems.
> >
> > The invalidate_lock is only needed to operate on the guest_memfd, but
> > it's a rwsem so there are no risks of lock inversion.
> >
> > > > I.e. why not
> > > > take mmu_lock() in TDX's post_populate() implementation?
> > >
> > > We can take the lock.  Because we have already populated the GFN of guest_memfd,
> > > we need to make kvm_gmem_populate() not pass FGP_CREAT_ONLY.  Otherwise we'll
> > > get -EEXIST.
> >
> > I don't understand why TDH.MEM.PAGE.ADD() cannot be called from the
> > post-populate hook. Can the code for TDH.MEM.PAGE.ADD be shared
> > between the memory initialization ioctl and the page fault hook in
> > kvm_x86_ops?
>
> Ah, because TDX is required to pre-fault the memory to establish the S-EPT walk,
> and pre-faulting means guest_memfd()
>
> Requiring that guest_memfd not have a page when initializing the guest image
> seems wrong, i.e. I don't think we want FGP_CREAT_ONLY.  And not just because I
> am a fan of pre-faulting, I think the semantics are bad.

Ok, fair enough. I wanted to do the once-only test in common code but
since SEV code checks for the RMP I can remove that. One less
headache.

Paolo
Paolo Bonzini April 26, 2024, 5:44 a.m. UTC | #11
On Thu, Apr 25, 2024 at 6:51 PM Isaku Yamahata <isaku.yamahata@intel.com> wrote:
> > AFAIK, unwinding on failure is completely uninteresting, and arguably undesirable,
> > because undoing LAUNCH_UPDATE or PAGE.ADD will affect the measurement, i.e. there
> > is no scenario where deleting pages from guest_memfd would allow a restart/resume
> > of the build process to truly succeed.
>
>
> Just for the record: with the following twist to kvm_gmem_populate(),
> KVM_TDX_INIT_MEM_REGION can use kvm_gmem_populate().  For those who are curious,
> I also append the callback implementation at the end.

Nice, thank you very much. Since TDX does not need
HAVE_KVM_GMEM_PREPARE, if I get rid of FGP_CREAT_ONLY it will work for
you, right?

Paolo

>
> --
>
>  include/linux/kvm_host.h | 2 ++
>  virt/kvm/guest_memfd.c   | 3 ++-
>  2 files changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index df957c9f9115..7c86b77f8895 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2460,6 +2460,7 @@ bool kvm_arch_gmem_prepare_needed(struct kvm *kvm);
>   *       (passed to @post_populate, and incremented on each iteration
>   *       if not NULL)
>   * @npages: number of pages to copy from userspace-buffer
> + * @prepare: Allow page allocation to invoke gmem_prepare hook
>   * @post_populate: callback to issue for each gmem page that backs the GPA
>   *                 range
>   * @opaque: opaque data to pass to @post_populate callback
> @@ -2473,6 +2474,7 @@ bool kvm_arch_gmem_prepare_needed(struct kvm *kvm);
>   * Returns the number of pages that were populated.
>   */
>  long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
> +                      bool prepare,
>                        int (*post_populate)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
>                                             void __user *src, int order, void *opaque),
>                        void *opaque);
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 3195ceefe915..18809e6dea8a 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -638,6 +638,7 @@ static int kvm_gmem_undo_get_pfn(struct file *file, struct kvm_memory_slot *slot
>  }
>
>  long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
> +                      bool prepare,
>                        int (*post_populate)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
>                                             void __user *src, int order, void *opaque),
>                        void *opaque)
> @@ -667,7 +668,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages
>                 gfn_t this_gfn = gfn + i;
>                 kvm_pfn_t pfn;
>
> -               ret = __kvm_gmem_get_pfn(file, slot, this_gfn, &pfn, &max_order, false);
> +               ret = __kvm_gmem_get_pfn(file, slot, this_gfn, &pfn, &max_order, prepare);
>                 if (ret)
>                         break;
>
> --
> 2.43.2
>
>
> Here is the callback for KVM_TDX_INIT_MEM_REGION.
> Note: the caller of kvm_gmem_populate() acquires mutex_lock(&kvm->slots_lock)
> and idx = srcu_read_lock(&kvm->srcu).
>
>
> struct tdx_gmem_post_populate_arg {
>         struct kvm_vcpu *vcpu;
>         __u32 flags;
> };
>
> static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
>                                   void __user *src, int order, void *_arg)
> {
>         struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
>         struct tdx_gmem_post_populate_arg *arg = _arg;
>         struct kvm_vcpu *vcpu = arg->vcpu;
>         struct kvm_memory_slot *slot;
>         gpa_t gpa = gfn_to_gpa(gfn);
>         struct page *page;
>         kvm_pfn_t mmu_pfn;
>         int ret, i;
>         u64 err;
>
>         /* Pin the source page. */
>         ret = get_user_pages_fast((unsigned long)src, 1, 0, &page);
>         if (ret < 0)
>                 return ret;
>         if (ret != 1)
>                 return -ENOMEM;
>
>         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
>         if (!kvm_slot_can_be_private(slot) || !kvm_mem_is_private(kvm, gfn)) {
>                 ret = -EFAULT;
>                 goto out_put_page;
>         }
>
>         read_lock(&kvm->mmu_lock);
>
>         ret = kvm_tdp_mmu_get_walk_private_pfn(vcpu, gpa, &mmu_pfn);
>         if (ret < 0)
>                 goto out;
>         if (ret > PG_LEVEL_4K) {
>                 ret = -EINVAL;
>                 goto out;
>         }
>         if (mmu_pfn != pfn) {
>                 ret = -EAGAIN;
>                 goto out;
>         }
>
>         ret = 0;
>         do {
>                 err = tdh_mem_page_add(kvm_tdx, gpa, pfn_to_hpa(pfn),
>                                        pfn_to_hpa(page_to_pfn(page)), NULL);
>         } while (err == TDX_ERROR_SEPT_BUSY);
>         if (err) {
>                 ret = -EIO;
>                 goto out;
>         }
>
>         WARN_ON_ONCE(!atomic64_read(&kvm_tdx->nr_premapped));
>         atomic64_dec(&kvm_tdx->nr_premapped);
>         tdx_account_td_pages(vcpu->kvm, PG_LEVEL_4K);
>
>         if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
>                 for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
>                         err = tdh_mr_extend(kvm_tdx, gpa + i, NULL);
>                         if (err) {
>                                 ret = -EIO;
>                                 break;
>                         }
>                 }
>         }
>
> out:
>         read_unlock(&kvm->mmu_lock);
> out_put_page:
>         put_page(page);
>         return ret;
> }
>
> --
> Isaku Yamahata <isaku.yamahata@intel.com>
>
Sean Christopherson April 26, 2024, 3:17 p.m. UTC | #12
On Fri, Apr 26, 2024, Paolo Bonzini wrote:
> On Thu, Apr 25, 2024 at 6:00 PM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Thu, Apr 25, 2024, Paolo Bonzini wrote:
> > > On Thu, Apr 25, 2024 at 3:12 AM Isaku Yamahata <isaku.yamahata@intel.com> wrote:
> > > > > >   get_user_pages_fast(source addr)
> > > > > >   read_lock(mmu_lock)
> > > > > >   kvm_tdp_mmu_get_walk_private_pfn(vcpu, gpa, &pfn);
> > > > > >   if the page table doesn't map gpa, error.
> > > > > >   TDH.MEM.PAGE.ADD()
> > > > > >   TDH.MR.EXTEND()
> > > > > >   read_unlock(mmu_lock)
> > > > > >   put_page()
> > > > >
> > > > > Hmm, KVM doesn't _need_ to use invalidate_lock to protect against guest_memfd
> > > > > invalidation, but I also don't see why it would cause problems.
> > >
> > > The invalidate_lock is only needed to operate on the guest_memfd, but
> > > it's a rwsem so there are no risks of lock inversion.
> > >
> > > > > I.e. why not
> > > > > take mmu_lock() in TDX's post_populate() implementation?
> > > >
> > > > We can take the lock.  Because we have already populated the GFN of guest_memfd,
> > > > we need to make kvm_gmem_populate() not pass FGP_CREAT_ONLY.  Otherwise we'll
> > > > get -EEXIST.
> > >
> > > I don't understand why TDH.MEM.PAGE.ADD() cannot be called from the
> > > post-populate hook. Can the code for TDH.MEM.PAGE.ADD be shared
> > > between the memory initialization ioctl and the page fault hook in
> > > kvm_x86_ops?
> >
> > Ah, because TDX is required to pre-fault the memory to establish the S-EPT walk,
> > and pre-faulting means guest_memfd()
> >
> > Requiring that guest_memfd not have a page when initializing the guest image
> > seems wrong, i.e. I don't think we want FGP_CREAT_ONLY.  And not just because I
> > am a fan of pre-faulting, I think the semantics are bad.
> 
> Ok, fair enough. I wanted to do the once-only test in common code but since
> SEV code checks for the RMP I can remove that. One less headache.

I definitely don't object to having a check in common code, and I'd be in favor
of removing the RMP checks if possible, but tracking needs to be something more
explicit in guest_memfd.

*sigh*

I even left behind a TODO for this exact thing, and y'all didn't even wave at it
as you flew by :-)

        /*
         * Use the up-to-date flag to track whether or not the memory has been
         * zeroed before being handed off to the guest.  There is no backing
         * storage for the memory, so the folio will remain up-to-date until
         * it's removed.
         *
         * TODO: Skip clearing pages when trusted firmware will do it when  <==========================
         * assigning memory to the guest.
         */
        if (!folio_test_uptodate(folio)) {
                unsigned long nr_pages = folio_nr_pages(folio);
                unsigned long i;

                for (i = 0; i < nr_pages; i++)
                        clear_highpage(folio_page(folio, i));

                folio_mark_uptodate(folio);
        }

        if (prepare) {
                int r = kvm_gmem_prepare_folio(inode, index, folio);
                if (r < 0) {
                        folio_unlock(folio);
                        folio_put(folio);
                        return ERR_PTR(r);
                }
        }

Compile tested only (and not even fully as I didn't bother defining
CONFIG_HAVE_KVM_GMEM_INITIALIZE), but I think this is the basic gist.

8< --------------------------------

// SPDX-License-Identifier: GPL-2.0
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/kvm_host.h>
#include <linux/pagemap.h>
#include <linux/anon_inodes.h>

#include "kvm_mm.h"

struct kvm_gmem {
	struct kvm *kvm;
	struct xarray bindings;
	struct list_head entry;
};

static int kvm_gmem_initialize_folio(struct kvm *kvm, struct folio *folio,
				     pgoff_t index, void __user *src,
				     void *opaque)
{
#ifdef CONFIG_HAVE_KVM_GMEM_INITIALIZE
	return kvm_arch_gmem_initialize(kvm, folio, index, src, opaque);
#else
	unsigned long nr_pages = folio_nr_pages(folio);
	unsigned long i;

	if (WARN_ON_ONCE(src))
		return -EIO;

	for (i = 0; i < nr_pages; i++)
		clear_highpage(folio_file_page(folio, index + i));
#endif
	return 0;
}


static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
{
	return gfn - slot->base_gfn + slot->gmem.pgoff;
}


static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
{
	/*
	 * Do not return slot->gmem.file if it has already been closed;
	 * there might be some time between the last fput() and when
	 * kvm_gmem_release() clears slot->gmem.file, and you do not
	 * want to spin in the meanwhile.
	 */
	return get_file_active(&slot->gmem.file);
}

static struct folio *__kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
	fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
	struct folio *folio;

	/*
	 * The caller is responsible for managing the up-to-date flag (or not),
	 * as the memory doesn't need to be initialized until it's actually
	 * mapped into the guest.  Waiting to initialize memory is necessary
	 * for VM types where the memory can be initialized exactly once.
	 *
	 * Ignore accessed, referenced, and dirty flags.  The memory is
	 * unevictable and there is no storage to write back to.
	 *
	 * TODO: Support huge pages.
	 */
	folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags,
				    mapping_gfp_mask(inode->i_mapping));

	if (IS_ERR(folio))
		return folio;

	if (folio_test_hwpoison(folio)) {
		folio_unlock(folio);
		return ERR_PTR(-EHWPOISON);
	}
	return folio;
}

static struct folio *kvm_gmem_get_folio(struct file *file,
					struct kvm_memory_slot *slot,
					gfn_t gfn)
{
	pgoff_t index = kvm_gmem_get_index(slot, gfn);
	struct kvm_gmem *gmem = file->private_data;
	struct inode *inode;

	if (file != slot->gmem.file) {
		WARN_ON_ONCE(slot->gmem.file);
		return ERR_PTR(-EFAULT);
	}

	gmem = file->private_data;
	if (xa_load(&gmem->bindings, index) != slot) {
		WARN_ON_ONCE(xa_load(&gmem->bindings, index));
		return ERR_PTR(-EIO);
	}

	inode = file_inode(file);

	/*
	 * The caller is responsible for managing the up-to-date flag (or not),
	 * as the memory doesn't need to be initialized until it's actually
	 * mapped into the guest.  Waiting to initialize memory is necessary
	 * for VM types where the memory can be initialized exactly once.
	 *
	 * Ignore accessed, referenced, and dirty flags.  The memory is
	 * unevictable and there is no storage to write back to.
	 *
	 * TODO: Support huge pages.
	 */
	return __kvm_gmem_get_folio(inode, index);
}

int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
		     gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
{
	pgoff_t index = kvm_gmem_get_index(slot, gfn);
	struct file *file = kvm_gmem_get_file(slot);
	struct folio *folio;
	struct page *page;

	if (!file)
		return -EFAULT;

	folio = kvm_gmem_get_folio(file, slot, gfn);
	if (IS_ERR(folio))
		goto out;

	if (!folio_test_uptodate(folio)) {
		kvm_gmem_initialize_folio(kvm, folio, index, NULL, NULL);
		folio_mark_uptodate(folio);
	}

	page = folio_file_page(folio, index);

	*pfn = page_to_pfn(page);
	if (max_order)
		*max_order = 0;

out:
	fput(file);
	return IS_ERR(folio) ? PTR_ERR(folio) : 0;
}
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);

long kvm_gmem_populate(struct kvm *kvm, gfn_t base_gfn, void __user *base_src,
		       long npages, void *opaque)
{
	struct kvm_memory_slot *slot;
	struct file *file;
	int ret = 0, max_order;
	long i;

	lockdep_assert_held(&kvm->slots_lock);
	if (npages < 0)
		return -EINVAL;

	slot = gfn_to_memslot(kvm, base_gfn);
	if (!kvm_slot_can_be_private(slot))
		return -EINVAL;

	file = kvm_gmem_get_file(slot);
	if (!file)
		return -EFAULT;

	filemap_invalidate_lock(file->f_mapping);

	npages = min_t(ulong, slot->npages - (base_gfn - slot->base_gfn), npages);
	for (i = 0; i < npages; i += (1 << max_order)) {
		void __user *src = base_src + i * PAGE_SIZE;
		gfn_t gfn = base_gfn + i;
		pgoff_t index = kvm_gmem_get_index(slot, gfn);
		struct folio *folio;

		folio = kvm_gmem_get_folio(file, slot, gfn);
		if (IS_ERR(folio)) {
			ret = PTR_ERR(folio);
			break;
		}

		if (folio_test_uptodate(folio)) {
			folio_put(folio);
			ret = -EEXIST;
			break;
		}

		kvm_gmem_initialize_folio(kvm, folio, index, src, opaque);
		folio_unlock(folio);
		folio_put(folio);
	}

	filemap_invalidate_unlock(file->f_mapping);

	fput(file);
	return ret && !i ? ret : i;
}
EXPORT_SYMBOL_GPL(kvm_gmem_populate);

static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
				      pgoff_t end)
{
	bool flush = false, found_memslot = false;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = gmem->kvm;
	unsigned long index;

	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
		pgoff_t pgoff = slot->gmem.pgoff;

		struct kvm_gfn_range gfn_range = {
			.start = slot->base_gfn + max(pgoff, start) - pgoff,
			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
			.slot = slot,
			.may_block = true,
		};

		if (!found_memslot) {
			found_memslot = true;

			KVM_MMU_LOCK(kvm);
			kvm_mmu_invalidate_begin(kvm);
		}

		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
	}

	if (flush)
		kvm_flush_remote_tlbs(kvm);

	if (found_memslot)
		KVM_MMU_UNLOCK(kvm);
}

static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
				    pgoff_t end)
{
	struct kvm *kvm = gmem->kvm;

	if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
		KVM_MMU_LOCK(kvm);
		kvm_mmu_invalidate_end(kvm);
		KVM_MMU_UNLOCK(kvm);
	}
}

static long __kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	struct list_head *gmem_list = &inode->i_mapping->i_private_list;
	pgoff_t start = offset >> PAGE_SHIFT;
	pgoff_t end = (offset + len) >> PAGE_SHIFT;
	struct kvm_gmem *gmem;

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_begin(gmem, start, end);

	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_end(gmem, start, end);

	return 0;
}

static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	int r;

	/*
	 * Bindings must be stable across invalidation to ensure the start+end
	 * are balanced.
	 */
	filemap_invalidate_lock(inode->i_mapping);
	r = __kvm_gmem_punch_hole(inode, offset, len);
	filemap_invalidate_unlock(inode->i_mapping);
	return r;
}

static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t start, index, end;
	int r;

	/* Dedicated guest is immutable by default. */
	if (offset + len > i_size_read(inode))
		return -EINVAL;

	filemap_invalidate_lock_shared(mapping);

	start = offset >> PAGE_SHIFT;
	end = (offset + len) >> PAGE_SHIFT;

	r = 0;
	for (index = start; index < end; ) {
		struct folio *folio;

		if (signal_pending(current)) {
			r = -EINTR;
			break;
		}

		folio = __kvm_gmem_get_folio(inode, index);
		if (IS_ERR(folio)) {
			r = PTR_ERR(folio);
			break;
		}

		index = folio_next_index(folio);

		folio_unlock(folio);
		folio_put(folio);

		/* 64-bit only, wrapping the index should be impossible. */
		if (WARN_ON_ONCE(!index))
			break;

		cond_resched();
	}

	filemap_invalidate_unlock_shared(mapping);

	return r;
}

static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
			       loff_t len)
{
	int ret;

	if (!(mode & FALLOC_FL_KEEP_SIZE))
		return -EOPNOTSUPP;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
		return -EINVAL;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
	else
		ret = kvm_gmem_allocate(file_inode(file), offset, len);

	if (!ret)
		file_modified(file);
	return ret;
}

static int kvm_gmem_release(struct inode *inode, struct file *file)
{
	struct kvm_gmem *gmem = file->private_data;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = gmem->kvm;
	unsigned long index;

	/*
	 * Prevent concurrent attempts to *unbind* a memslot.  This is the last
	 * reference to the file and thus no new bindings can be created, but
	 * dereferencing the slot for existing bindings needs to be protected
	 * against memslot updates, specifically so that unbind doesn't race
	 * and free the memslot (kvm_gmem_get_file() will return NULL).
	 */
	mutex_lock(&kvm->slots_lock);

	filemap_invalidate_lock(inode->i_mapping);

	xa_for_each(&gmem->bindings, index, slot)
		rcu_assign_pointer(slot->gmem.file, NULL);

	synchronize_rcu();

	/*
	 * All in-flight operations are gone and new bindings can be created.
	 * Zap all SPTEs pointed at by this file.  Do not free the backing
	 * memory, as its lifetime is associated with the inode, not the file.
	 */
	kvm_gmem_invalidate_begin(gmem, 0, -1ul);
	kvm_gmem_invalidate_end(gmem, 0, -1ul);

	list_del(&gmem->entry);

	filemap_invalidate_unlock(inode->i_mapping);

	mutex_unlock(&kvm->slots_lock);

	xa_destroy(&gmem->bindings);
	kfree(gmem);

	kvm_put_kvm(kvm);

	return 0;
}

static struct file_operations kvm_gmem_fops = {
	.open		= generic_file_open,
	.release	= kvm_gmem_release,
	.fallocate	= kvm_gmem_fallocate,
};

void kvm_gmem_init(struct module *module)
{
	kvm_gmem_fops.owner = module;
}

static int kvm_gmem_migrate_folio(struct address_space *mapping,
				  struct folio *dst, struct folio *src,
				  enum migrate_mode mode)
{
	WARN_ON_ONCE(1);
	return -EINVAL;
}

static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
	struct list_head *gmem_list = &mapping->i_private_list;
	struct kvm_gmem *gmem;
	pgoff_t start, end;

	filemap_invalidate_lock_shared(mapping);

	start = folio->index;
	end = start + folio_nr_pages(folio);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_begin(gmem, start, end);

	/*
	 * Do not truncate the range, what action is taken in response to the
	 * error is userspace's decision (assuming the architecture supports
	 * gracefully handling memory errors).  If/when the guest attempts to
	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
	 * at which point KVM can either terminate the VM or propagate the
	 * error to userspace.
	 */

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_end(gmem, start, end);

	filemap_invalidate_unlock_shared(mapping);

	return MF_DELAYED;
}

#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
static void kvm_gmem_free_folio(struct folio *folio)
{
	struct page *page = folio_page(folio, 0);
	kvm_pfn_t pfn = page_to_pfn(page);
	int order = folio_order(folio);

	kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
}
#endif

static const struct address_space_operations kvm_gmem_aops = {
	.dirty_folio = noop_dirty_folio,
	.migrate_folio	= kvm_gmem_migrate_folio,
	.error_remove_folio = kvm_gmem_error_folio,
#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
	.free_folio = kvm_gmem_free_folio,
#endif
};

static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
			    struct kstat *stat, u32 request_mask,
			    unsigned int query_flags)
{
	struct inode *inode = path->dentry->d_inode;

	generic_fillattr(idmap, request_mask, inode, stat);
	return 0;
}

static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			    struct iattr *attr)
{
	return -EINVAL;
}
static const struct inode_operations kvm_gmem_iops = {
	.getattr	= kvm_gmem_getattr,
	.setattr	= kvm_gmem_setattr,
};

static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
	const char *anon_name = "[kvm-gmem]";
	struct kvm_gmem *gmem;
	struct inode *inode;
	struct file *file;
	int fd, err;

	fd = get_unused_fd_flags(0);
	if (fd < 0)
		return fd;

	gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
	if (!gmem) {
		err = -ENOMEM;
		goto err_fd;
	}

	file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
					 O_RDWR, NULL);
	if (IS_ERR(file)) {
		err = PTR_ERR(file);
		goto err_gmem;
	}

	file->f_flags |= O_LARGEFILE;

	inode = file->f_inode;
	WARN_ON(file->f_mapping != inode->i_mapping);

	inode->i_private = (void *)(unsigned long)flags;
	inode->i_op = &kvm_gmem_iops;
	inode->i_mapping->a_ops = &kvm_gmem_aops;
	inode->i_mapping->flags |= AS_INACCESSIBLE;
	inode->i_mode |= S_IFREG;
	inode->i_size = size;
	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
	mapping_set_unmovable(inode->i_mapping);
	/* Unmovable mappings are supposed to be marked unevictable as well. */
	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

	kvm_get_kvm(kvm);
	gmem->kvm = kvm;
	xa_init(&gmem->bindings);
	list_add(&gmem->entry, &inode->i_mapping->i_private_list);

	fd_install(fd, file);
	return fd;

err_gmem:
	kfree(gmem);
err_fd:
	put_unused_fd(fd);
	return err;
}

int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
	loff_t size = args->size;
	u64 flags = args->flags;
	u64 valid_flags = 0;

	if (flags & ~valid_flags)
		return -EINVAL;

	if (size <= 0 || !PAGE_ALIGNED(size))
		return -EINVAL;

	return __kvm_gmem_create(kvm, size, flags);
}

int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
		  unsigned int fd, loff_t offset)
{
	loff_t size = slot->npages << PAGE_SHIFT;
	unsigned long start, end;
	struct kvm_gmem *gmem;
	struct inode *inode;
	struct file *file;
	int r = -EINVAL;

	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));

	file = fget(fd);
	if (!file)
		return -EBADF;

	if (file->f_op != &kvm_gmem_fops)
		goto err;

	gmem = file->private_data;
	if (gmem->kvm != kvm)
		goto err;

	inode = file_inode(file);

	if (offset < 0 || !PAGE_ALIGNED(offset) ||
	    offset + size > i_size_read(inode))
		goto err;

	filemap_invalidate_lock(inode->i_mapping);

	start = offset >> PAGE_SHIFT;
	end = start + slot->npages;

	if (!xa_empty(&gmem->bindings) &&
	    xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
		filemap_invalidate_unlock(inode->i_mapping);
		goto err;
	}

	/*
	 * No synchronize_rcu() needed, any in-flight readers are guaranteed to
	 * see either a NULL file or this new file, no need for them to go
	 * away.
	 */
	rcu_assign_pointer(slot->gmem.file, file);
	slot->gmem.pgoff = start;

	xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
	filemap_invalidate_unlock(inode->i_mapping);

	/*
	 * Drop the reference to the file, even on success.  The file pins KVM,
	 * not the other way 'round.  Active bindings are invalidated if the
	 * file is closed before memslots are destroyed.
	 */
	r = 0;
err:
	fput(file);
	return r;
}

void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
	unsigned long start = slot->gmem.pgoff;
	unsigned long end = start + slot->npages;
	struct kvm_gmem *gmem;
	struct file *file;

	/*
	 * Nothing to do if the underlying file was already closed (or is being
	 * closed right now), kvm_gmem_release() invalidates all bindings.
	 */
	file = kvm_gmem_get_file(slot);
	if (!file)
		return;

	gmem = file->private_data;

	filemap_invalidate_lock(file->f_mapping);
	xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
	rcu_assign_pointer(slot->gmem.file, NULL);
	synchronize_rcu();
	filemap_invalidate_unlock(file->f_mapping);

	fput(file);
}
Isaku Yamahata April 26, 2024, 5:15 p.m. UTC | #13
On Fri, Apr 26, 2024 at 07:44:40AM +0200,
Paolo Bonzini <pbonzini@redhat.com> wrote:

> On Thu, Apr 25, 2024 at 6:51 PM Isaku Yamahata <isaku.yamahata@intel.com> wrote:
> > > AFAIK, unwinding on failure is completely uninteresting, and arguably undesirable,
> > > because undoing LAUNCH_UPDATE or PAGE.ADD will affect the measurement, i.e. there
> > > is no scenario where deleting pages from guest_memfd would allow a restart/resume
> > > of the build process to truly succeed.
> >
> >
> > Just for the record.  With the following twist to kvm_gmem_populate,
> > KVM_TDX_INIT_MEM_REGION can use kvm_gmem_populate().  For those who are curious,
> > I also append the callback implementation at the end.
> 
> Nice, thank you very much. Since TDX does not need
> HAVE_KVM_GMEM_PREPARE, if I get rid of FGP_CREAT_ONLY it will work for
> you, right?

Yes, that's right.
Sean Christopherson June 7, 2024, 11:03 p.m. UTC | #14
SNP folks and/or Paolo, what's the plan for this?  I don't see how what's sitting
in kvm/next can possibly be correct without conditioning population on the folio
being !uptodate.

On Fri, Apr 26, 2024, Sean Christopherson wrote:
> On Fri, Apr 26, 2024, Paolo Bonzini wrote:
> > On Thu, Apr 25, 2024 at 6:00 PM Sean Christopherson <seanjc@google.com> wrote:
> > >
> > > On Thu, Apr 25, 2024, Paolo Bonzini wrote:
> > > > On Thu, Apr 25, 2024 at 3:12 AM Isaku Yamahata <isaku.yamahata@intel.com> wrote:
> > > > > > >   get_user_pages_fast(source addr)
> > > > > > >   read_lock(mmu_lock)
> > > > > > >   kvm_tdp_mmu_get_walk_private_pfn(vcpu, gpa, &pfn);
> > > > > > >   if the page table doesn't map gpa, error.
> > > > > > >   TDH.MEM.PAGE.ADD()
> > > > > > >   TDH.MR.EXTEND()
> > > > > > >   read_unlock(mmu_lock)
> > > > > > >   put_page()
> > > > > >
> > > > > > Hmm, KVM doesn't _need_ to use invalidate_lock to protect against guest_memfd
> > > > > > invalidation, but I also don't see why it would cause problems.
> > > >
> > > > The invalidate_lock is only needed to operate on the guest_memfd, but
> > > > it's a rwsem so there are no risks of lock inversion.
> > > >
> > > > > > I.e. why not
> > > > > > take mmu_lock() in TDX's post_populate() implementation?
> > > > >
> > > > > We can take the lock.  Because we have already populated the GFN of guest_memfd,
> > > > > we need to make kvm_gmem_populate() not pass FGP_CREAT_ONLY.  Otherwise we'll
> > > > > get -EEXIST.
> > > >
> > > > I don't understand why TDH.MEM.PAGE.ADD() cannot be called from the
> > > > post-populate hook. Can the code for TDH.MEM.PAGE.ADD be shared
> > > > between the memory initialization ioctl and the page fault hook in
> > > > kvm_x86_ops?
> > >
> > > Ah, because TDX is required to pre-fault the memory to establish the S-EPT walk,
> > > and pre-faulting means guest_memfd()
> > >
> > > Requiring that guest_memfd not have a page when initializing the guest image
> > > seems wrong, i.e. I don't think we want FGP_CREAT_ONLY.  And not just because I
> > > am a fan of pre-faulting, I think the semantics are bad.
> > 
> > Ok, fair enough. I wanted to do the once-only test in common code but since
> > SEV code checks for the RMP I can remove that. One less headache.
> 
> I definitely don't object to having a check in common code, and I'd be in favor
> of removing the RMP checks if possible, but tracking needs to be something more
> explicit in guest_memfd.
> 
> *sigh*
> 
> I even left behind a TODO for this exact thing, and y'all didn't even wave at it
> as you flew by :-)
> 
>         /*
>          * Use the up-to-date flag to track whether or not the memory has been
>          * zeroed before being handed off to the guest.  There is no backing
>          * storage for the memory, so the folio will remain up-to-date until
>          * it's removed.
>          *
>          * TODO: Skip clearing pages when trusted firmware will do it when  <==========================
>          * assigning memory to the guest.
>          */
>         if (!folio_test_uptodate(folio)) {
>                 unsigned long nr_pages = folio_nr_pages(folio);
>                 unsigned long i;
> 
>                 for (i = 0; i < nr_pages; i++)
>                         clear_highpage(folio_page(folio, i));
> 
>                 folio_mark_uptodate(folio);
>         }
> 
>         if (prepare) {
>                 int r = kvm_gmem_prepare_folio(inode, index, folio);
>                 if (r < 0) {
>                         folio_unlock(folio);
>                         folio_put(folio);
>                         return ERR_PTR(r);
>                 }
>         }
> 
> Compile tested only (and not even fully as I didn't bother defining
> CONFIG_HAVE_KVM_GMEM_INITIALIZE), but I think this is the basic gist.
> 
> 8< --------------------------------
> 
> // SPDX-License-Identifier: GPL-2.0
> #include <linux/backing-dev.h>
> #include <linux/falloc.h>
> #include <linux/kvm_host.h>
> #include <linux/pagemap.h>
> #include <linux/anon_inodes.h>
> 
> #include "kvm_mm.h"
> 
> struct kvm_gmem {
> 	struct kvm *kvm;
> 	struct xarray bindings;
> 	struct list_head entry;
> };
> 
> static int kvm_gmem_initialize_folio(struct kvm *kvm, struct folio *folio,
> 				     pgoff_t index, void __user *src,
> 				     void *opaque)
> {
> #ifdef CONFIG_HAVE_KVM_GMEM_INITIALIZE
> 	return kvm_arch_gmem_initialize(kvm, folio, index, src, opaque);
> #else
> 	unsigned long nr_pages = folio_nr_pages(folio);
> 	unsigned long i;
> 
> 	if (WARN_ON_ONCE(src))
> 		return -EIO;
> 
> 	for (i = 0; i < nr_pages; i++)
> 		clear_highpage(folio_file_page(folio, index + i));
> #endif
> 	return 0;
> }
> 
> 
> static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
> {
> 	return gfn - slot->base_gfn + slot->gmem.pgoff;
> }
> 
> 
> static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
> {
> 	/*
> 	 * Do not return slot->gmem.file if it has already been closed;
> 	 * there might be some time between the last fput() and when
> 	 * kvm_gmem_release() clears slot->gmem.file, and you do not
> 	 * want to spin in the meanwhile.
> 	 */
> 	return get_file_active(&slot->gmem.file);
> }
> 
> static struct folio *__kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
> {
> 	fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
> 	struct folio *folio;
> 
> 	/*
> 	 * The caller is responsible for managing the up-to-date flag (or not),
> 	 * as the memory doesn't need to be initialized until it's actually
> 	 * mapped into the guest.  Waiting to initialize memory is necessary
> 	 * for VM types where the memory can be initialized exactly once.
> 	 *
> 	 * Ignore accessed, referenced, and dirty flags.  The memory is
> 	 * unevictable and there is no storage to write back to.
> 	 *
> 	 * TODO: Support huge pages.
> 	 */
> 	folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags,
> 				    mapping_gfp_mask(inode->i_mapping));
> 
> 	if (folio_test_hwpoison(folio)) {
> 		folio_unlock(folio);
> 		return ERR_PTR(-EHWPOISON);
> 	}
> 	return folio;
> }
> 
> static struct folio *kvm_gmem_get_folio(struct file *file,
> 					struct kvm_memory_slot *slot,
> 					gfn_t gfn)
> {
> 	pgoff_t index = kvm_gmem_get_index(slot, gfn);
> 	struct kvm_gmem *gmem = file->private_data;
> 	struct inode *inode;
> 
> 	if (file != slot->gmem.file) {
> 		WARN_ON_ONCE(slot->gmem.file);
> 		return ERR_PTR(-EFAULT);
> 	}
> 
> 	gmem = file->private_data;
> 	if (xa_load(&gmem->bindings, index) != slot) {
> 		WARN_ON_ONCE(xa_load(&gmem->bindings, index));
> 		return ERR_PTR(-EIO);
> 	}
> 
> 	inode = file_inode(file);
> 
> 	/*
> 	 * The caller is responsible for managing the up-to-date flag (or not),
> 	 * as the memory doesn't need to be initialized until it's actually
> 	 * mapped into the guest.  Waiting to initialize memory is necessary
> 	 * for VM types where the memory can be initialized exactly once.
> 	 *
> 	 * Ignore accessed, referenced, and dirty flags.  The memory is
> 	 * unevictable and there is no storage to write back to.
> 	 *
> 	 * TODO: Support huge pages.
> 	 */
> 	return __kvm_gmem_get_folio(inode, index);
> }
> 
> int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> 		     gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
> {
> 	pgoff_t index = kvm_gmem_get_index(slot, gfn);
> 	struct file *file = kvm_gmem_get_file(slot);
> 	struct folio *folio;
> 	struct page *page;
> 
> 	if (!file)
> 		return -EFAULT;
> 
> 	folio = kvm_gmem_get_folio(file, slot, gfn);
> 	if (IS_ERR(folio))
> 		goto out;
> 
> 	if (!folio_test_uptodate(folio)) {
> 		kvm_gmem_initialize_folio(kvm, folio, index, NULL, NULL);
> 		folio_mark_uptodate(folio);
> 	}
> 
> 	page = folio_file_page(folio, index);
> 
> 	*pfn = page_to_pfn(page);
> 	if (max_order)
> 		*max_order = 0;
> 
> out:
> 	fput(file);
> 	return IS_ERR(folio) ? PTR_ERR(folio) : 0;
> }
> EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
> 
> long kvm_gmem_populate(struct kvm *kvm, gfn_t base_gfn, void __user *base_src,
> 		       long npages, void *opaque)
> {
> 	struct kvm_memory_slot *slot;
> 	struct file *file;
> 	int ret = 0, max_order;
> 	long i;
> 
> 	lockdep_assert_held(&kvm->slots_lock);
> 	if (npages < 0)
> 		return -EINVAL;
> 
> 	slot = gfn_to_memslot(kvm, base_gfn);
> 	if (!kvm_slot_can_be_private(slot))
> 		return -EINVAL;
> 
> 	file = kvm_gmem_get_file(slot);
> 	if (!file)
> 		return -EFAULT;
> 
> 	filemap_invalidate_lock(file->f_mapping);
> 
> 	npages = min_t(ulong, slot->npages - (base_gfn - slot->base_gfn), npages);
> 	for (i = 0; i < npages; i += (1 << max_order)) {
> 		void __user *src = base_src + i * PAGE_SIZE;
> 		gfn_t gfn = base_gfn + i;
> 		pgoff_t index = kvm_gmem_get_index(slot, gfn);
> 		struct folio *folio;
> 
> 		folio = kvm_gmem_get_folio(file, slot, gfn);
> 		if (IS_ERR(folio)) {
> 			ret = PTR_ERR(folio);
> 			break;
> 		}
> 
> 		if (folio_test_uptodate(folio)) {
> 			folio_put(folio);
> 			ret = -EEXIST;
> 			break;
> 		}
> 
> 		kvm_gmem_initialize_folio(kvm, folio, index, src, opaque);
> 		folio_unlock(folio);
> 		folio_put(folio);
> 	}
> 
> 	filemap_invalidate_unlock(file->f_mapping);
> 
> 	fput(file);
> 	return ret && !i ? ret : i;
> }
> EXPORT_SYMBOL_GPL(kvm_gmem_populate);
> 
> static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
> 				      pgoff_t end)
> {
> 	bool flush = false, found_memslot = false;
> 	struct kvm_memory_slot *slot;
> 	struct kvm *kvm = gmem->kvm;
> 	unsigned long index;
> 
> 	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
> 		pgoff_t pgoff = slot->gmem.pgoff;
> 
> 		struct kvm_gfn_range gfn_range = {
> 			.start = slot->base_gfn + max(pgoff, start) - pgoff,
> 			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
> 			.slot = slot,
> 			.may_block = true,
> 		};
> 
> 		if (!found_memslot) {
> 			found_memslot = true;
> 
> 			KVM_MMU_LOCK(kvm);
> 			kvm_mmu_invalidate_begin(kvm);
> 		}
> 
> 		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
> 	}
> 
> 	if (flush)
> 		kvm_flush_remote_tlbs(kvm);
> 
> 	if (found_memslot)
> 		KVM_MMU_UNLOCK(kvm);
> }
> 
> static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
> 				    pgoff_t end)
> {
> 	struct kvm *kvm = gmem->kvm;
> 
> 	if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
> 		KVM_MMU_LOCK(kvm);
> 		kvm_mmu_invalidate_end(kvm);
> 		KVM_MMU_UNLOCK(kvm);
> 	}
> }
> 
> static long __kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> {
> 	struct list_head *gmem_list = &inode->i_mapping->i_private_list;
> 	pgoff_t start = offset >> PAGE_SHIFT;
> 	pgoff_t end = (offset + len) >> PAGE_SHIFT;
> 	struct kvm_gmem *gmem;
> 	list_for_each_entry(gmem, gmem_list, entry)
> 		kvm_gmem_invalidate_begin(gmem, start, end);
> 
> 	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
> 
> 	list_for_each_entry(gmem, gmem_list, entry)
> 		kvm_gmem_invalidate_end(gmem, start, end);
> 
> 	return 0;
> }
> 
> static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> {
> 	int r;
> 
> 	/*
> 	 * Bindings must be stable across invalidation to ensure the start+end
> 	 * are balanced.
> 	 */
> 	filemap_invalidate_lock(inode->i_mapping);
> 	r = __kvm_gmem_punch_hole(inode, offset, len);
> 	filemap_invalidate_unlock(inode->i_mapping);
> 	return r;
> }
> 
> static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
> {
> 	struct address_space *mapping = inode->i_mapping;
> 	pgoff_t start, index, end;
> 	int r;
> 
> 	/* Dedicated guest is immutable by default. */
> 	if (offset + len > i_size_read(inode))
> 		return -EINVAL;
> 
> 	filemap_invalidate_lock_shared(mapping);
> 
> 	start = offset >> PAGE_SHIFT;
> 	end = (offset + len) >> PAGE_SHIFT;
> 
> 	r = 0;
> 	for (index = start; index < end; ) {
> 		struct folio *folio;
> 
> 		if (signal_pending(current)) {
> 			r = -EINTR;
> 			break;
> 		}
> 
> 		folio = __kvm_gmem_get_folio(inode, index);
> 		if (IS_ERR(folio)) {
> 			r = PTR_ERR(folio);
> 			break;
> 		}
> 
> 		index = folio_next_index(folio);
> 
> 		folio_unlock(folio);
> 		folio_put(folio);
> 
> 		/* 64-bit only, wrapping the index should be impossible. */
> 		if (WARN_ON_ONCE(!index))
> 			break;
> 
> 		cond_resched();
> 	}
> 
> 	filemap_invalidate_unlock_shared(mapping);
> 
> 	return r;
> }
> 
> static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
> 			       loff_t len)
> {
> 	int ret;
> 
> 	if (!(mode & FALLOC_FL_KEEP_SIZE))
> 		return -EOPNOTSUPP;
> 
> 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> 		return -EOPNOTSUPP;
> 
> 	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
> 		return -EINVAL;
> 
> 	if (mode & FALLOC_FL_PUNCH_HOLE)
> 		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
> 	else
> 		ret = kvm_gmem_allocate(file_inode(file), offset, len);
> 
> 	if (!ret)
> 		file_modified(file);
> 	return ret;
> }
> 
> static int kvm_gmem_release(struct inode *inode, struct file *file)
> {
> 	struct kvm_gmem *gmem = file->private_data;
> 	struct kvm_memory_slot *slot;
> 	struct kvm *kvm = gmem->kvm;
> 	unsigned long index;
> 
> 	/*
> 	 * Prevent concurrent attempts to *unbind* a memslot.  This is the last
> 	 * reference to the file and thus no new bindings can be created, but
> 	 * dereferencing the slot for existing bindings needs to be protected
> 	 * against memslot updates, specifically so that unbind doesn't race
> 	 * and free the memslot (kvm_gmem_get_file() will return NULL).
> 	 */
> 	mutex_lock(&kvm->slots_lock);
> 
> 	filemap_invalidate_lock(inode->i_mapping);
> 
> 	xa_for_each(&gmem->bindings, index, slot)
> 		rcu_assign_pointer(slot->gmem.file, NULL);
> 
> 	synchronize_rcu();
> 
> 	/*
> 	 * All in-flight operations are gone and new bindings can be created.
> 	 * Zap all SPTEs pointed at by this file.  Do not free the backing
> 	 * memory, as its lifetime is associated with the inode, not the file.
> 	 */
> 	kvm_gmem_invalidate_begin(gmem, 0, -1ul);
> 	kvm_gmem_invalidate_end(gmem, 0, -1ul);
> 
> 	list_del(&gmem->entry);
> 
> 	filemap_invalidate_unlock(inode->i_mapping);
> 
> 	mutex_unlock(&kvm->slots_lock);
> 
> 	xa_destroy(&gmem->bindings);
> 	kfree(gmem);
> 
> 	kvm_put_kvm(kvm);
> 
> 	return 0;
> }
> 
> static struct file_operations kvm_gmem_fops = {
> 	.open		= generic_file_open,
> 	.release	= kvm_gmem_release,
> 	.fallocate	= kvm_gmem_fallocate,
> };
> 
> void kvm_gmem_init(struct module *module)
> {
> 	kvm_gmem_fops.owner = module;
> }
> 
> static int kvm_gmem_migrate_folio(struct address_space *mapping,
> 				  struct folio *dst, struct folio *src,
> 				  enum migrate_mode mode)
> {
> 	WARN_ON_ONCE(1);
> 	return -EINVAL;
> }
> 
> static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
> {
> 	struct list_head *gmem_list = &mapping->i_private_list;
> 	struct kvm_gmem *gmem;
> 	pgoff_t start, end;
> 
> 	filemap_invalidate_lock_shared(mapping);
> 
> 	start = folio->index;
> 	end = start + folio_nr_pages(folio);
> 
> 	list_for_each_entry(gmem, gmem_list, entry)
> 		kvm_gmem_invalidate_begin(gmem, start, end);
> 
> 	/*
> 	 * Do not truncate the range, what action is taken in response to the
> 	 * error is userspace's decision (assuming the architecture supports
> 	 * gracefully handling memory errors).  If/when the guest attempts to
> 	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
> 	 * at which point KVM can either terminate the VM or propagate the
> 	 * error to userspace.
> 	 */
> 
> 	list_for_each_entry(gmem, gmem_list, entry)
> 		kvm_gmem_invalidate_end(gmem, start, end);
> 
> 	filemap_invalidate_unlock_shared(mapping);
> 
> 	return MF_DELAYED;
> }
> 
> #ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
> static void kvm_gmem_free_folio(struct folio *folio)
> {
> 	struct page *page = folio_page(folio, 0);
> 	kvm_pfn_t pfn = page_to_pfn(page);
> 	int order = folio_order(folio);
> 
> 	kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
> }
> #endif
> 
> static const struct address_space_operations kvm_gmem_aops = {
> 	.dirty_folio = noop_dirty_folio,
> 	.migrate_folio	= kvm_gmem_migrate_folio,
> 	.error_remove_folio = kvm_gmem_error_folio,
> #ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
> 	.free_folio = kvm_gmem_free_folio,
> #endif
> };
> 
> static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
> 			    struct kstat *stat, u32 request_mask,
> 			    unsigned int query_flags)
> {
> 	struct inode *inode = path->dentry->d_inode;
> 
> 	generic_fillattr(idmap, request_mask, inode, stat);
> 	return 0;
> }
> 
> static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
> 			    struct iattr *attr)
> {
> 	return -EINVAL;
> }
> static const struct inode_operations kvm_gmem_iops = {
> 	.getattr	= kvm_gmem_getattr,
> 	.setattr	= kvm_gmem_setattr,
> };
> 
> static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
> {
> 	const char *anon_name = "[kvm-gmem]";
> 	struct kvm_gmem *gmem;
> 	struct inode *inode;
> 	struct file *file;
> 	int fd, err;
> 
> 	fd = get_unused_fd_flags(0);
> 	if (fd < 0)
> 		return fd;
> 
> 	gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
> 	if (!gmem) {
> 		err = -ENOMEM;
> 		goto err_fd;
> 	}
> 
> 	file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
> 					 O_RDWR, NULL);
> 	if (IS_ERR(file)) {
> 		err = PTR_ERR(file);
> 		goto err_gmem;
> 	}
> 
> 	file->f_flags |= O_LARGEFILE;
> 
> 	inode = file->f_inode;
> 	WARN_ON(file->f_mapping != inode->i_mapping);
> 
> 	inode->i_private = (void *)(unsigned long)flags;
> 	inode->i_op = &kvm_gmem_iops;
> 	inode->i_mapping->a_ops = &kvm_gmem_aops;
> 	inode->i_mapping->flags |= AS_INACCESSIBLE;
> 	inode->i_mode |= S_IFREG;
> 	inode->i_size = size;
> 	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
> 	mapping_set_unmovable(inode->i_mapping);
> 	/* Unmovable mappings are supposed to be marked unevictable as well. */
> 	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
> 
> 	kvm_get_kvm(kvm);
> 	gmem->kvm = kvm;
> 	xa_init(&gmem->bindings);
> 	list_add(&gmem->entry, &inode->i_mapping->i_private_list);
> 
> 	fd_install(fd, file);
> 	return fd;
> 
> err_gmem:
> 	kfree(gmem);
> err_fd:
> 	put_unused_fd(fd);
> 	return err;
> }
> 
> int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
> {
> 	loff_t size = args->size;
> 	u64 flags = args->flags;
> 	u64 valid_flags = 0;
> 
> 	if (flags & ~valid_flags)
> 		return -EINVAL;
> 
> 	if (size <= 0 || !PAGE_ALIGNED(size))
> 		return -EINVAL;
> 
> 	return __kvm_gmem_create(kvm, size, flags);
> }
> 
> int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
> 		  unsigned int fd, loff_t offset)
> {
> 	loff_t size = slot->npages << PAGE_SHIFT;
> 	unsigned long start, end;
> 	struct kvm_gmem *gmem;
> 	struct inode *inode;
> 	struct file *file;
> 	int r = -EINVAL;
> 
> 	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
> 
> 	file = fget(fd);
> 	if (!file)
> 		return -EBADF;
> 
> 	if (file->f_op != &kvm_gmem_fops)
> 		goto err;
> 
> 	gmem = file->private_data;
> 	if (gmem->kvm != kvm)
> 		goto err;
> 
> 	inode = file_inode(file);
> 
> 	if (offset < 0 || !PAGE_ALIGNED(offset) ||
> 	    offset + size > i_size_read(inode))
> 		goto err;
> 
> 	filemap_invalidate_lock(inode->i_mapping);
> 
> 	start = offset >> PAGE_SHIFT;
> 	end = start + slot->npages;
> 
> 	if (!xa_empty(&gmem->bindings) &&
> 	    xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
> 		filemap_invalidate_unlock(inode->i_mapping);
> 		goto err;
> 	}
> 
> 	/*
> 	 * No synchronize_rcu() needed, any in-flight readers are guaranteed to
> 	 * see either a NULL file or this new file, no need for them to go
> 	 * away.
> 	 */
> 	rcu_assign_pointer(slot->gmem.file, file);
> 	slot->gmem.pgoff = start;
> 
> 	xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
> 	filemap_invalidate_unlock(inode->i_mapping);
> 
> 	/*
> 	 * Drop the reference to the file, even on success.  The file pins KVM,
> 	 * not the other way 'round.  Active bindings are invalidated if the
> 	 * file is closed before memslots are destroyed.
> 	 */
> 	r = 0;
> err:
> 	fput(file);
> 	return r;
> }
> 
> void kvm_gmem_unbind(struct kvm_memory_slot *slot)
> {
> 	unsigned long start = slot->gmem.pgoff;
> 	unsigned long end = start + slot->npages;
> 	struct kvm_gmem *gmem;
> 	struct file *file;
> 
> 	/*
> 	 * Nothing to do if the underlying file was already closed (or is being
> 	 * closed right now), kvm_gmem_release() invalidates all bindings.
> 	 */
> 	file = kvm_gmem_get_file(slot);
> 	if (!file)
> 		return;
> 
> 	gmem = file->private_data;
> 
> 	filemap_invalidate_lock(file->f_mapping);
> 	xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
> 	rcu_assign_pointer(slot->gmem.file, NULL);
> 	synchronize_rcu();
> 	filemap_invalidate_unlock(file->f_mapping);
> 
> 	fput(file);
> }
>
Paolo Bonzini June 10, 2024, 9:48 p.m. UTC | #15
On Sat, Jun 8, 2024 at 1:03 AM Sean Christopherson <seanjc@google.com> wrote:
> SNP folks and/or Paolo, what's the plan for this?  I don't see how what's sitting
> in kvm/next can possibly be correct without conditioning population on the folio
> being !uptodate.

I don't think I have time to look at it closely until Friday; but
thanks for reminding me.

Paolo
Paolo Bonzini June 10, 2024, 10:26 p.m. UTC | #16
On 6/10/24 23:48, Paolo Bonzini wrote:
> On Sat, Jun 8, 2024 at 1:03 AM Sean Christopherson <seanjc@google.com> wrote:
>> SNP folks and/or Paolo, what's the plan for this?  I don't see how what's sitting
>> in kvm/next can possibly be correct without conditioning population on the folio
>> being !uptodate.
> 
> I don't think I have time to look at it closely until Friday; but
> thanks for reminding me.

Ok, I'm officially confused.  I think I understand what you did in your
suggested code.  Limiting it to the bare minimum (keeping the callback
instead of CONFIG_HAVE_KVM_GMEM_INITIALIZE), it would be something
like what I include at the end of the message.

But the discussion upthread was about whether to do the check for
RMP state in sev.c, or do it in common code using folio_mark_uptodate().
I am not sure what you mean by "cannot possibly be correct", and
whether it's referring to kvm_gmem_populate() in general or the
callback in sev_gmem_post_populate().

The change below looks like just an optimization to me, which
suggests that I'm missing something glaring.

Paolo

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index d4206e53a9c81..a0417ef5b86eb 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -52,37 +52,39 @@ static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct fol
  static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool prepare)
  {
  	struct folio *folio;
+	int r;
  
  	/* TODO: Support huge pages. */
  	folio = filemap_grab_folio(inode->i_mapping, index);
  	if (IS_ERR(folio))
  		return folio;
  
-	/*
-	 * Use the up-to-date flag to track whether or not the memory has been
-	 * zeroed before being handed off to the guest.  There is no backing
-	 * storage for the memory, so the folio will remain up-to-date until
-	 * it's removed.
-	 *
-	 * TODO: Skip clearing pages when trusted firmware will do it when
-	 * assigning memory to the guest.
-	 */
-	if (!folio_test_uptodate(folio)) {
-		unsigned long nr_pages = folio_nr_pages(folio);
-		unsigned long i;
+	if (prepare) {
+		/*
+		 * Use the up-to-date flag to track whether or not the memory has
+		 * been handed off to the guest.  There is no backing storage for
+		 * the memory, so the folio will remain up-to-date until it's
+		 * removed.
+		 *
+		 * Take the occasion of the first prepare operation to clear it.
+		 */
+		if (!folio_test_uptodate(folio)) {
+			unsigned long nr_pages = folio_nr_pages(folio);
+			unsigned long i;
  
-		for (i = 0; i < nr_pages; i++)
-			clear_highpage(folio_page(folio, i));
+			for (i = 0; i < nr_pages; i++)
+				clear_highpage(folio_page(folio, i));
+		}
+
+		r = kvm_gmem_prepare_folio(inode, index, folio);
+		if (r < 0)
+			goto err_unlock_put;
  
  		folio_mark_uptodate(folio);
-	}
-
-	if (prepare) {
-		int r =	kvm_gmem_prepare_folio(inode, index, folio);
-		if (r < 0) {
-			folio_unlock(folio);
-			folio_put(folio);
-			return ERR_PTR(r);
+	} else {
+		if (folio_test_uptodate(folio)) {
+			r = -EEXIST;
+			goto err_unlock_put;
  		}
  	}
  
@@ -91,6 +93,11 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool
  	 * unevictable and there is no storage to write back to.
  	 */
  	return folio;
+
+err_unlock_put:
+	folio_unlock(folio);
+	folio_put(folio);
+	return ERR_PTR(r);
  }
  
  static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
@@ -545,8 +552,15 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot)
  	fput(file);
  }
  
+/* If p_folio is NULL, the folio is cleared, prepared and marked up-to-date
+ * before returning.
+ *
+ * If p_folio is not NULL, this is left to the caller, who must call
+ * folio_mark_uptodate() once the page is ready for use by the guest.
+ */
  static int __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
-		       gfn_t gfn, kvm_pfn_t *pfn, int *max_order, bool prepare)
+		       gfn_t gfn, kvm_pfn_t *pfn, int *max_order,
+		       struct folio **p_folio)
  {
  	pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
  	struct kvm_gmem *gmem = file->private_data;
@@ -565,7 +579,7 @@ static int __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
  		return -EIO;
  	}
  
-	folio = kvm_gmem_get_folio(file_inode(file), index, prepare);
+	folio = kvm_gmem_get_folio(file_inode(file), index, !p_folio);
  	if (IS_ERR(folio))
  		return PTR_ERR(folio);
  
@@ -577,6 +591,8 @@ static int __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
  	page = folio_file_page(folio, index);
  
  	*pfn = page_to_pfn(page);
+	if (p_folio)
+		*p_folio = folio;
  	if (max_order)
  		*max_order = 0;
  
@@ -597,7 +613,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
  	if (!file)
  		return -EFAULT;
  
-	r = __kvm_gmem_get_pfn(file, slot, gfn, pfn, max_order, true);
+	r = __kvm_gmem_get_pfn(file, slot, gfn, pfn, max_order, NULL);
  	fput(file);
  	return r;
  }
@@ -629,10 +645,11 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
  
  	npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
  	for (i = 0; i < npages; i += (1 << max_order)) {
+		struct folio *folio;
  		gfn_t gfn = start_gfn + i;
  		kvm_pfn_t pfn;
  
-		ret = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &max_order, false);
+		ret = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &max_order, &folio);
  		if (ret)
  			break;
  
@@ -642,8 +659,10 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
  
  		p = src ? src + i * PAGE_SIZE : NULL;
  		ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
+		if (!ret)
+			folio_mark_uptodate(folio);
  
-		put_page(pfn_to_page(pfn));
+		folio_put(folio);
  		if (ret)
  			break;
  	}
Sean Christopherson June 10, 2024, 11:40 p.m. UTC | #17
On Tue, Jun 11, 2024, Paolo Bonzini wrote:
> On 6/10/24 23:48, Paolo Bonzini wrote:
> > On Sat, Jun 8, 2024 at 1:03 AM Sean Christopherson <seanjc@google.com> wrote:
> > > SNP folks and/or Paolo, what's the plan for this?  I don't see how what's sitting
> > > in kvm/next can possibly be correct without conditioning population on the folio
> > > being !uptodate.
> > 
> > I don't think I have time to look at it closely until Friday; but
> > thanks for reminding me.
> 
> Ok, I'm officially confused.  I think I understand what you did in your
> suggested code.  Limiting it to the bare minimum (keeping the callback
> instead of CONFIG_HAVE_KVM_GMEM_INITIALIZE) it would be something
> like what I include at the end of the message.
> 
> But the discussion upthread was about whether to do the check for
> RMP state in sev.c, or do it in common code using folio_mark_uptodate().
> I am not sure what you mean by "cannot possibly be correct", and
> whether it's referring to kvm_gmem_populate() in general or the
> callback in sev_gmem_post_populate().

Doing fallocate() before KVM_SEV_SNP_LAUNCH_UPDATE will cause the latter to fail.
That likely works for QEMU, at least for now, but it's unnecessarily restrictive
and IMO incorrect/wrong.

E.g. a more convoluted fallocate() + PUNCH_HOLE + KVM_SEV_SNP_LAUNCH_UPDATE
sequence will work (I think?  AFAICT adding and removing pages directly to/from
the RMP doesn't affect SNP's measurement; only pages that are added via
SNP_LAUNCH_UPDATE affect the measurement).
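
A rough userspace sketch of that sequence, purely illustrative (it assumes an
existing guest_memfd fd and a page-aligned offset/len; the SNP_LAUNCH_UPDATE
ioctl itself is elided since its argument layout isn't shown in this thread):

#define _GNU_SOURCE
#include <err.h>
#include <fcntl.h>

/*
 * Illustration only: allocate the range, then punch it back out so the
 * backing folios are released before SNP_LAUNCH_UPDATE (re)populates them.
 * guest_memfd only accepts fallocate() modes that include KEEP_SIZE.
 */
static void alloc_then_punch(int gmem_fd, off_t offset, off_t len)
{
	if (fallocate(gmem_fd, FALLOC_FL_KEEP_SIZE, offset, len))
		err(1, "fallocate(KEEP_SIZE)");

	if (fallocate(gmem_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      offset, len))
		err(1, "fallocate(PUNCH_HOLE)");

	/* ...KVM_SEV_SNP_LAUNCH_UPDATE on the same GPA range would follow... */
}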

Punting the sanity check to vendor code is also gross and will make it harder to
provide a consistent, unified ABI for all architectures.  E.g. SNP returns -EINVAL
if the page is already assigned, which is quite misleading.

> The change below looks like just an optimization to me, which
> suggests that I'm missing something glaring.

I really dislike @prepare.  There are two paths that should actually initialize
the contents of the folio, and they are mutually exclusive and have meaningfully
different behavior.  Faulting in memory via kvm_gmem_get_pfn() explicitly zeros
the folio _if necessary_, whereas kvm_gmem_populate() initializes the folio with
user-provided data _and_ requires that the folio be !uptodate.

If we fix the above oddity where fallocate() initializes memory, then there's
no need to try and handle the initialization in a common chokepoint as the two
relevant paths will naturally have unique code.

The below is also still suboptimal for TDX, as KVM will zero the memory and then
the TDX module will zero it again on PAGE.AUG.

And I find SNP to be the odd one.  IIUC, the ASP (the artist formerly known as
the PSP) doesn't provide any guarantees about the contents of a page that is
assigned to a guest without bouncing through SNP_LAUNCH_UPDATE.  It'd be nice to
explicitly document that somewhere in the SNP code. E.g. if guest_memfd invokes
a common kvm_gmem_initialize_folio() or whatever, then SNP's implementation can
clearly capture that KVM zeros the page to protect the _host_ data.

> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index d4206e53a9c81..a0417ef5b86eb 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -52,37 +52,39 @@ static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct fol
>  static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool prepare)
>  {
>  	struct folio *folio;
> +	int r;
>  	/* TODO: Support huge pages. */
>  	folio = filemap_grab_folio(inode->i_mapping, index);
>  	if (IS_ERR(folio))
>  		return folio;
> -	/*
> -	 * Use the up-to-date flag to track whether or not the memory has been
> -	 * zeroed before being handed off to the guest.  There is no backing
> -	 * storage for the memory, so the folio will remain up-to-date until
> -	 * it's removed.
> -	 *
> -	 * TODO: Skip clearing pages when trusted firmware will do it when
> -	 * assigning memory to the guest.
> -	 */
> -	if (!folio_test_uptodate(folio)) {
> -		unsigned long nr_pages = folio_nr_pages(folio);
> -		unsigned long i;
> +	if (prepare) {
> +		/*
> +		 * Use the up-to-date flag to track whether or not the memory has
> +		 * been handed off to the guest.  There is no backing storage for
> +		 * the memory, so the folio will remain up-to-date until it's
> +		 * removed.
> +		 *
> +		 * Take the occasion of the first prepare operation to clear it.
> +		 */
> +		if (!folio_test_uptodate(folio)) {
> +			unsigned long nr_pages = folio_nr_pages(folio);
> +			unsigned long i;
> -		for (i = 0; i < nr_pages; i++)
> -			clear_highpage(folio_page(folio, i));
> +			for (i = 0; i < nr_pages; i++)
> +				clear_highpage(folio_page(folio, i));
> +		}
> +
> +		r = kvm_gmem_prepare_folio(inode, index, folio);
> +		if (r < 0)
> +			goto err_unlock_put;
>  		folio_mark_uptodate(folio);
> -	}
> -
> -	if (prepare) {
> -		int r =	kvm_gmem_prepare_folio(inode, index, folio);
> -		if (r < 0) {
> -			folio_unlock(folio);
> -			folio_put(folio);
> -			return ERR_PTR(r);
> +	} else {
> +		if (folio_test_uptodate(folio)) {
> +			r = -EEXIST;
> +			goto err_unlock_put;
>  		}
>  	}
Paolo Bonzini June 11, 2024, 6:09 a.m. UTC | #18
On Tue, Jun 11, 2024 at 1:41 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Tue, Jun 11, 2024, Paolo Bonzini wrote:
> > On 6/10/24 23:48, Paolo Bonzini wrote:
> > > On Sat, Jun 8, 2024 at 1:03 AM Sean Christopherson <seanjc@google.com> wrote:
> > > > SNP folks and/or Paolo, what's the plan for this?  I don't see how what's sitting
> > > > in kvm/next can possibly be correct without conditioning population on the folio
> > > > being !uptodate.
> > >
> > > I don't think I have time to look at it closely until Friday; but
> > > thanks for reminding me.
> >
> > Ok, I'm officially confused.  I think I understand what you did in your
> > suggested code.  Limiting it to the bare minimum (keeping the callback
> > instead of CONFIG_HAVE_KVM_GMEM_INITIALIZE) it would be something
> > like what I include at the end of the message.
> >
> > But the discussion upthread was about whether to do the check for
> > RMP state in sev.c, or do it in common code using folio_mark_uptodate().
> > I am not sure what you mean by "cannot possibly be correct", and
> > whether it's referring to kvm_gmem_populate() in general or the
> > callback in sev_gmem_post_populate().
>
> Doing fallocate() before KVM_SEV_SNP_LAUNCH_UPDATE will cause the latter to fail.
> That likely works for QEMU, at least for now, but it's unnecessarily restrictive
> and IMO incorrect/wrong.

Ok, I interpreted "incorrect" as meaning it caused incorrect initialization
or something similarly fatal.  A check that is too restrictive can (almost)
always be lifted later.

> E.g. a more convoluted, fallocate() + PUNCH_HOLE + KVM_SEV_SNP_LAUNCH_UPDATE will
> work (I think?  AFAICT adding and removing pages directly to/from the RMP doesn't
> affect SNP's measurement, only pages that are added via SNP_LAUNCH_UPDATE affect
> the measurement).

So the starting point is writing testcases (for which indeed I have to
wait until Friday).  It's not exactly a rewrite but almost.

> Punting the sanity check to vendor code is also gross and will make it harder to
> provide a consistent, unified ABI for all architectures.  E.g. SNP returns -EINVAL
> if the page is already assigned, which is quite misleading.
>
> > The change below looks like just an optimization to me, which
> > suggests that I'm missing something glaring.
>
> I really dislike @prepare.  There are two paths that should actually initialize
> the contents of the folio, and they are mutually exclusive and have meaningfully
> different behavior.  Faulting in memory via kvm_gmem_get_pfn() explicitly zeros
> the folio _if necessary_, whereas kvm_gmem_populate() initializes the folio with
> user-provided data _and_ requires that the folio be !uptodate.

No complaints there, I just wanted to start with the minimal change to
use the uptodate flag in kvm_gmem_populate(). And yeah,
kvm_gmem_get_folio() at this point can be basically replaced by
filemap_grab_folio() in the kvm_gmem_populate() path. What I need to
think about is that there is still quite a bit of code in
__kvm_gmem_get_pfn() that is common to both paths.

Paolo

Patch

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 33ed3b884a6b..97d57ec59789 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2450,4 +2450,30 @@  int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord
 bool kvm_arch_gmem_prepare_needed(struct kvm *kvm);
 #endif
 
+/**
+ * kvm_gmem_populate() - Populate/prepare a GPA range with guest data
+ *
+ * @kvm: KVM instance
+ * @gfn: starting GFN to be populated
+ * @src: userspace-provided buffer containing data to copy into GFN range
+ *       (passed to @post_populate, and incremented on each iteration
+ *       if not NULL)
+ * @npages: number of pages to copy from the userspace buffer
+ * @post_populate: callback to issue for each gmem page that backs the GPA
+ *                 range
+ * @opaque: opaque data to pass to @post_populate callback
+ *
+ * This is primarily intended for cases where a gmem-backed GPA range needs
+ * to be initialized with userspace-provided data prior to being mapped into
+ * the guest as a private page. This should be called with kvm->slots_lock
+ * held so that caller-enforced invariants regarding the expected memory
+ * attributes of the GPA range do not race with KVM_SET_MEMORY_ATTRIBUTES.
+ *
+ * Returns the number of pages that were populated.
+ */
+long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
+		       int (*post_populate)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
+					    void __user *src, int order, void *opaque),
+		       void *opaque);
+
 #endif
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 51c99667690a..e7de97382a67 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -602,3 +602,81 @@  int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
+
+static int kvm_gmem_undo_get_pfn(struct file *file, struct kvm_memory_slot *slot,
+				 gfn_t gfn, int order)
+{
+	pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
+	struct kvm_gmem *gmem = file->private_data;
+
+	/*
+	 * Races with kvm_gmem_unbind() must have been detected by
+	 * __kvm_gmem_get_pfn(), because the invalidate_lock is
+	 * held across __kvm_gmem_get_pfn() and kvm_gmem_undo_get_pfn().
+	 */
+	if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot))
+		return -EIO;
+
+	return __kvm_gmem_punch_hole(file_inode(file), index << PAGE_SHIFT, PAGE_SIZE << order);
+}
+
+long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
+		       int (*post_populate)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
+					    void __user *src, int order, void *opaque),
+		       void *opaque)
+{
+	struct file *file;
+	struct kvm_memory_slot *slot;
+
+	int ret = 0, max_order;
+	long i;
+
+	lockdep_assert_held(&kvm->slots_lock);
+	if (npages < 0)
+		return -EINVAL;
+
+	slot = gfn_to_memslot(kvm, gfn);
+	if (!kvm_slot_can_be_private(slot))
+		return -EINVAL;
+
+	file = kvm_gmem_get_file(slot);
+	if (!file)
+		return -EFAULT;
+
+	filemap_invalidate_lock(file->f_mapping);
+
+	npages = min_t(ulong, slot->npages - (gfn - slot->base_gfn), npages);
+	for (i = 0; i < npages; i += (1 << max_order)) {
+		gfn_t this_gfn = gfn + i;
+		kvm_pfn_t pfn;
+
+		ret = __kvm_gmem_get_pfn(file, slot, this_gfn, &pfn, &max_order, false);
+		if (ret)
+			break;
+
+		if (!IS_ALIGNED(this_gfn, (1 << max_order)) ||
+		    (npages - i) < (1 << max_order))
+			max_order = 0;
+
+		if (post_populate) {
+			void __user *p = src ? src + i * PAGE_SIZE : NULL;
+			ret = post_populate(kvm, this_gfn, pfn, p, max_order, opaque);
+		}
+
+		put_page(pfn_to_page(pfn));
+		if (ret) {
+			/*
+			 * Punch a hole so that FGP_CREAT_ONLY can succeed
+			 * again.
+			 */
+			kvm_gmem_undo_get_pfn(file, slot, this_gfn, max_order);
+			break;
+		}
+	}
+
+	filemap_invalidate_unlock(file->f_mapping);
+
+	fput(file);
+	return ret && !i ? ret : i;
+}
+EXPORT_SYMBOL_GPL(kvm_gmem_populate);
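
For illustration, a hypothetical in-kernel caller of the interface added above
could look like the sketch below. Only the kvm_gmem_populate() prototype and
its slots_lock requirement are taken from the patch; the callback name, the
copy into the page, and the wrapper helper are invented for the example.

#include <linux/highmem.h>
#include <linux/kvm_host.h>
#include <linux/uaccess.h>

/*
 * Hypothetical example, not part of the patch: a vendor-specific
 * post-populate callback that copies userspace data into the gmem page
 * before the architecture code would encrypt/measure it in place.
 */
static int example_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
				 void __user *src, int order, void *opaque)
{
	void *dst = kmap_local_page(pfn_to_page(pfn));
	int ret = 0;

	/* No huge page support yet, so order is expected to be 0 (one page). */
	if (src && copy_from_user(dst, src, PAGE_SIZE))
		ret = -EFAULT;

	kunmap_local(dst);
	return ret;
}

static long example_populate_range(struct kvm *kvm, gfn_t gfn,
				   void __user *src, long npages)
{
	long ret;

	/* kvm_gmem_populate() asserts that kvm->slots_lock is held. */
	mutex_lock(&kvm->slots_lock);
	ret = kvm_gmem_populate(kvm, gfn, src, npages,
				example_post_populate, NULL);
	mutex_unlock(&kvm->slots_lock);

	return ret;
}

The copy step is only a stand-in: as discussed upthread, SEV-SNP's real
callback hands the page to the firmware via SNP_LAUNCH_UPDATE, and a TDX
implementation would use TDH.MEM.PAGE.ADD (plus TDH.MR.EXTEND) instead.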