
[06/11] KVM: guest_memfd: Add hook for initializing memory

Message ID 20240404185034.3184582-7-pbonzini@redhat.com (mailing list archive)
State New, archived
Series KVM: guest_memfd: New hooks and functionality for SEV-SNP and TDX

Commit Message

Paolo Bonzini April 4, 2024, 6:50 p.m. UTC
guest_memfd pages are generally expected to be in some arch-defined
initial state prior to using them for guest memory. For SEV-SNP this
initial state is 'private', or 'guest-owned', and requires additional
operations to move these pages into a 'private' state by updating the
corresponding entries in the RMP table.

Allow for an arch-defined hook to handle updates of this sort, and go
ahead and implement one for x86 so KVM implementations like AMD SVM can
register a kvm_x86_ops callback to handle these updates for SEV-SNP
guests.
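
As an illustration (not part of this patch), the SVM side of such a
callback might look roughly like the sketch below; sev_gmem_prepare()
and rmp_make_private() are placeholder names standing in for whatever
the SNP series actually provides:

	/* Hypothetical sketch only; helper names are illustrative. */
	static int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn,
				    gfn_t gfn, int max_order)
	{
		if (!sev_snp_guest(kvm))
			return 0;

		/* Flip the page to guest-owned in the RMP table. */
		return rmp_make_private(pfn, gfn_to_gpa(gfn), PG_LEVEL_4K,
					to_kvm_svm(kvm)->sev_info.asid, false);
	}

	/* ... and in svm_x86_ops: */
	.gmem_prepare = sev_gmem_prepare,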

The preparation callback is always called when allocating/grabbing
folios via gmem, and it is up to the architecture to keep track of
whether or not the pages are already in the expected state (e.g. the RMP
table in the case of SEV-SNP).

In some cases, it is necessary to defer the preparation of the pages to
handle things like in-place encryption of initial guest memory payloads
before marking these pages as 'private'/'guest-owned'.  Add an argument
(always true for now) to kvm_gmem_get_folio() that allows for the
preparation callback to be bypassed.  To detect possible issues in
the way userspace initializes memory, an unprepared page can only be
added if it is not already present in the filemap.
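
As an illustration (again, not part of this patch), a deferred user
such as an in-place encryption ioctl might do something like the
following sketch; the function and its callers are hypothetical:

	/* Sketch only: populate one 4K page, then prepare it. */
	static int gmem_populate_page(struct kvm *kvm, struct inode *inode,
				      pgoff_t index, gfn_t gfn, const void *src)
	{
		struct folio *folio;
		int r;

		/* Grab a brand-new, unprepared folio (FGP_CREAT_ONLY). */
		folio = kvm_gmem_get_folio(inode, index, false);
		if (IS_ERR_OR_NULL(folio))
			return folio ? PTR_ERR(folio) : -ENOMEM;

		/* Write the initial payload before the page goes private. */
		memcpy(folio_address(folio), src, PAGE_SIZE);

		/* Only now transition the page to private/guest-owned. */
		r = kvm_arch_gmem_prepare(kvm, gfn, folio_pfn(folio), 0);

		folio_unlock(folio);
		folio_put(folio);
		return r;
	}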

Link: https://lore.kernel.org/lkml/ZLqVdvsF11Ddo7Dq@google.com/
Co-developed-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Michael Roth <michael.roth@amd.com>
Message-Id: <20231230172351.574091-5-michael.roth@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm-x86-ops.h |  1 +
 arch/x86/include/asm/kvm_host.h    |  1 +
 arch/x86/kvm/x86.c                 |  6 +++
 include/linux/kvm_host.h           |  5 +++
 virt/kvm/Kconfig                   |  4 ++
 virt/kvm/guest_memfd.c             | 65 ++++++++++++++++++++++++++++--
 6 files changed, 78 insertions(+), 4 deletions(-)

Comments

Xu Yilun April 22, 2024, 10:53 a.m. UTC | #1
On Thu, Apr 04, 2024 at 02:50:28PM -0400, Paolo Bonzini wrote:
> guest_memfd pages are generally expected to be in some arch-defined
> initial state prior to using them for guest memory. For SEV-SNP this
> initial state is 'private', or 'guest-owned', and requires additional
> operations to move these pages into a 'private' state by updating the
> corresponding entries in the RMP table.
> 
> Allow for an arch-defined hook to handle updates of this sort, and go
> ahead and implement one for x86 so KVM implementations like AMD SVM can
> register a kvm_x86_ops callback to handle these updates for SEV-SNP
> guests.
> 
> The preparation callback is always called when allocating/grabbing
> folios via gmem, and it is up to the architecture to keep track of
> whether or not the pages are already in the expected state (e.g. the RMP
> table in the case of SEV-SNP).
> 
> In some cases, it is necessary to defer the preparation of the pages to
> handle things like in-place encryption of initial guest memory payloads
> before marking these pages as 'private'/'guest-owned'.  Add an argument
> (always true for now) to kvm_gmem_get_folio() that allows for the
> preparation callback to be bypassed.  To detect possible issues in

IIUC, we have 2 dedicated flows.
1 kvm_gmem_get_pfn() or kvm_gmem_allocate()
  a. kvm_gmem_get_folio()
  b. gmem_prepare() for RMP

2 in-place encryption or whatever
  a. kvm_gmem_get_folio(FGP_CREAT_ONLY)
  b. in-place encryption
  c. gmem_prepare() for RMP

Could we move gmem_prepare() out of kvm_gmem_get_folio()?  Then we
could have a straightforward flow for each case and wouldn't need an
argument to postpone gmem_prepare().
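
Roughly (sketch only, error handling elided):

	/* flow 1: kvm_gmem_get_pfn() / kvm_gmem_allocate() */
	folio = kvm_gmem_get_folio(inode, index);
	r = kvm_gmem_prepare_folio(inode, index, folio);

	/* flow 2: in-place encryption */
	folio = kvm_gmem_get_folio(inode, index);	/* FGP_CREAT_ONLY */
	/* ... write/encrypt the payload in place ... */
	r = kvm_gmem_prepare_folio(inode, index, folio);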

> the way userspace initializes memory, an unprepared page can only be
> added if it is not already present in the filemap.
> 
> Link: https://lore.kernel.org/lkml/ZLqVdvsF11Ddo7Dq@google.com/
> Co-developed-by: Michael Roth <michael.roth@amd.com>
> Signed-off-by: Michael Roth <michael.roth@amd.com>
> Message-Id: <20231230172351.574091-5-michael.roth@amd.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/include/asm/kvm-x86-ops.h |  1 +
>  arch/x86/include/asm/kvm_host.h    |  1 +
>  arch/x86/kvm/x86.c                 |  6 +++
>  include/linux/kvm_host.h           |  5 +++
>  virt/kvm/Kconfig                   |  4 ++
>  virt/kvm/guest_memfd.c             | 65 ++++++++++++++++++++++++++++--
>  6 files changed, 78 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> index 5187fcf4b610..d26fcad13e36 100644
> --- a/arch/x86/include/asm/kvm-x86-ops.h
> +++ b/arch/x86/include/asm/kvm-x86-ops.h
> @@ -139,6 +139,7 @@ KVM_X86_OP(vcpu_deliver_sipi_vector)
>  KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
>  KVM_X86_OP_OPTIONAL(get_untagged_addr)
>  KVM_X86_OP_OPTIONAL(alloc_apic_backing_page)
> +KVM_X86_OP_OPTIONAL_RET0(gmem_prepare)
>  
>  #undef KVM_X86_OP
>  #undef KVM_X86_OP_OPTIONAL
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 01c69840647e..f101fab0040e 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1809,6 +1809,7 @@ struct kvm_x86_ops {
>  
>  	gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags);
>  	void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu);
> +	int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
>  };
>  
>  struct kvm_x86_nested_ops {
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 2d2619d3eee4..972524ddcfdb 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -13598,6 +13598,12 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
>  }
>  EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
>  
> +#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
> +int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order)
> +{
> +	return static_call(kvm_x86_gmem_prepare)(kvm, pfn, gfn, max_order);
> +}
> +#endif
>  
>  int kvm_spec_ctrl_test_value(u64 value)
>  {
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 48f31dcd318a..33ed3b884a6b 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2445,4 +2445,9 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm,
>  }
>  #endif /* CONFIG_KVM_PRIVATE_MEM */
>  
> +#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
> +int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order);
> +bool kvm_arch_gmem_prepare_needed(struct kvm *kvm);
> +#endif
> +
>  #endif
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index 29b73eedfe74..ca870157b2ed 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -109,3 +109,7 @@ config KVM_GENERIC_PRIVATE_MEM
>         select KVM_GENERIC_MEMORY_ATTRIBUTES
>         select KVM_PRIVATE_MEM
>         bool
> +
> +config HAVE_KVM_GMEM_PREPARE
> +       bool
> +       depends on KVM_PRIVATE_MEM
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index e5b3cd02b651..486748e65f36 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -13,12 +13,60 @@ struct kvm_gmem {
>  	struct list_head entry;
>  };
>  
> -static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
> +#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
> +bool __weak kvm_arch_gmem_prepare_needed(struct kvm *kvm)
> +{
> +	return false;
> +}
> +#endif

In which case would HAVE_KVM_GMEM_PREPARE be selected but
kvm_arch_gmem_prepare_needed() never implemented?  Then all the
gmem_prepare code would actually be dead code.  Maybe we don't need
this weak stub?
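
(For reference, a concrete implementation elsewhere in the series could
presumably be as simple as the sketch below; the KVM_X86_SNP_VM vm_type
is assumed from the rest of the series:)

	bool kvm_arch_gmem_prepare_needed(struct kvm *kvm)
	{
		return kvm->arch.vm_type == KVM_X86_SNP_VM;
	}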

> +
> +static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio)
> +{
> +#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
> +	struct list_head *gmem_list = &inode->i_mapping->i_private_list;
> +	struct kvm_gmem *gmem;
> +
> +	list_for_each_entry(gmem, gmem_list, entry) {
> +		struct kvm_memory_slot *slot;
> +		struct kvm *kvm = gmem->kvm;
> +		struct page *page;
> +		kvm_pfn_t pfn;
> +		gfn_t gfn;
> +		int rc;
> +
> +		if (!kvm_arch_gmem_prepare_needed(kvm))
> +			continue;
> +
> +		slot = xa_load(&gmem->bindings, index);
> +		if (!slot)
> +			continue;
> +
> +		page = folio_file_page(folio, index);
> +		pfn = page_to_pfn(page);
> +		gfn = slot->base_gfn + index - slot->gmem.pgoff;
> +		rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, compound_order(compound_head(page)));
> +		if (rc) {
> +			pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx, error %d.\n",
> +					    index, rc);
> +			return rc;
> +		}
> +	}
> +
> +#endif
> +	return 0;
> +}
> +
> +static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool prepare)
>  {
>  	struct folio *folio;
> +	fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
> +
> +	if (!prepare)
> +		fgp_flags |= FGP_CREAT_ONLY;
>  
>  	/* TODO: Support huge pages. */
> -	folio = filemap_grab_folio(inode->i_mapping, index);
> +	folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags,
> +				    mapping_gfp_mask(inode->i_mapping));
>  	if (IS_ERR_OR_NULL(folio))
>  		return folio;
>  
> @@ -41,6 +89,15 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>  		folio_mark_uptodate(folio);
>  	}
>  
> +	if (prepare) {
> +		int r =	kvm_gmem_prepare_folio(inode, index, folio);
> +		if (r < 0) {
> +			folio_unlock(folio);
> +			folio_put(folio);
> +			return ERR_PTR(r);
> +		}
> +	}
> +

Do we still need to prepare the page if it is hwpoisoned? I see the
hwpoisoned check is outside, in kvm_gmem_get_pfn().

Thanks,
Yilun

>  	/*
>  	 * Ignore accessed, referenced, and dirty flags.  The memory is
>  	 * unevictable and there is no storage to write back to.
> @@ -145,7 +202,7 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
>  			break;
>  		}
>  
> -		folio = kvm_gmem_get_folio(inode, index);
> +		folio = kvm_gmem_get_folio(inode, index, true);
>  		if (IS_ERR_OR_NULL(folio)) {
>  			r = folio ? PTR_ERR(folio) : -ENOMEM;
>  			break;
> @@ -505,7 +562,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>  		goto out_fput;
>  	}
>  
> -	folio = kvm_gmem_get_folio(file_inode(file), index);
> +	folio = kvm_gmem_get_folio(file_inode(file), index, true);
>  	if (!folio) {
>  		r = -ENOMEM;
>  		goto out_fput;
> -- 
> 2.43.0
> 
> 
>
Paolo Bonzini May 7, 2024, 4:17 p.m. UTC | #2
On Mon, Apr 22, 2024 at 12:58 PM Xu Yilun <yilun.xu@linux.intel.com> wrote:
> > In some cases, it is necessary to defer the preparation of the pages to
> > handle things like in-place encryption of initial guest memory payloads
> > before marking these pages as 'private'/'guest-owned'.  Add an argument
> > (always true for now) to kvm_gmem_get_folio() that allows for the
> > preparation callback to be bypassed.  To detect possible issues in
>
> IIUC, we have 2 dedicated flows.
> 1 kvm_gmem_get_pfn() or kvm_gmem_allocate()
>   a. kvm_gmem_get_folio()
>   b. gmem_prepare() for RMP
>
> 2 in-place encryption or whatever
>   a. kvm_gmem_get_folio(FGP_CREAT_ONLY)
>   b. in-place encryption
>   c. gmem_prepare() for RMP
>
> Could we move gmem_prepare() out of kvm_gmem_get_folio()?  Then we
> could have a straightforward flow for each case and wouldn't need an
> argument to postpone gmem_prepare().

There are 3 flows as you note above - kvm_gmem_get_pfn() and
kvm_gmem_allocate() are different paths but they all need to call the
prepare hook. It is a tempting idea to pull kvm_gmem_prepare_folio()
up into the two callers (get_pfn and allocate), but the resulting code
is really ugly due to the folio_unlock/folio_put handling.
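
Roughly, each caller would have to grow something like this sketch, on
top of its own unwinding (out_fput etc.):

	folio = kvm_gmem_get_folio(file_inode(file), index);
	if (IS_ERR_OR_NULL(folio)) {
		r = folio ? PTR_ERR(folio) : -ENOMEM;
		goto out_fput;
	}

	r = kvm_gmem_prepare_folio(file_inode(file), index, folio);
	if (r < 0) {
		folio_unlock(folio);
		folio_put(folio);
		goto out_fput;
	}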

> > -static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
> > +#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
> > +bool __weak kvm_arch_gmem_prepare_needed(struct kvm *kvm)
> > +{
> > +     return false;
> > +}
> > +#endif
>
> In which case would HAVE_KVM_GMEM_PREPARE be selected but
> kvm_arch_gmem_prepare_needed() never implemented?  Then all the
> gmem_prepare code would actually be dead code.  Maybe we don't need
> this weak stub?

It's not needed indeed.

> > +     if (prepare) {
> > +             int r = kvm_gmem_prepare_folio(inode, index, folio);
> > +             if (r < 0) {
> > +                     folio_unlock(folio);
> > +                     folio_put(folio);
> > +                     return ERR_PTR(r);
> > +             }
> > +     }
> > +
>
> Do we still need to prepare the page if it is hwpoisoned? I see the
> hwpoisoned check is outside, in kvm_gmem_get_pfn().

Yep, it can be moved here.
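
Something like this sketch, placed in kvm_gmem_get_folio() right before
the prepare step:

	if (folio_test_hwpoison(folio)) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(-EHWPOISON);
	}

with the corresponding check (and -EHWPOISON return) dropped from
kvm_gmem_get_pfn().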

Paolo

Patch

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 5187fcf4b610..d26fcad13e36 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -139,6 +139,7 @@  KVM_X86_OP(vcpu_deliver_sipi_vector)
 KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
 KVM_X86_OP_OPTIONAL(get_untagged_addr)
 KVM_X86_OP_OPTIONAL(alloc_apic_backing_page)
+KVM_X86_OP_OPTIONAL_RET0(gmem_prepare)
 
 #undef KVM_X86_OP
 #undef KVM_X86_OP_OPTIONAL
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 01c69840647e..f101fab0040e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1809,6 +1809,7 @@  struct kvm_x86_ops {
 
 	gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags);
 	void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu);
+	int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
 };
 
 struct kvm_x86_nested_ops {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2d2619d3eee4..972524ddcfdb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13598,6 +13598,12 @@  bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
 
+#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
+int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order)
+{
+	return static_call(kvm_x86_gmem_prepare)(kvm, pfn, gfn, max_order);
+}
+#endif
 
 int kvm_spec_ctrl_test_value(u64 value)
 {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 48f31dcd318a..33ed3b884a6b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2445,4 +2445,9 @@  static inline int kvm_gmem_get_pfn(struct kvm *kvm,
 }
 #endif /* CONFIG_KVM_PRIVATE_MEM */
 
+#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
+int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order);
+bool kvm_arch_gmem_prepare_needed(struct kvm *kvm);
+#endif
+
 #endif
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 29b73eedfe74..ca870157b2ed 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -109,3 +109,7 @@  config KVM_GENERIC_PRIVATE_MEM
        select KVM_GENERIC_MEMORY_ATTRIBUTES
        select KVM_PRIVATE_MEM
        bool
+
+config HAVE_KVM_GMEM_PREPARE
+       bool
+       depends on KVM_PRIVATE_MEM
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index e5b3cd02b651..486748e65f36 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -13,12 +13,60 @@  struct kvm_gmem {
 	struct list_head entry;
 };
 
-static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
+#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
+bool __weak kvm_arch_gmem_prepare_needed(struct kvm *kvm)
+{
+	return false;
+}
+#endif
+
+static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio)
+{
+#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
+	struct list_head *gmem_list = &inode->i_mapping->i_private_list;
+	struct kvm_gmem *gmem;
+
+	list_for_each_entry(gmem, gmem_list, entry) {
+		struct kvm_memory_slot *slot;
+		struct kvm *kvm = gmem->kvm;
+		struct page *page;
+		kvm_pfn_t pfn;
+		gfn_t gfn;
+		int rc;
+
+		if (!kvm_arch_gmem_prepare_needed(kvm))
+			continue;
+
+		slot = xa_load(&gmem->bindings, index);
+		if (!slot)
+			continue;
+
+		page = folio_file_page(folio, index);
+		pfn = page_to_pfn(page);
+		gfn = slot->base_gfn + index - slot->gmem.pgoff;
+		rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, compound_order(compound_head(page)));
+		if (rc) {
+			pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx, error %d.\n",
+					    index, rc);
+			return rc;
+		}
+	}
+
+#endif
+	return 0;
+}
+
+static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool prepare)
 {
 	struct folio *folio;
+	fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
+
+	if (!prepare)
+		fgp_flags |= FGP_CREAT_ONLY;
 
 	/* TODO: Support huge pages. */
-	folio = filemap_grab_folio(inode->i_mapping, index);
+	folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags,
+				    mapping_gfp_mask(inode->i_mapping));
 	if (IS_ERR_OR_NULL(folio))
 		return folio;
 
@@ -41,6 +89,15 @@  static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
 		folio_mark_uptodate(folio);
 	}
 
+	if (prepare) {
+		int r =	kvm_gmem_prepare_folio(inode, index, folio);
+		if (r < 0) {
+			folio_unlock(folio);
+			folio_put(folio);
+			return ERR_PTR(r);
+		}
+	}
+
 	/*
 	 * Ignore accessed, referenced, and dirty flags.  The memory is
 	 * unevictable and there is no storage to write back to.
@@ -145,7 +202,7 @@  static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
 			break;
 		}
 
-		folio = kvm_gmem_get_folio(inode, index);
+		folio = kvm_gmem_get_folio(inode, index, true);
 		if (IS_ERR_OR_NULL(folio)) {
 			r = folio ? PTR_ERR(folio) : -ENOMEM;
 			break;
@@ -505,7 +562,7 @@  int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 		goto out_fput;
 	}
 
-	folio = kvm_gmem_get_folio(file_inode(file), index);
+	folio = kvm_gmem_get_folio(file_inode(file), index, true);
 	if (!folio) {
 		r = -ENOMEM;
 		goto out_fput;