
[RFC,30/39] KVM: guest_memfd: Handle folio preparation for guest_memfd mmap

Message ID 24cf7a9b1ee499c4ca4da76e9945429072014d1e.1726009989.git.ackerleytng@google.com (mailing list archive)
State New
Series 1G page support for guest_memfd

Commit Message

Ackerley Tng Sept. 10, 2024, 11:44 p.m. UTC
Since guest_memfd now supports mmap(), folios have to be prepared
before they are faulted into userspace.

When memory attributes are switched between shared and private, the
up-to-date flag is cleared.

Use the folio's up-to-date flag to indicate that the folio is ready for
guest usage; the same flag marks readiness for either shared or private
use.

Signed-off-by: Ackerley Tng <ackerleytng@google.com>

---
 virt/kvm/guest_memfd.c | 131 ++++++++++++++++++++++++++++++++++++++++-
 virt/kvm/kvm_main.c    |   2 +
 virt/kvm/kvm_mm.h      |   7 +++
 3 files changed, 139 insertions(+), 1 deletion(-)
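
In short, the patch reuses the folio's up-to-date flag as a "prepared"
marker that is valid for both shared and private use, and re-zeroes a
folio the first time it is used after the flag has been cleared. A
condensed sketch of the pattern (simplified from the diff below; the
kvm_gmem_zero_and_prepare() helper is only for illustration and does not
exist in the patch, and hugetlb zeroing, locking and error handling are
omitted):

	static inline void kvm_gmem_mark_prepared(struct folio *folio)
	{
		folio_mark_uptodate(folio);
	}

	static inline void kvm_gmem_clear_prepared(struct folio *folio)
	{
		folio_clear_uptodate(folio);
	}

	/* On first use after the flag is cleared: zero, then mark prepared. */
	static void kvm_gmem_zero_and_prepare(struct folio *folio)
	{
		unsigned long i;

		if (folio_test_uptodate(folio))
			return;

		for (i = 0; i < folio_nr_pages(folio); i++)
			clear_highpage(folio_page(folio, i));

		kvm_gmem_mark_prepared(folio);
	}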

Comments

Elliot Berman Sept. 16, 2024, 8 p.m. UTC | #1
On Tue, Sep 10, 2024 at 11:44:01PM +0000, Ackerley Tng wrote:
> Since guest_memfd now supports mmap(), folios have to be prepared
> before they are faulted into userspace.
> 
> When memory attributes are switched between shared and private, the
> up-to-date flag is cleared.
> 
> Use the folio's up-to-date flag to indicate that the folio is ready for
> guest usage; the same flag marks readiness for either shared or private
> use.

Clearing the up-to-date flag also means that the page gets zeroed out
whenever it transitions between shared and private (in either direction).
The pKVM (Android) hypervisor policy can allow in-place conversion between
shared and private.
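
For reference, with this patch the flow on every attribute flip is
roughly:

	kvm_vm_set_mem_attributes()
	  -> kvm_gmem_clear_prepared_vm()   /* folio loses uptodate */
	next use of the folio
	  -> kvm_gmem_prepare_folio() (guest/private) or kvm_gmem_fault() (mmap)
	  -> folio is not uptodate, so it is zeroed again
	  -> kvm_gmem_mark_prepared()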

I believe the important thing is that sev_gmem_prepare() needs to be
called prior to giving the page to the guest. In my series, I added a
->prepare_inaccessible() callback so that KVM would do only this part.
When transitioning to inaccessible, only that callback would be made,
besides the bookkeeping. The folio is zeroed once, at allocation, if it
is initially accessible (faultable).
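
A rough sketch of that split (hypothetical: only the
->prepare_inaccessible() name comes from the series mentioned above; the
gmem_prepare_ops structure and the gmem_* helpers are made up here for
illustration):

	struct gmem_prepare_ops {
		/* e.g. backed by sev_gmem_prepare() on SNP */
		int (*prepare_inaccessible)(struct kvm *kvm, struct folio *folio,
					    gfn_t gfn);
	};

	/* Zero exactly once, at allocation, if userspace can fault the folio. */
	static void gmem_init_folio(struct folio *folio, bool initially_accessible)
	{
		unsigned long i;

		if (!initially_accessible)
			return;

		for (i = 0; i < folio_nr_pages(folio); i++)
			clear_highpage(folio_page(folio, i));
	}

	/* Shared -> private: no zeroing, contents are preserved in place. */
	static int gmem_make_inaccessible(struct kvm *kvm,
					  const struct gmem_prepare_ops *ops,
					  struct folio *folio, gfn_t gfn)
	{
		return ops->prepare_inaccessible(kvm, folio, gfn);
	}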

From an x86 CoCo perspective, I think it also makes sense not to zero
the folio when changing faultability from private to shared:
 - If the guest is sharing some data with the host, you've wiped the
   data and the guest has to copy it again.
 - Or, if SEV/TDX enforces that the page is zeroed between transitions,
   Linux has duplicated work that the trusted entity has already done.

Fuad and I can help add some details for the conversion. Hopefully we
can figure out some of the plan at Plumbers this week.

Thanks,
Elliot

> 
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> 
> ---
>  virt/kvm/guest_memfd.c | 131 ++++++++++++++++++++++++++++++++++++++++-
>  virt/kvm/kvm_main.c    |   2 +
>  virt/kvm/kvm_mm.h      |   7 +++
>  3 files changed, 139 insertions(+), 1 deletion(-)
> 
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 110c4bbb004b..fb292e542381 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -129,13 +129,29 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
>  }
>  
>  /**
> - * Use the uptodate flag to indicate that the folio is prepared for KVM's usage.
> + * Use folio's up-to-date flag to indicate that this folio is prepared for usage
> + * by the guest.
> + *
> + * This flag can be used whether the folio is prepared for PRIVATE or SHARED
> + * usage.
>   */
>  static inline void kvm_gmem_mark_prepared(struct folio *folio)
>  {
>  	folio_mark_uptodate(folio);
>  }
>  
> +/**
> + * Use folio's up-to-date flag to indicate that this folio is not yet prepared for
> + * usage by the guest.
> + *
> + * This flag can be used whether the folio is prepared for PRIVATE or SHARED
> + * usage.
> + */
> +static inline void kvm_gmem_clear_prepared(struct folio *folio)
> +{
> +	folio_clear_uptodate(folio);
> +}
> +
>  /*
>   * Process @folio, which contains @gfn, so that the guest can use it.
>   * The folio must be locked and the gfn must be contained in @slot.
> @@ -148,6 +164,12 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>  	pgoff_t index;
>  	int r;
>  
> +	/*
> +	 * Defensively zero folio to avoid leaking kernel memory in
> +	 * uninitialized pages. This is important since pages can now be mapped
> +	 * into userspace, where hardware (e.g. TDX) won't be clearing those
> +	 * pages.
> +	 */
>  	if (folio_test_hugetlb(folio)) {
>  		folio_zero_user(folio, folio->index << PAGE_SHIFT);
>  	} else {
> @@ -1017,6 +1039,7 @@ static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
>  {
>  	struct inode *inode;
>  	struct folio *folio;
> +	bool is_prepared;
>  
>  	inode = file_inode(vmf->vma->vm_file);
>  	if (!kvm_gmem_is_faultable(inode, vmf->pgoff))
> @@ -1026,6 +1049,31 @@ static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
>  	if (!folio)
>  		return VM_FAULT_SIGBUS;
>  
> +	is_prepared = folio_test_uptodate(folio);
> +	if (!is_prepared) {
> +		unsigned long nr_pages;
> +		unsigned long i;
> +
> +		if (folio_test_hugetlb(folio)) {
> +			folio_zero_user(folio, folio->index << PAGE_SHIFT);
> +		} else {
> +			/*
> +			 * Defensively zero folio to avoid leaking kernel memory in
> +			 * uninitialized pages. This is important since pages can now be
> +			 * mapped into userspace, where hardware (e.g. TDX) won't be
> +			 * clearing those pages.
> +			 *
> +			 * Will probably need a version of kvm_gmem_prepare_folio() to
> +			 * prepare the page for SHARED use.
> +			 */
> +			nr_pages = folio_nr_pages(folio);
> +			for (i = 0; i < nr_pages; i++)
> +				clear_highpage(folio_page(folio, i));
> +		}
> +
> +		kvm_gmem_mark_prepared(folio);
> +	}
> +
>  	vmf->page = folio_file_page(folio, vmf->pgoff);
>  	return VM_FAULT_LOCKED;
>  }
> @@ -1593,6 +1641,87 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
>  }
>  EXPORT_SYMBOL_GPL(kvm_gmem_populate);
>  
> +static void kvm_gmem_clear_prepared_range(struct inode *inode, pgoff_t start,
> +					  pgoff_t end)
> +{
> +	pgoff_t index;
> +
> +	filemap_invalidate_lock_shared(inode->i_mapping);
> +
> +	/* TODO: replace iteration with filemap_get_folios() for efficiency. */
> +	for (index = start; index < end;) {
> +		struct folio *folio;
> +
> +		/* Don't use kvm_gmem_get_folio to avoid allocating */
> +		folio = filemap_lock_folio(inode->i_mapping, index);
> +		if (IS_ERR(folio)) {
> +			++index;
> +			continue;
> +		}
> +
> +		kvm_gmem_clear_prepared(folio);
> +
> +		index = folio_next_index(folio);
> +		folio_unlock(folio);
> +		folio_put(folio);
> +	}
> +
> +	filemap_invalidate_unlock_shared(inode->i_mapping);
> +}
> +
> +/**
> + * Clear the prepared flag for all folios in gfn range [@start, @end) in memslot
> + * @slot.
> + */
> +static void kvm_gmem_clear_prepared_slot(struct kvm_memory_slot *slot, gfn_t start,
> +					 gfn_t end)
> +{
> +	pgoff_t start_offset;
> +	pgoff_t end_offset;
> +	struct file *file;
> +
> +	file = kvm_gmem_get_file(slot);
> +	if (!file)
> +		return;
> +
> +	start_offset = start - slot->base_gfn + slot->gmem.pgoff;
> +	end_offset = end - slot->base_gfn + slot->gmem.pgoff;
> +
> +	kvm_gmem_clear_prepared_range(file_inode(file), start_offset, end_offset);
> +
> +	fput(file);
> +}
> +
> +/**
> + * Clear the prepared flag for all folios for any slot in gfn range
> + * [@start, @end) in @kvm.
> + */
> +void kvm_gmem_clear_prepared_vm(struct kvm *kvm, gfn_t start, gfn_t end)
> +{
> +	int i;
> +
> +	lockdep_assert_held(&kvm->slots_lock);
> +
> +	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
> +		struct kvm_memslot_iter iter;
> +		struct kvm_memslots *slots;
> +
> +		slots = __kvm_memslots(kvm, i);
> +		kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
> +			struct kvm_memory_slot *slot;
> +			gfn_t gfn_start;
> +			gfn_t gfn_end;
> +
> +			slot = iter.slot;
> +			gfn_start = max(start, slot->base_gfn);
> +			gfn_end = min(end, slot->base_gfn + slot->npages);
> +
> +			if (iter.slot->flags & KVM_MEM_GUEST_MEMFD)
> +				kvm_gmem_clear_prepared_slot(iter.slot, gfn_start, gfn_end);
> +		}
> +	}
> +}
> +
>  /**
>   * Returns true if pages in range [@start, @end) in inode @inode have no
>   * userspace mappings.
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 1a7bbcc31b7e..255d27df7f5c 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2565,6 +2565,8 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
>  		KVM_BUG_ON(r, kvm);
>  	}
>  
> +	kvm_gmem_clear_prepared_vm(kvm, start, end);
> +
>  	kvm_handle_gfn_range(kvm, &post_set_range);
>  
>  out_unlock:
> diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
> index d8ff2b380d0e..25fd0d9f66cc 100644
> --- a/virt/kvm/kvm_mm.h
> +++ b/virt/kvm/kvm_mm.h
> @@ -43,6 +43,7 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
>  void kvm_gmem_unbind(struct kvm_memory_slot *slot);
>  int kvm_gmem_should_set_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
>  				   unsigned long attrs);
> +void kvm_gmem_clear_prepared_vm(struct kvm *kvm, gfn_t start, gfn_t end);
>  #else
>  static inline void kvm_gmem_init(struct module *module)
>  {
> @@ -68,6 +69,12 @@ static inline int kvm_gmem_should_set_attributes(struct kvm *kvm, gfn_t start,
>  	return 0;
>  }
>  
> +static inline void kvm_gmem_clear_prepared_slots(struct kvm *kvm,
> +						 gfn_t start, gfn_t end)
> +{
> +	WARN_ON_ONCE(1);
> +}
> +
>  #endif /* CONFIG_KVM_PRIVATE_MEM */
>  
>  #endif /* __KVM_MM_H__ */
> -- 
> 2.46.0.598.g6f2099f65c-goog
>

Patch

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 110c4bbb004b..fb292e542381 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -129,13 +129,29 @@  static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
 }
 
 /**
- * Use the uptodate flag to indicate that the folio is prepared for KVM's usage.
+ * Use folio's up-to-date flag to indicate that this folio is prepared for usage
+ * by the guest.
+ *
+ * This flag can be used whether the folio is prepared for PRIVATE or SHARED
+ * usage.
  */
 static inline void kvm_gmem_mark_prepared(struct folio *folio)
 {
 	folio_mark_uptodate(folio);
 }
 
+/**
+ * Use folio's up-to-date flag to indicate that this folio is not yet prepared for
+ * usage by the guest.
+ *
+ * This flag can be used whether the folio is prepared for PRIVATE or SHARED
+ * usage.
+ */
+static inline void kvm_gmem_clear_prepared(struct folio *folio)
+{
+	folio_clear_uptodate(folio);
+}
+
 /*
  * Process @folio, which contains @gfn, so that the guest can use it.
  * The folio must be locked and the gfn must be contained in @slot.
@@ -148,6 +164,12 @@  static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
 	pgoff_t index;
 	int r;
 
+	/*
+	 * Defensively zero folio to avoid leaking kernel memory in
+	 * uninitialized pages. This is important since pages can now be mapped
+	 * into userspace, where hardware (e.g. TDX) won't be clearing those
+	 * pages.
+	 */
 	if (folio_test_hugetlb(folio)) {
 		folio_zero_user(folio, folio->index << PAGE_SHIFT);
 	} else {
@@ -1017,6 +1039,7 @@  static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
 {
 	struct inode *inode;
 	struct folio *folio;
+	bool is_prepared;
 
 	inode = file_inode(vmf->vma->vm_file);
 	if (!kvm_gmem_is_faultable(inode, vmf->pgoff))
@@ -1026,6 +1049,31 @@  static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
 	if (!folio)
 		return VM_FAULT_SIGBUS;
 
+	is_prepared = folio_test_uptodate(folio);
+	if (!is_prepared) {
+		unsigned long nr_pages;
+		unsigned long i;
+
+		if (folio_test_hugetlb(folio)) {
+			folio_zero_user(folio, folio->index << PAGE_SHIFT);
+		} else {
+			/*
+			 * Defensively zero folio to avoid leaking kernel memory in
+			 * uninitialized pages. This is important since pages can now be
+			 * mapped into userspace, where hardware (e.g. TDX) won't be
+			 * clearing those pages.
+			 *
+			 * Will probably need a version of kvm_gmem_prepare_folio() to
+			 * prepare the page for SHARED use.
+			 */
+			nr_pages = folio_nr_pages(folio);
+			for (i = 0; i < nr_pages; i++)
+				clear_highpage(folio_page(folio, i));
+		}
+
+		kvm_gmem_mark_prepared(folio);
+	}
+
 	vmf->page = folio_file_page(folio, vmf->pgoff);
 	return VM_FAULT_LOCKED;
 }
@@ -1593,6 +1641,87 @@  long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
 }
 EXPORT_SYMBOL_GPL(kvm_gmem_populate);
 
+static void kvm_gmem_clear_prepared_range(struct inode *inode, pgoff_t start,
+					  pgoff_t end)
+{
+	pgoff_t index;
+
+	filemap_invalidate_lock_shared(inode->i_mapping);
+
+	/* TODO: replace iteration with filemap_get_folios() for efficiency. */
+	for (index = start; index < end;) {
+		struct folio *folio;
+
+		/* Don't use kvm_gmem_get_folio to avoid allocating */
+		folio = filemap_lock_folio(inode->i_mapping, index);
+		if (IS_ERR(folio)) {
+			++index;
+			continue;
+		}
+
+		kvm_gmem_clear_prepared(folio);
+
+		index = folio_next_index(folio);
+		folio_unlock(folio);
+		folio_put(folio);
+	}
+
+	filemap_invalidate_unlock_shared(inode->i_mapping);
+}
+
+/**
+ * Clear the prepared flag for all folios in gfn range [@start, @end) in memslot
+ * @slot.
+ */
+static void kvm_gmem_clear_prepared_slot(struct kvm_memory_slot *slot, gfn_t start,
+					 gfn_t end)
+{
+	pgoff_t start_offset;
+	pgoff_t end_offset;
+	struct file *file;
+
+	file = kvm_gmem_get_file(slot);
+	if (!file)
+		return;
+
+	start_offset = start - slot->base_gfn + slot->gmem.pgoff;
+	end_offset = end - slot->base_gfn + slot->gmem.pgoff;
+
+	kvm_gmem_clear_prepared_range(file_inode(file), start_offset, end_offset);
+
+	fput(file);
+}
+
+/**
+ * Clear the prepared flag for all folios for any slot in gfn range
+ * [@start, @end) in @kvm.
+ */
+void kvm_gmem_clear_prepared_vm(struct kvm *kvm, gfn_t start, gfn_t end)
+{
+	int i;
+
+	lockdep_assert_held(&kvm->slots_lock);
+
+	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+		struct kvm_memslot_iter iter;
+		struct kvm_memslots *slots;
+
+		slots = __kvm_memslots(kvm, i);
+		kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
+			struct kvm_memory_slot *slot;
+			gfn_t gfn_start;
+			gfn_t gfn_end;
+
+			slot = iter.slot;
+			gfn_start = max(start, slot->base_gfn);
+			gfn_end = min(end, slot->base_gfn + slot->npages);
+
+			if (iter.slot->flags & KVM_MEM_GUEST_MEMFD)
+				kvm_gmem_clear_prepared_slot(iter.slot, gfn_start, gfn_end);
+		}
+	}
+}
+
 /**
  * Returns true if pages in range [@start, @end) in inode @inode have no
  * userspace mappings.
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1a7bbcc31b7e..255d27df7f5c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2565,6 +2565,8 @@  static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
 		KVM_BUG_ON(r, kvm);
 	}
 
+	kvm_gmem_clear_prepared_vm(kvm, start, end);
+
 	kvm_handle_gfn_range(kvm, &post_set_range);
 
 out_unlock:
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index d8ff2b380d0e..25fd0d9f66cc 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -43,6 +43,7 @@  int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
 void kvm_gmem_unbind(struct kvm_memory_slot *slot);
 int kvm_gmem_should_set_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
 				   unsigned long attrs);
+void kvm_gmem_clear_prepared_vm(struct kvm *kvm, gfn_t start, gfn_t end);
 #else
 static inline void kvm_gmem_init(struct module *module)
 {
@@ -68,6 +69,12 @@  static inline int kvm_gmem_should_set_attributes(struct kvm *kvm, gfn_t start,
 	return 0;
 }
 
+static inline void kvm_gmem_clear_prepared_slots(struct kvm *kvm,
+						 gfn_t start, gfn_t end)
+{
+	WARN_ON_ONCE(1);
+}
+
 #endif /* CONFIG_KVM_PRIVATE_MEM */
 
 #endif /* __KVM_MM_H__ */