[RFC,14/39] KVM: guest_memfd: hugetlb: initialization and cleanup

Message ID 3fec11d8a007505405eadcf2b3e10ec9051cf6bf.1726009989.git.ackerleytng@google.com (mailing list archive)
State New, archived
Series: 1G page support for guest_memfd

Commit Message

Ackerley Tng Sept. 10, 2024, 11:43 p.m. UTC
First stage of hugetlb support: add initialization and cleanup
routines.

After guest_memfd was massaged to use guest_memfd inodes instead of
anonymous inodes in an earlier patch, the .evict_inode handler can now
be overridden to do hugetlb metadata cleanup.

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 include/uapi/linux/kvm.h |  26 ++++++
 virt/kvm/guest_memfd.c   | 177 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 197 insertions(+), 6 deletions(-)
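For orientation, a minimal userspace sketch of the uAPI this patch adds,
using the flag and size-encoding definitions from include/uapi/linux/kvm.h
below (the vm_fd setup and error handling are elided, and the 2 GiB size
with 1 GiB hugetlb backing is only an example):

        struct kvm_create_guest_memfd args = {
                .size  = 2UL << 30,     /* 2 GiB, must be page aligned */
                .flags = KVM_GUEST_MEMFD_HUGETLB | KVM_GUEST_MEMFD_HUGE_1GB,
        };
        int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);

        if (gmem_fd < 0)
                /* e.g. no 1 GiB hugetlb pages configured on this system */
                return -1;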

Comments

Vishal Annapurve Sept. 20, 2024, 9:17 a.m. UTC | #1
On Wed, Sep 11, 2024 at 1:44 AM Ackerley Tng <ackerleytng@google.com> wrote:
>
> ...
> +}
> +
> +static void kvm_gmem_evict_inode(struct inode *inode)
> +{
> +       u64 flags = (u64)inode->i_private;
> +
> +       if (flags & KVM_GUEST_MEMFD_HUGETLB)
> +               kvm_gmem_hugetlb_teardown(inode);
> +       else
> +               truncate_inode_pages_final(inode->i_mapping);
> +
> +       clear_inode(inode);
> +}
> +
>  static const struct super_operations kvm_gmem_super_operations = {
>         .statfs         = simple_statfs,
> +       .evict_inode    = kvm_gmem_evict_inode,

Ackerley, can we use the free_inode[1] callback to free any special
metadata associated with the inode, instead of overriding evict_inode
in super_operations?

[1] https://elixir.bootlin.com/linux/v6.11/source/include/linux/fs.h#L719
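For reference, a rough sketch of that alternative (illustrative only:
.free_inode is also a super_operations callback, invoked from the RCU
inode-freeing path, so it could realistically take over only the metadata
kfree(); the page-cache truncation, and likely the subpool teardown, would
still have to happen at evict time; kvm_gmem_free_inode is a hypothetical
name):

        static void kvm_gmem_free_inode(struct inode *inode)
        {
                u64 flags = (u64)inode->i_private;

                /* Hypothetical: release per-inode hugetlb metadata at free time */
                if (flags & KVM_GUEST_MEMFD_HUGETLB)
                        kfree(kvm_gmem_hgmem(inode));

                /* Overriding .free_inode means freeing the inode itself here */
                free_inode_nonrcu(inode);
        }

        static const struct super_operations kvm_gmem_super_operations = {
                .statfs         = simple_statfs,
                .evict_inode    = kvm_gmem_evict_inode,  /* truncation stays here */
                .free_inode     = kvm_gmem_free_inode,
        };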

> ...


>
>         if (size <= 0 || !PAGE_ALIGNED(size))
>                 return -EINVAL;
> --
> 2.46.0.598.g6f2099f65c-goog
>

Patch

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 637efc055145..77de7c4432f6 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -13,6 +13,7 @@ 
 #include <linux/compiler.h>
 #include <linux/ioctl.h>
 #include <asm/kvm.h>
+#include <asm-generic/hugetlb_encode.h>
 
 #define KVM_API_VERSION 12
 
@@ -1558,6 +1559,31 @@  struct kvm_memory_attributes {
 
 #define KVM_CREATE_GUEST_MEMFD	_IOWR(KVMIO,  0xd4, struct kvm_create_guest_memfd)
 
+#define KVM_GUEST_MEMFD_HUGETLB (1ULL << 1)
+
+/*
+ * Huge page size encoding when KVM_GUEST_MEMFD_HUGETLB is specified, and a huge
+ * page size other than the default is desired.  See hugetlb_encode.h.  All
+ * known huge page size encodings are provided here.  It is the responsibility
+ * of the application to know which sizes are supported on the running system.
+ * See mmap(2) man page for details.
+ */
+#define KVM_GUEST_MEMFD_HUGE_SHIFT     HUGETLB_FLAG_ENCODE_SHIFT
+#define KVM_GUEST_MEMFD_HUGE_MASK      HUGETLB_FLAG_ENCODE_MASK
+
+#define KVM_GUEST_MEMFD_HUGE_64KB      HUGETLB_FLAG_ENCODE_64KB
+#define KVM_GUEST_MEMFD_HUGE_512KB     HUGETLB_FLAG_ENCODE_512KB
+#define KVM_GUEST_MEMFD_HUGE_1MB       HUGETLB_FLAG_ENCODE_1MB
+#define KVM_GUEST_MEMFD_HUGE_2MB       HUGETLB_FLAG_ENCODE_2MB
+#define KVM_GUEST_MEMFD_HUGE_8MB       HUGETLB_FLAG_ENCODE_8MB
+#define KVM_GUEST_MEMFD_HUGE_16MB      HUGETLB_FLAG_ENCODE_16MB
+#define KVM_GUEST_MEMFD_HUGE_32MB      HUGETLB_FLAG_ENCODE_32MB
+#define KVM_GUEST_MEMFD_HUGE_256MB     HUGETLB_FLAG_ENCODE_256MB
+#define KVM_GUEST_MEMFD_HUGE_512MB     HUGETLB_FLAG_ENCODE_512MB
+#define KVM_GUEST_MEMFD_HUGE_1GB       HUGETLB_FLAG_ENCODE_1GB
+#define KVM_GUEST_MEMFD_HUGE_2GB       HUGETLB_FLAG_ENCODE_2GB
+#define KVM_GUEST_MEMFD_HUGE_16GB      HUGETLB_FLAG_ENCODE_16GB
+
 struct kvm_create_guest_memfd {
 	__u64 size;
 	__u64 flags;
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 5d7fd1f708a6..31e1115273e1 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -3,6 +3,7 @@ 
 #include <linux/mount.h>
 #include <linux/backing-dev.h>
 #include <linux/falloc.h>
+#include <linux/hugetlb.h>
 #include <linux/kvm_host.h>
 #include <linux/pseudo_fs.h>
 #include <linux/pagemap.h>
@@ -18,6 +19,16 @@  struct kvm_gmem {
 	struct list_head entry;
 };
 
+struct kvm_gmem_hugetlb {
+	struct hstate *h;
+	struct hugepage_subpool *spool;
+};
+
+static struct kvm_gmem_hugetlb *kvm_gmem_hgmem(struct inode *inode)
+{
+	return inode->i_mapping->i_private_data;
+}
+
 /**
  * folio_file_pfn - like folio_file_page, but return a pfn.
  * @folio: The folio which contains this index.
@@ -154,6 +165,82 @@  static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
 	}
 }
 
+static inline void kvm_gmem_hugetlb_filemap_remove_folio(struct folio *folio)
+{
+	folio_lock(folio);
+
+	folio_clear_dirty(folio);
+	folio_clear_uptodate(folio);
+	filemap_remove_folio(folio);
+
+	folio_unlock(folio);
+}
+
+/**
+ * Removes folios in range [@lstart, @lend) from page cache/filemap (@mapping),
+ * returning the number of pages freed.
+ */
+static int kvm_gmem_hugetlb_filemap_remove_folios(struct address_space *mapping,
+						  struct hstate *h,
+						  loff_t lstart, loff_t lend)
+{
+	const pgoff_t end = lend >> PAGE_SHIFT;
+	pgoff_t next = lstart >> PAGE_SHIFT;
+	struct folio_batch fbatch;
+	int num_freed = 0;
+
+	folio_batch_init(&fbatch);
+	while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
+		int i;
+		for (i = 0; i < folio_batch_count(&fbatch); ++i) {
+			struct folio *folio;
+			pgoff_t hindex;
+			u32 hash;
+
+			folio = fbatch.folios[i];
+			hindex = folio->index >> huge_page_order(h);
+			hash = hugetlb_fault_mutex_hash(mapping, hindex);
+
+			mutex_lock(&hugetlb_fault_mutex_table[hash]);
+			kvm_gmem_hugetlb_filemap_remove_folio(folio);
+			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+			num_freed++;
+		}
+		folio_batch_release(&fbatch);
+		cond_resched();
+	}
+
+	return num_freed;
+}
+
+/**
+ * Removes folios in range [@lstart, @lend) from page cache of inode, updates
+ * inode metadata and hugetlb reservations.
+ */
+static void kvm_gmem_hugetlb_truncate_folios_range(struct inode *inode,
+						   loff_t lstart, loff_t lend)
+{
+	struct kvm_gmem_hugetlb *hgmem;
+	struct hstate *h;
+	int gbl_reserve;
+	int num_freed;
+
+	hgmem = kvm_gmem_hgmem(inode);
+	h = hgmem->h;
+
+	num_freed = kvm_gmem_hugetlb_filemap_remove_folios(inode->i_mapping,
+							   h, lstart, lend);
+
+	gbl_reserve = hugepage_subpool_put_pages(hgmem->spool, num_freed);
+	hugetlb_acct_memory(h, -gbl_reserve);
+
+	spin_lock(&inode->i_lock);
+	inode->i_blocks -= blocks_per_huge_page(h) * num_freed;
+	spin_unlock(&inode->i_lock);
+}
+
+
 static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 {
 	struct list_head *gmem_list = &inode->i_mapping->i_private_list;
@@ -307,8 +394,33 @@  static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
 	return get_file_active(&slot->gmem.file);
 }
 
+static void kvm_gmem_hugetlb_teardown(struct inode *inode)
+{
+	struct kvm_gmem_hugetlb *hgmem;
+
+	truncate_inode_pages_final_prepare(inode->i_mapping);
+	kvm_gmem_hugetlb_truncate_folios_range(inode, 0, LLONG_MAX);
+
+	hgmem = kvm_gmem_hgmem(inode);
+	hugepage_put_subpool(hgmem->spool);
+	kfree(hgmem);
+}
+
+static void kvm_gmem_evict_inode(struct inode *inode)
+{
+	u64 flags = (u64)inode->i_private;
+
+	if (flags & KVM_GUEST_MEMFD_HUGETLB)
+		kvm_gmem_hugetlb_teardown(inode);
+	else
+		truncate_inode_pages_final(inode->i_mapping);
+
+	clear_inode(inode);
+}
+
 static const struct super_operations kvm_gmem_super_operations = {
 	.statfs		= simple_statfs,
+	.evict_inode	= kvm_gmem_evict_inode,
 };
 
 static int kvm_gmem_init_fs_context(struct fs_context *fc)
@@ -431,6 +543,42 @@  static const struct inode_operations kvm_gmem_iops = {
 	.setattr	= kvm_gmem_setattr,
 };
 
+static int kvm_gmem_hugetlb_setup(struct inode *inode, loff_t size, u64 flags)
+{
+	struct kvm_gmem_hugetlb *hgmem;
+	struct hugepage_subpool *spool;
+	int page_size_log;
+	struct hstate *h;
+	long hpages;
+
+	page_size_log = (flags >> KVM_GUEST_MEMFD_HUGE_SHIFT) & KVM_GUEST_MEMFD_HUGE_MASK;
+	h = hstate_sizelog(page_size_log);
+
+	/* Round up to accommodate size requests that don't align with huge pages */
+	hpages = round_up(size, huge_page_size(h)) >> huge_page_shift(h);
+
+	spool = hugepage_new_subpool(h, hpages, hpages, false);
+	if (!spool)
+		goto err;
+
+	hgmem = kzalloc(sizeof(*hgmem), GFP_KERNEL);
+	if (!hgmem)
+		goto err_subpool;
+
+	inode->i_blkbits = huge_page_shift(h);
+
+	hgmem->h = h;
+	hgmem->spool = spool;
+	inode->i_mapping->i_private_data = hgmem;
+
+	return 0;
+
+err_subpool:
+	kfree(spool);
+err:
+	return -ENOMEM;
+}
+
 static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
 						      loff_t size, u64 flags)
 {
@@ -443,9 +591,13 @@  static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
 		return inode;
 
 	err = security_inode_init_security_anon(inode, &qname, NULL);
-	if (err) {
-		iput(inode);
-		return ERR_PTR(err);
+	if (err)
+		goto out;
+
+	if (flags & KVM_GUEST_MEMFD_HUGETLB) {
+		err = kvm_gmem_hugetlb_setup(inode, size, flags);
+		if (err)
+			goto out;
 	}
 
 	inode->i_private = (void *)(unsigned long)flags;
@@ -459,6 +611,11 @@  static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
 	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
 
 	return inode;
+
+out:
+	iput(inode);
+
+	return ERR_PTR(err);
 }
 
 static struct file *kvm_gmem_inode_create_getfile(void *priv, loff_t size,
@@ -526,14 +683,22 @@  static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 	return err;
 }
 
+#define KVM_GUEST_MEMFD_ALL_FLAGS KVM_GUEST_MEMFD_HUGETLB
+
 int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
 {
 	loff_t size = args->size;
 	u64 flags = args->flags;
-	u64 valid_flags = 0;
 
-	if (flags & ~valid_flags)
-		return -EINVAL;
+	if (flags & KVM_GUEST_MEMFD_HUGETLB) {
+		/* Allow huge page size encoding in flags */
+		if (flags & ~(KVM_GUEST_MEMFD_ALL_FLAGS |
+			      (KVM_GUEST_MEMFD_HUGE_MASK << KVM_GUEST_MEMFD_HUGE_SHIFT)))
+			return -EINVAL;
+	} else {
+		if (flags & ~KVM_GUEST_MEMFD_ALL_FLAGS)
+			return -EINVAL;
+	}
 
 	if (size <= 0 || !PAGE_ALIGNED(size))
 		return -EINVAL;