@@ -68,8 +68,13 @@ static struct file *kvm_gmem_create_file(const char *name, const struct file_ope
}
+#define KVM_GMEM_INODE_SIZE(size) \
+ struct_size_t(struct kvm_gmem_inode, prepared, \
+ DIV_ROUND_UP(size, PAGE_SIZE * BITS_PER_LONG))
+
struct kvm_gmem_inode {
unsigned long flags;
+ unsigned long prepared[];
};
struct kvm_gmem {
@@ -107,18 +112,110 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
return 0;
}
+/*
+ * The bitmap of prepared pages has to be accessed atomically, because
+ * preparation is not protected by any lock. This unfortunately means
+ * that we cannot use regular bitmap operations.
+ *
+ * The logic is a bit simpler for set and test, which operate on a
+ * folio at a time and therefore can assume that the range is naturally
+ * aligned (meaning that either it is smaller than a word, or it does
+ * not include fractions of a word). Punch-hole operations, however,
+ * can cover arbitrary page ranges and have to deal with all the
+ * complexity of partial words.
+ */
+
+static void bitmap_set_atomic_word(unsigned long *p, unsigned long start, unsigned long len)
+{
+ unsigned long mask_to_set =
+ BITMAP_FIRST_WORD_MASK(start) & BITMAP_LAST_WORD_MASK(start + len);
+
+ atomic_long_or(mask_to_set, (atomic_long_t *)p);
+}
+
+static void bitmap_clear_atomic_word(unsigned long *p, unsigned long start, unsigned long len)
+{
+ unsigned long mask_to_clear =
+ BITMAP_FIRST_WORD_MASK(start) & BITMAP_LAST_WORD_MASK(start + len);
+
+ atomic_long_andnot(mask_to_clear, (atomic_long_t *)p);
+}
+
+static bool bitmap_test_allset_word(unsigned long *p, unsigned long start, unsigned long len)
+{
+ unsigned long mask_to_test =
+ BITMAP_FIRST_WORD_MASK(start) & BITMAP_LAST_WORD_MASK(start + len);
+
+ return (*p & mask_to_test) == mask_to_test;
+}
+
static void kvm_gmem_mark_prepared(struct file *file, pgoff_t index, struct folio *folio)
{
- folio_mark_uptodate(folio);
+ struct kvm_gmem_inode *i_gmem = (struct kvm_gmem_inode *)file->f_inode->i_private;
+ unsigned long *p = i_gmem->prepared + BIT_WORD(index);
+ unsigned long npages = folio_nr_pages(folio);
+
+ /* Folios must be naturally aligned */
+ WARN_ON_ONCE(index & (npages - 1));
+ index &= ~(npages - 1);
+
+ /*
+ * Order clearing/preparing the page before updating the bitmap;
+ * pairs with the smp_rmb() in kvm_gmem_is_prepared().
+ */
+ smp_wmb();
+
+ if (npages < BITS_PER_LONG) {
+ bitmap_set_atomic_word(p, index, npages);
+ } else {
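+ /*
+ * A naturally aligned folio of at least BITS_PER_LONG pages covers
+ * whole bitmap words, so plain stores cannot clobber bits that
+ * belong to other folios.
+ */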
+ BUILD_BUG_ON(BITS_PER_LONG != 64);
+ memset64((u64 *)p, ~0, BITS_TO_LONGS(npages));
+ }
}
static void kvm_gmem_mark_range_unprepared(struct inode *inode, pgoff_t index, pgoff_t npages)
{
+ struct kvm_gmem_inode *i_gmem = (struct kvm_gmem_inode *)inode->i_private;
+ unsigned long *p = i_gmem->prepared + BIT_WORD(index);
+
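+ /* Atomically clear a leading partial word if the start is not word-aligned. */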
+ index &= BITS_PER_LONG - 1;
+ if (index) {
+ int first_word_count = min(npages, BITS_PER_LONG - index);
+ bitmap_clear_atomic_word(p, index, first_word_count);
+ npages -= first_word_count;
+ p++;
+ }
+
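+ /*
+ * Words that lie entirely inside the range hold no bits that must
+ * be preserved, so they can be cleared with plain stores.
+ */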
+ if (npages > BITS_PER_LONG) {
+ BUILD_BUG_ON(BITS_PER_LONG != 64);
+ memset64((u64 *)p, 0, BIT_WORD(npages));
+ p += BIT_WORD(npages);
+ npages &= BITS_PER_LONG - 1;
+ }
+
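+ /* Atomically clear the trailing partial word, if any. */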
+ if (npages)
+ bitmap_clear_atomic_word(p, 0, npages);
}
static bool kvm_gmem_is_prepared(struct file *file, pgoff_t index, struct folio *folio)
{
- return folio_test_uptodate(folio);
+ struct kvm_gmem_inode *i_gmem = (struct kvm_gmem_inode *)file->f_inode->i_private;
+ unsigned long *p = i_gmem->prepared + BIT_WORD(index);
+ unsigned long npages = folio_nr_pages(folio);
+ bool ret;
+
+ /* Folios must be naturally aligned */
+ WARN_ON_ONCE(index & (npages - 1));
+ index &= ~(npages - 1);
+
+ if (npages < BITS_PER_LONG) {
+ ret = bitmap_test_allset_word(p, index, npages);
+ } else {
+ for (; npages > 0; npages -= BITS_PER_LONG)
+ if (*p++ != ~0UL)
+ break;
+ ret = (npages == 0);
+ }
+
+ /* Synchronize with kvm_gmem_mark_prepared(). */
+ smp_rmb();
+ return ret;
}
/*
@@ -499,7 +596,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
struct file *file;
int fd, err;
- i_gmem = kvzalloc(sizeof(struct kvm_gmem_inode), GFP_KERNEL);
+ i_gmem = kvzalloc(KVM_GMEM_INODE_SIZE(size), GFP_KERNEL);
if (!i_gmem)
return -ENOMEM;
i_gmem->flags = flags;
With support for large pages in gmem, it may happen that part of the gmem
file is mapped with large pages and part with 4k pages.  For example, if a
conversion happens on a small region within a large page, the large page
has to be smashed into small pages even if it is backed by a large folio.
Each of the small pages then has its own state of preparedness, which makes
it hard to keep using the folio's uptodate flag to track preparedness.

Just switch to a bitmap in the inode's i_private data.  This is a bit
gnarly because ordinary bitmap operations in Linux are not atomic, but
otherwise not too hard.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/guest_memfd.c | 103 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 100 insertions(+), 3 deletions(-)
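
Not part of the patch, but in case it helps review: a minimal userspace
sketch of the word-mask arithmetic and of the head/full-words/tail split
used by kvm_gmem_mark_range_unprepared().  The two mask macros are copied
from include/linux/bitmap.h; clear_word(), clear_range() and main() are
demo-only names, the full-word loop stands in for the memset64() call, and
the atomics are omitted since a single-threaded demo does not need them.

/*
 * Demo of clearing a range of "prepared" bits one bit per page.
 * Assumes a 64-bit host, where unsigned long has BITS_PER_LONG bits.
 */
#include <stdio.h>

#define BITS_PER_LONG			64
#define BIT_WORD(nr)			((nr) / BITS_PER_LONG)
#define BITMAP_FIRST_WORD_MASK(start)	(~0UL << ((start) & (BITS_PER_LONG - 1)))
#define BITMAP_LAST_WORD_MASK(nbits)	(~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))

/* Non-atomic stand-in for bitmap_clear_atomic_word(). */
static void clear_word(unsigned long *p, unsigned long start, unsigned long len)
{
	unsigned long mask = BITMAP_FIRST_WORD_MASK(start) &
			     BITMAP_LAST_WORD_MASK(start + len);

	*p &= ~mask;
}

/* Mirrors the head/full-words/tail split of the punch-hole path. */
static void clear_range(unsigned long *bitmap, unsigned long index, unsigned long npages)
{
	unsigned long *p = bitmap + BIT_WORD(index);

	/* Leading partial word, if the start is not word-aligned. */
	index &= BITS_PER_LONG - 1;
	if (index) {
		unsigned long head = npages < BITS_PER_LONG - index ?
				     npages : BITS_PER_LONG - index;
		clear_word(p, index, head);
		npages -= head;
		p++;
	}

	/* Words fully inside the range (memset64() in the patch). */
	while (npages > BITS_PER_LONG) {
		*p++ = 0;
		npages -= BITS_PER_LONG;
	}

	/* Trailing (possibly partial) word. */
	if (npages)
		clear_word(p, 0, npages);
}

int main(void)
{
	unsigned long bitmap[4] = { ~0UL, ~0UL, ~0UL, ~0UL };

	/* Punch a hole covering pages 60..199 (140 pages). */
	clear_range(bitmap, 60, 140);

	for (int i = 0; i < 4; i++)
		printf("word %d: %016lx\n", i, bitmap[i]);
	return 0;
}

Running it clears exactly pages 60..199: word 0 ends up as
0fffffffffffffff, words 1 and 2 as 0, and word 3 as ffffffffffffff00,
which is why the memset64() in the real code must only cover the words
that lie entirely inside the range.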