@@ -68,8 +68,13 @@ static struct file *kvm_gmem_create_file(const char *name, const struct file_ope
}
+#define KVM_GMEM_INODE_SIZE(size) \
+ struct_size_t(struct kvm_gmem_inode, prepared, \
+ DIV_ROUND_UP(size, PAGE_SIZE * BITS_PER_LONG))
+
struct kvm_gmem_inode {
unsigned long flags;
+ unsigned long prepared[];
};
struct kvm_gmem {
@@ -107,18 +112,110 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
return 0;
}
+/*
+ * The bitmap of prepared pages has to be accessed atomically, because
+ * preparation is not protected by any lock. This unfortunately means
+ * that we cannot use regular bitmap operations.
+ *
+ * The logic is a bit simpler for set and test, which operate on a
+ * folio at a time and therefore can assume that the range is naturally
+ * aligned (meaning that either it is smaller than a word, or it does
+ * not include fractions of a word). Punch-hole operations, however,
+ * can cover arbitrary page ranges and have to deal with all the
+ * complexity of partial words.
+ */
+
+static void bitmap_set_atomic_word(unsigned long *p, unsigned long start, unsigned long len)
+{
+ unsigned long mask_to_set =
+ BITMAP_FIRST_WORD_MASK(start) & BITMAP_LAST_WORD_MASK(start + len);
+
+ atomic_long_or(mask_to_set, (atomic_long_t *)p);
+}
+
+static void bitmap_clear_atomic_word(unsigned long *p, unsigned long start, unsigned long len)
+{
+ unsigned long mask_to_clear =
+ BITMAP_FIRST_WORD_MASK(start) & BITMAP_LAST_WORD_MASK(start + len);
+
+ atomic_long_andnot(mask_to_clear, (atomic_long_t *)p);
+}
+
+static bool bitmap_test_allset_word(unsigned long *p, unsigned long start, unsigned long len)
+{
+ unsigned long mask_to_test =
+ BITMAP_FIRST_WORD_MASK(start) & BITMAP_LAST_WORD_MASK(start + len);
+
+ return (*p & mask_to_test) == mask_to_test;
+}
+
static void kvm_gmem_mark_prepared(struct file *file, pgoff_t index, struct folio *folio)
{
- folio_mark_uptodate(folio);
+ struct kvm_gmem_inode *i_gmem = (struct kvm_gmem_inode *)file->f_inode->i_private;
+ unsigned long *p = i_gmem->prepared + BIT_WORD(index);
+ unsigned long npages = folio_nr_pages(folio);
+
+ /* Folios must be naturally aligned */
+ WARN_ON_ONCE(index & (npages - 1));
+ index &= ~(npages - 1);
+
+ /*
+ * Order clearing/preparing the page before updating the bitmap;
+ * pairs with the smp_rmb() in kvm_gmem_is_prepared().
+ */
+ smp_wmb();
+
+ if (npages < BITS_PER_LONG) {
+ bitmap_set_atomic_word(p, index, npages);
+ } else {
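+ /*
+ * A naturally aligned folio of at least BITS_PER_LONG pages covers
+ * whole bitmap words, so plain stores cannot clobber bits that
+ * belong to other folios.
+ */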
+ BUILD_BUG_ON(BITS_PER_LONG != 64);
+ memset64((u64 *)p, ~0, BITS_TO_LONGS(npages));
+ }
}
static void kvm_gmem_mark_range_unprepared(struct inode *inode, pgoff_t index, pgoff_t npages)
{
+ struct kvm_gmem_inode *i_gmem = (struct kvm_gmem_inode *)inode->i_private;
+ unsigned long *p = i_gmem->prepared + BIT_WORD(index);
+
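+ /* Atomically clear a leading partial word if the start is not word-aligned. */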
+ index &= BITS_PER_LONG - 1;
+ if (index) {
+ int first_word_count = min(npages, BITS_PER_LONG - index);
+ bitmap_clear_atomic_word(p, index, first_word_count);
+ npages -= first_word_count;
+ p++;
+ }
+
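+ /*
+ * Words that lie entirely inside the range hold no bits that must
+ * be preserved, so they can be cleared with plain stores.
+ */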
+ if (npages > BITS_PER_LONG) {
+ BUILD_BUG_ON(BITS_PER_LONG != 64);
+ memset64((u64 *)p, 0, BIT_WORD(npages));
+ p += BIT_WORD(npages);
+ npages &= BITS_PER_LONG - 1;
+ }
+
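+ /* Atomically clear the trailing partial word, if any. */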
+ if (npages)
+ bitmap_clear_atomic_word(p, 0, npages);
}
static bool kvm_gmem_is_prepared(struct file *file, pgoff_t index, struct folio *folio)
{
- return folio_test_uptodate(folio);
+ struct kvm_gmem_inode *i_gmem = (struct kvm_gmem_inode *)file->f_inode->i_private;
+ unsigned long *p = i_gmem->prepared + BIT_WORD(index);
+ unsigned long npages = folio_nr_pages(folio);
+ bool ret;
+
+ /* Folios must be naturally aligned */
+ WARN_ON_ONCE(index & (npages - 1));
+ index &= ~(npages - 1);
+
+ if (npages < BITS_PER_LONG) {
+ ret = bitmap_test_allset_word(p, index, npages);
+ } else {
+ for (; npages > 0; npages -= BITS_PER_LONG)
+ if (*p++ != ~0UL)
+ break;
+ ret = (npages == 0);
+ }
+
+ /* Synchronize with kvm_gmem_mark_prepared(). */
+ smp_rmb();
+ return ret;
}
/*
@@ -499,7 +596,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
struct file *file;
int fd, err;
- i_gmem = kvzalloc(sizeof(struct kvm_gmem_inode), GFP_KERNEL);
+ i_gmem = kvzalloc(KVM_GMEM_INODE_SIZE(size), GFP_KERNEL);
if (!i_gmem)
return -ENOMEM;
i_gmem->flags = flags;
With support for large pages in gmem, it may happen that part of the gmem
file is mapped with large pages and part with 4k pages.  For example, if a
conversion happens on a small region within a large page, the large page
has to be smashed into small pages even if it is backed by a large folio.
Each of the small pages then has its own state of preparedness, which makes
it hard to keep using the folio's uptodate flag to track preparedness.

Just switch to a bitmap in the inode's i_private data.  This is a bit
gnarly because ordinary bitmap operations in Linux are not atomic, but
otherwise not too hard.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/guest_memfd.c | 103 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 100 insertions(+), 3 deletions(-)
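
Not part of the patch, but in case it helps review: a minimal userspace
sketch of the word-mask arithmetic and of the head/full-words/tail split
used by kvm_gmem_mark_range_unprepared().  The two mask macros are copied
from include/linux/bitmap.h; clear_word(), clear_range() and main() are
demo-only names, the full-word loop stands in for the memset64() call, and
the atomics are omitted since a single-threaded demo does not need them.

/*
 * Demo of clearing a range of "prepared" bits one bit per page.
 * Assumes a 64-bit host, where unsigned long has BITS_PER_LONG bits.
 */
#include <stdio.h>

#define BITS_PER_LONG			64
#define BIT_WORD(nr)			((nr) / BITS_PER_LONG)
#define BITMAP_FIRST_WORD_MASK(start)	(~0UL << ((start) & (BITS_PER_LONG - 1)))
#define BITMAP_LAST_WORD_MASK(nbits)	(~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))

/* Non-atomic stand-in for bitmap_clear_atomic_word(). */
static void clear_word(unsigned long *p, unsigned long start, unsigned long len)
{
	unsigned long mask = BITMAP_FIRST_WORD_MASK(start) &
			     BITMAP_LAST_WORD_MASK(start + len);

	*p &= ~mask;
}

/* Mirrors the head/full-words/tail split of the punch-hole path. */
static void clear_range(unsigned long *bitmap, unsigned long index, unsigned long npages)
{
	unsigned long *p = bitmap + BIT_WORD(index);

	/* Leading partial word, if the start is not word-aligned. */
	index &= BITS_PER_LONG - 1;
	if (index) {
		unsigned long head = npages < BITS_PER_LONG - index ?
				     npages : BITS_PER_LONG - index;
		clear_word(p, index, head);
		npages -= head;
		p++;
	}

	/* Words fully inside the range (memset64() in the patch). */
	while (npages > BITS_PER_LONG) {
		*p++ = 0;
		npages -= BITS_PER_LONG;
	}

	/* Trailing (possibly partial) word. */
	if (npages)
		clear_word(p, 0, npages);
}

int main(void)
{
	unsigned long bitmap[4] = { ~0UL, ~0UL, ~0UL, ~0UL };

	/* Punch a hole covering pages 60..199 (140 pages). */
	clear_range(bitmap, 60, 140);

	for (int i = 0; i < 4; i++)
		printf("word %d: %016lx\n", i, bitmap[i]);
	return 0;
}

Running it clears exactly pages 60..199: word 0 ends up as
0fffffffffffffff, words 1 and 2 as 0, and word 3 as ffffffffffffff00,
which is why the memset64() in the real code must only cover the words
that lie entirely inside the range.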