@@ -60,10 +60,37 @@ static bool kvm_gmem_test_accessible(struct kvm *kvm)
return kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM;
}
+/*
+ * Allocate the "sharing refcount" for @folio, initialise it to 1 and store
+ * it in the folio's private pointer.
+ *
+ * Returns 0 on success (also when a counter is already attached) and
+ * -ENOMEM if the counter cannot be allocated.
+ */
+static int kvm_gmem_init_sharing_count(struct folio *folio)
+{
+ refcount_t *sharing_count;
+
+ /*
+ * Keep a counter that is already attached: overwriting the private
+ * pointer would leak the old allocation and forget every sharing
+ * reference counted in it.
+ * NOTE(review): assumes the private pointer is NULL on folios that
+ * never had a counter attached - confirm.
+ */
+ if (folio_get_private(folio))
+ return 0;
+
+ sharing_count = kmalloc(sizeof(*sharing_count), GFP_KERNEL);
+ if (!sharing_count)
+ return -ENOMEM;
+
+ /*
+ * We need to use sharing_count == 1 to mean "no sharing", because
+ * dropping a refcount_t to 0 and later incrementing it again would
+ * result in a WARN.
+ */
+ refcount_set(sharing_count, 1);
+ folio_change_private(folio, sharing_count);
+
+ return 0;
+}
+
static int kvm_gmem_folio_set_private(struct folio *folio)
{
unsigned long start, npages, i;
int r;
+ unsigned int sharing_refcount = refcount_read(folio_get_private(folio));
+
+ /*
+ * We must only remove direct map entries after the last internal
+ * reference has gone away, e.g. after the refcount dropped back
+ * to 1.
+ */
+ WARN_ONCE(sharing_refcount != 1, "%d unexpected sharing_refcounts pfn=%lx",
+ sharing_refcount - 1, folio_pfn(folio));
start = (unsigned long) folio_address(folio);
npages = folio_nr_pages(folio);
@@ -97,6 +124,15 @@ static int kvm_gmem_folio_clear_private(struct folio *folio)
{
unsigned long npages, i;
int r = 0;
+ unsigned int sharing_refcount = refcount_read(folio_get_private(folio));
+
+ /*
+ * We must restore direct map entries on acquiring the first "sharing
+ * reference". The refcount is lifted _after_ the call to
+ * kvm_gmem_folio_clear_private, so it will still be 1 here.
+ */
+ WARN_ONCE(sharing_refcount != 1, "%d unexpected sharing_refcounts pfn=%lx",
+ sharing_refcount - 1, folio_pfn(folio));
npages = folio_nr_pages(folio);
@@ -156,13 +192,21 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, unsi
if (folio_test_private(folio) && share) {
r = kvm_gmem_folio_clear_private(folio);
- } else if (!folio_test_private(folio) && !share) {
- r = kvm_gmem_folio_set_private(folio);
+ } else if (!folio_test_private(folio)) {
+ r = kvm_gmem_init_sharing_count(folio);
+ if (r)
+ goto out_err;
+
+ if (!share)
+ r = kvm_gmem_folio_set_private(folio);
}
if (r)
goto out_err;
+ if (share)
+ refcount_inc(folio_get_private(folio));
+
out:
/*
* Ignore accessed, referenced, and dirty flags. The memory is
@@ -429,7 +473,10 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
static void kvm_gmem_invalidate_folio(struct folio *folio, size_t start, size_t end)
{
if (start == 0 && end == folio_size(folio)) {
+ refcount_t *sharing_count = folio_get_private(folio);
+
kvm_gmem_folio_clear_private(folio);
+ kfree(sharing_count);
}
}
@@ -699,12 +746,20 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
int kvm_gmem_put_shared_pfn(kvm_pfn_t pfn) {
+ int r = 0;
struct folio *folio = pfn_folio(pfn);
+ refcount_t *sharing_count;
if (!kvm_gmem_test_no_direct_map(folio_inode(folio)))
return 0;
- return kvm_gmem_folio_set_private(folio);
+ sharing_count = folio_get_private(folio);
+ refcount_dec(sharing_count);
+
+ if (refcount_read(sharing_count) == 1)
+ r = kvm_gmem_folio_set_private(folio);
+
+ return r;
}
EXPORT_SYMBOL_GPL(kvm_gmem_put_shared_pfn);
Currently, if KVM_GMEM_NO_DIRECT_MAP is set and KVM wants to internally access a gmem folio, KVM needs to reinsert the folio into the direct map and hold the folio lock until KVM is done using the folio (at which point the folio is removed from the direct map again). This means that long-term reinsertion into the direct map and concurrent accesses to the same gmem folio are currently impossible. However, these are needed for data structures of paravirtual devices, such as kvm-clock, which are shared between guest and host via guest memory pages (and multiple vCPUs can put their kvm-clock data into the same guest page).

Thus, introduce the concept of a "sharing refcount", which gets incremented on every call to kvm_gmem_get_pfn with KVM_GMEM_GET_PFN_SHARED set. Direct map manipulations are only done when the first reference is grabbed (direct map entries are restored) or when the last reference goes away (direct map entries are removed). While holding a sharing reference, the folio lock may be dropped, as the refcounting ensures that the direct map entry will not be removed as long as at least one reference is held. However, whoever is holding a reference will need to listen and respond to gmem invalidation events (such as the page being in the process of being fallocated away).

Since refcount_t does not play nicely with references dropping to 0 and later being raised again (it will WARN), we use a refcount of 1 to mean "no sharing references held anywhere, folio not in direct map".

Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
---
 virt/kvm/guest_memfd.c | 61 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 58 insertions(+), 3 deletions(-)