@@ -843,6 +843,9 @@ struct kvm {
bool attribute_change_in_progress;
#endif
char stats_id[KVM_STATS_NAME_SIZE];
+#ifdef CONFIG_KVM_PRIVATE_MEM
+ atomic_t gmem_active_invalidate_count;
+#endif
};
#define kvm_err(fmt, ...) \
@@ -71,6 +71,7 @@ struct gfn_to_pfn_cache {
bool active;
bool valid;
bool private;
+ bool needs_unmap;
};
#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
@@ -231,6 +231,15 @@ static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
struct kvm *kvm = gmem->kvm;
unsigned long index;
+ atomic_inc(&kvm->gmem_active_invalidate_count);
+
+ xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
+ pgoff_t pgoff = slot->gmem.pgoff;
+
+ gfn_to_pfn_cache_invalidate_gfns_start(kvm, slot->base_gfn + start - pgoff,
+ slot->base_gfn + end - pgoff, true);
+ }
+
xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
pgoff_t pgoff = slot->gmem.pgoff;
@@ -268,6 +277,8 @@ static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
kvm_mmu_invalidate_end(kvm);
KVM_MMU_UNLOCK(kvm);
}
+
+ atomic_dec(&kvm->gmem_active_invalidate_count);
}
static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
@@ -478,7 +489,13 @@ static void kvm_gmem_invalidate_folio(struct folio *folio, size_t start, size_t
if (start == 0 && end == folio_size(folio)) {
refcount_t *sharing_count = folio_get_private(folio);
- kvm_gmem_folio_clear_private(folio);
+ /*
+ * gfn_to_pfn_caches do not decrement the refcount if they
+ * get invalidated due to the gmem pfn going away (fallocate,
+ * or error_remove_folio)
+ */
+ if (refcount_read(sharing_count) == 1)
+ kvm_gmem_folio_clear_private(folio);
kfree(sharing_count);
}
}
@@ -1161,6 +1161,9 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
xa_init(&kvm->mem_attr_array);
#endif
+#ifdef CONFIG_KVM_PRIVATE_MEM
+ atomic_set(&kvm->gmem_active_invalidate_count, 0);
+#endif
INIT_LIST_HEAD(&kvm->gpc_list);
spin_lock_init(&kvm->gpc_lock);
@@ -2549,7 +2552,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
}
kvm->attribute_change_in_progress = true;
- gfn_to_pfn_cache_invalidate_gfns_start(kvm, start, end);
+ gfn_to_pfn_cache_invalidate_gfns_start(kvm, start, end, false);
kvm_handle_gfn_range(kvm, &pre_set_range);
@@ -30,7 +30,8 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
void gfn_to_pfn_cache_invalidate_gfns_start(struct kvm *kvm,
gfn_t start,
- gfn_t end);
+ gfn_t end,
+ bool needs_unmap);
#else
static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
unsigned long start,
@@ -40,7 +41,8 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
static inline void gfn_to_pfn_cache_invalidate_gfns_start(struct kvm *kvm,
gfn_t start,
- gfn_t end)
+ gfn_t end,
+ bool needs_unmap)
{
}
#endif /* HAVE_KVM_PFNCACHE */
@@ -61,8 +61,15 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
/*
* Identical to `gfn_to_pfn_cache_invalidate_start`, except based on gfns
* instead of uhvas.
+ *
+ * needs_unmap indicates whether this invalidation is because a gmem range went
+ * away (fallocate(FALLOC_FL_PUNCH_HOLE), error_remove_folio), in which case
+ * we must not call kvm_gmem_put_shared_pfn for it, or because of a memory
+ * attribute change, in which case the gmem pfn still exists, but simply
+ * is no longer mapped into the guest.
*/
-void gfn_to_pfn_cache_invalidate_gfns_start(struct kvm *kvm, gfn_t start, gfn_t end)
+void gfn_to_pfn_cache_invalidate_gfns_start(struct kvm *kvm, gfn_t start, gfn_t end,
+ bool needs_unmap)
{
struct gfn_to_pfn_cache *gpc;
@@ -78,14 +85,16 @@ void gfn_to_pfn_cache_invalidate_gfns_start(struct kvm *kvm, gfn_t start, gfn_t
continue;
}
- if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
+ if (!is_error_noslot_pfn(gpc->pfn) &&
gpa_to_gfn(gpc->gpa) >= start && gpa_to_gfn(gpc->gpa) < end) {
read_unlock_irq(&gpc->lock);
write_lock_irq(&gpc->lock);
- if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
- gpa_to_gfn(gpc->gpa) >= start && gpa_to_gfn(gpc->gpa) < end)
+ if (!is_error_noslot_pfn(gpc->pfn) &&
+ gpa_to_gfn(gpc->gpa) >= start && gpa_to_gfn(gpc->gpa) < end) {
gpc->valid = false;
+ gpc->needs_unmap = needs_unmap && gpc->private;
+ }
write_unlock_irq(&gpc->lock);
continue;
}
@@ -194,6 +203,9 @@ static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_s
*/
if (kvm->attribute_change_in_progress)
return true;
+
+ if (atomic_read_acquire(&kvm->gmem_active_invalidate_count))
+ return true;
/*
* Ensure mn_active_invalidate_count is read before
* mmu_invalidate_seq. This pairs with the smp_wmb() in
@@ -425,20 +437,28 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l
* Some/all of the uhva, gpa, and memslot generation info may still be
* valid, leave it as is.
*/
+ unmap_old = gpc->needs_unmap;
if (ret) {
gpc->valid = false;
gpc->pfn = KVM_PFN_ERR_FAULT;
gpc->khva = NULL;
+ gpc->needs_unmap = false;
+ } else {
+ gpc->needs_unmap = true;
}
/* Detect a pfn change before dropping the lock! */
- unmap_old = (old_pfn != gpc->pfn);
+ unmap_old &= (old_pfn != gpc->pfn);
out_unlock:
+ if (unmap_old)
+ folio_get(pfn_folio(old_pfn));
write_unlock_irq(&gpc->lock);
- if (unmap_old)
+ if (unmap_old) {
gpc_unmap(old_pfn, old_khva, old_private);
+ folio_put(pfn_folio(old_pfn));
+ }
return ret;
}
@@ -530,6 +550,7 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
kvm_pfn_t old_pfn;
void *old_khva;
bool old_private;
+ bool old_needs_unmap;
guard(mutex)(&gpc->refresh_lock);
@@ -555,14 +576,25 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
old_private = gpc->private;
gpc->private = false;
+ old_needs_unmap = gpc->needs_unmap;
+ gpc->needs_unmap = false;
+
old_pfn = gpc->pfn;
gpc->pfn = KVM_PFN_ERR_FAULT;
+
+ if (old_needs_unmap && old_private)
+ folio_get(pfn_folio(old_pfn));
+
write_unlock_irq(&gpc->lock);
spin_lock(&kvm->gpc_lock);
list_del(&gpc->list);
spin_unlock(&kvm->gpc_lock);
- gpc_unmap(old_pfn, old_khva, old_private);
+ if (old_needs_unmap) {
+ gpc_unmap(old_pfn, old_khva, old_private);
+ if (old_private)
+ folio_put(pfn_folio(old_pfn));
+ }
}
}
Invalidate gfn_to_pfn_caches that hold gmem pfns whenever gmem invalidations occur (fallocate(FALLOC_FL_PUNCH_HOLE), error_remove_folio).. gmem invalidations are difficult to handle for gpcs. The unmap path for gmem pfns in gpc tries to decrement the sharing ref count, and potentially modifies the direct map. However, these are not operations we can do after the gmem folio that used to sit in the pfn has been freed (and after we drop gpc->lock in gfn_to_pfn_cache_invalidate_gfns_start we are racing against the freeing of the folio, and we cannot do direct map manipulations before dropping the lock). Thus, in these cases (punch hole and error_remove_folio), we must "leak" the sharing reference (which is fine because either the folio has already been freed, or it is about to be freed by ->invalidate_folio, which only reinserts into the direct map. So if the folio already is in the direct map, no harm is done). So in these cases, we simply store a flag that tells gpc to skip unmapping of these pfns when the time comes to refresh the cache. A slightly different case are if just the memory attributes on a memslot change. If we switch from private to shared, the gmem pfn will still be there, it will simply no longer be mapped into the guest. In this scenario, we must unmap to decrement the sharing count, and reinsert into the direct map. Otherwise, if for example the gpc gets deactivated while the gfn is set to shared, and after that the gfn is flipped to private, something else might use the pfn, but it is still present in the direct map (which violates the security goal of direct map removal). However, there is one edge case we need to deal with: It could happen that a gpc gets invalidated by a memory attribute change (e.g. gpc->needs_unmap = true), then refreshed, and after the refresh loop has exited and the gpc->lock is dropped, but before we get to gpc_unmap, the gmem folio that occupies the invalidated pfn of the cache is fallocated away. Now needs_unmap will be true, but we are once again racing against the freeing of the folio. For this case, take a reference to the folio before we drop the gpc->lock, and only drop the reference after gpc_unmap returned, to avoid the folio being freed. For similar reasons, gfn_to_pfn_cache_invalidate_gfns_start needs to not ignore already invalidated caches, as a cache that was invalidated due to a memory attribute change will have needs_unmap=true. If a fallocate(FALLOC_FL_PUNCH_HOLE) operation happens on the same range, this will need to get updated to needs_unmap=false, even if the cache is already invalidated. Signed-off-by: Patrick Roy <roypat@amazon.co.uk> --- include/linux/kvm_host.h | 3 +++ include/linux/kvm_types.h | 1 + virt/kvm/guest_memfd.c | 19 +++++++++++++++- virt/kvm/kvm_main.c | 5 ++++- virt/kvm/kvm_mm.h | 6 +++-- virt/kvm/pfncache.c | 46 +++++++++++++++++++++++++++++++++------ 6 files changed, 69 insertions(+), 11 deletions(-)