[v11,4/8] KVM: mmu: Improve handling of non-refcounted pfns

Message ID	20240229025759.1187910-5-stevensd@google.com (mailing list archive)
State	New, archived
Headers	show Received: from mail-pf1-f181.google.com (mail-pf1-f181.google.com [209.85.210.181]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id C76B73AC1E for <kvm@vger.kernel.org>; Thu, 29 Feb 2024 02:58:25 +0000 (UTC) From: David Stevens <stevensd@chromium.org> To: Sean Christopherson <seanjc@google.com>, Paolo Bonzini <pbonzini@redhat.com> Cc: Yu Zhang <yu.c.zhang@linux.intel.com>, Isaku Yamahata <isaku.yamahata@gmail.com>, Zhi Wang <zhi.wang.linux@gmail.com>, Maxim Levitsky <mlevitsk@redhat.com>, kvmarm@lists.linux.dev, linux-kernel@vger.kernel.org, kvm@vger.kernel.org, David Stevens <stevensd@chromium.org> Subject: [PATCH v11 4/8] KVM: mmu: Improve handling of non-refcounted pfns Date: Thu, 29 Feb 2024 11:57:55 +0900 Message-ID: <20240229025759.1187910-5-stevensd@google.com> In-Reply-To: <20240229025759.1187910-1-stevensd@google.com> References: <20240229025759.1187910-1-stevensd@google.com> Precedence: bulk MIME-Version: 1.0 Content-Transfer-Encoding: 8bit
Series	KVM: allow mapping non-refcounted pages \| expand [v11,0/8] KVM: allow mapping non-refcounted pages [v11,1/8] KVM: Assert that a page's refcount is elevated when marking accessed/dirty [v11,2/8] KVM: Relax BUG_ON argument validation [v11,3/8] KVM: mmu: Introduce kvm_follow_pfn() [v11,4/8] KVM: mmu: Improve handling of non-refcounted pfns [v11,5/8] KVM: Migrate kvm_vcpu_map() to kvm_follow_pfn() [v11,6/8] KVM: x86: Migrate to kvm_follow_pfn() [v11,7/8] KVM: x86/mmu: Track if sptes refer to refcounted pages [v11,8/8] KVM: x86/mmu: Handle non-refcounted pages

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 290db5133c36..66516088bb0a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1219,10 +1219,39 @@ struct kvm_follow_pfn { bool atomic; /* Try to create a writable mapping even for a read fault. */ bool try_map_writable; + /* + * Usage of the returned pfn will be guarded by a mmu notifier. If + * FOLL_GET is not set, this must be true. + */ + bool guarded_by_mmu_notifier; + /* + * When false, do not return pfns for non-refcounted struct pages. + * + * This allows callers to continue to rely on the legacy behavior + * where pfns returned by gfn_to_pfn can be safely passed to + * kvm_release_pfn without worrying about corrupting the refcount of + * non-refcounted pages. + * + * Callers that opt into non-refcounted struct pages need to track + * whether or not the returned pages are refcounted and avoid touching + * them when they are not. Some architectures may not have enough + * free space in PTEs to do this. + */ + bool allow_non_refcounted_struct_page; /* Outputs of kvm_follow_pfn */ hva_t hva; bool writable; + /* + * Non-NULL if the returned pfn is for a page with a valid refcount, + * NULL if the returned pfn has no struct page or if the struct page is + * not being refcounted (e.g. tail pages of non-compound higher order + * allocations from IO/PFNMAP mappings). + * + * NOTE: This will still be set if FOLL_GET is not specified, but the + * returned page will not have an elevated refcount. 
+ */ + struct page *refcounted_page; }; kvm_pfn_t kvm_follow_pfn(struct kvm_follow_pfn *kfp); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 575756c9c5b0..984bcf8511e7 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -96,6 +96,13 @@ unsigned int halt_poll_ns_shrink; module_param(halt_poll_ns_shrink, uint, 0644); EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); +/* + * Allow non-refcounted struct pages and non-struct page memory to + * be mapped without MMU notifier protection. + */ +static bool allow_unsafe_mappings; +module_param(allow_unsafe_mappings, bool, 0444); + /* * Ordering of locks: * @@ -2786,6 +2793,24 @@ static inline int check_user_page_hwpoison(unsigned long addr) return rc == -EHWPOISON; } +static kvm_pfn_t kvm_follow_refcounted_pfn(struct kvm_follow_pfn *kfp, + struct page *page) +{ + kvm_pfn_t pfn = page_to_pfn(page); + + /* + * FIXME: Ideally, KVM wouldn't pass FOLL_GET to gup() when the caller + * doesn't want to grab a reference, but gup() doesn't support getting + * just the pfn, i.e. FOLL_GET is effectively mandatory. If that ever + * changes, drop this and simply don't pass FOLL_GET to gup(). + */ + if (!(kfp->flags & FOLL_GET)) + put_page(page); + + kfp->refcounted_page = page; + return pfn; +} + /* * The fast path to get the writable pfn which will be stored in @pfn, * true indicates success, otherwise false is returned. 
It's also the @@ -2804,7 +2829,7 @@ static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) return false; if (get_user_page_fast_only(kfp->hva, FOLL_WRITE, page)) { - *pfn = page_to_pfn(page[0]); + *pfn = kvm_follow_refcounted_pfn(kfp, page[0]); kfp->writable = true; return true; } @@ -2851,7 +2876,7 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) page = wpage; } } - *pfn = page_to_pfn(page); + *pfn = kvm_follow_refcounted_pfn(kfp, page); return npages; } @@ -2866,16 +2891,6 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) return true; } -static int kvm_try_get_pfn(kvm_pfn_t pfn) -{ - struct page *page = kvm_pfn_to_refcounted_page(pfn); - - if (!page) - return 1; - - return get_page_unless_zero(page); -} - static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn *kfp, kvm_pfn_t *p_pfn) { @@ -2884,6 +2899,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, pte_t pte; spinlock_t *ptl; bool write_fault = kfp->flags & FOLL_WRITE; + struct page *page; int r; r = follow_pte(vma->vm_mm, kfp->hva, &ptep, &ptl); @@ -2908,37 +2924,40 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, pte = ptep_get(ptep); + kfp->writable = pte_write(pte); + pfn = pte_pfn(pte); + + page = kvm_pfn_to_refcounted_page(pfn); + if (write_fault && !pte_write(pte)) { pfn = KVM_PFN_ERR_RO_FAULT; goto out; } - kfp->writable = pte_write(pte); - pfn = pte_pfn(pte); + if (!page) + goto out; /* - * Get a reference here because callers of *hva_to_pfn* and - * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the - * returned pfn. This is only needed if the VMA has VM_MIXEDMAP - * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will - * simply do nothing for reserved pfns. - * - * Whoever called remap_pfn_range is also going to call e.g. - * unmap_mapping_range before the underlying pages are freed, - * causing a call to our MMU notifier. 
- * - * Certain IO or PFNMAP mappings can be backed with valid - * struct pages, but be allocated without refcounting e.g., - * tail pages of non-compound higher order allocations, which - * would then underflow the refcount when the caller does the - * required put_page. Don't allow those pages here. + * IO or PFNMAP mappings can be backed with valid struct pages but be + * allocated without refcounting. We need to detect that to make sure we + * only pass refcounted pages to kvm_follow_refcounted_pfn. */ - if (!kvm_try_get_pfn(pfn)) - r = -EFAULT; + if (get_page_unless_zero(page)) + WARN_ON_ONCE(kvm_follow_refcounted_pfn(kfp, page) != pfn); out: pte_unmap_unlock(ptep, ptl); - *p_pfn = pfn; + + if (page && !kfp->refcounted_page && + !kfp->allow_non_refcounted_struct_page) { + r = -EFAULT; + } else if (!kfp->refcounted_page && + !kfp->guarded_by_mmu_notifier && + !allow_unsafe_mappings) { + r = -EFAULT; + } else { + *p_pfn = pfn; + } return r; } @@ -3004,6 +3023,11 @@ kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp) kvm_pfn_t kvm_follow_pfn(struct kvm_follow_pfn *kfp) { kfp->writable = false; + kfp->refcounted_page = NULL; + + if (WARN_ON_ONCE(!(kfp->flags & FOLL_GET) && !kfp->guarded_by_mmu_notifier)) + return KVM_PFN_ERR_FAULT; + kfp->hva = __gfn_to_hva_many(kfp->slot, kfp->gfn, NULL, kfp->flags & FOLL_WRITE); @@ -3028,9 +3052,10 @@ kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, struct kvm_follow_pfn kfp = { .slot = slot, .gfn = gfn, - .flags = 0, + .flags = FOLL_GET, .atomic = atomic, .try_map_writable = !!writable, + .allow_non_refcounted_struct_page = false, }; if (write_fault) @@ -3060,8 +3085,9 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, struct kvm_follow_pfn kfp = { .slot = gfn_to_memslot(kvm, gfn), .gfn = gfn, - .flags = write_fault ? FOLL_WRITE : 0, + .flags = FOLL_GET | (write_fault ? 
FOLL_WRITE : 0), .try_map_writable = !!writable, + .allow_non_refcounted_struct_page = false, }; pfn = kvm_follow_pfn(&kfp); if (writable) @@ -3075,7 +3101,8 @@ kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) struct kvm_follow_pfn kfp = { .slot = slot, .gfn = gfn, - .flags = FOLL_WRITE, + .flags = FOLL_GET | FOLL_WRITE, + .allow_non_refcounted_struct_page = false, }; return kvm_follow_pfn(&kfp); } @@ -3086,8 +3113,13 @@ kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gf struct kvm_follow_pfn kfp = { .slot = slot, .gfn = gfn, - .flags = FOLL_WRITE, + .flags = FOLL_GET | FOLL_WRITE, .atomic = true, + /* + * Setting atomic means __kvm_follow_pfn will never make it + * to hva_to_pfn_remapped, so this is vacuously true. + */ + .allow_non_refcounted_struct_page = true, }; return kvm_follow_pfn(&kfp); } diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 1fb21c2ced5d..6e82062ea203 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -147,8 +147,9 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) struct kvm_follow_pfn kfp = { .slot = gpc->memslot, .gfn = gpa_to_gfn(gpc->gpa), - .flags = FOLL_WRITE, + .flags = FOLL_GET | FOLL_WRITE, .hva = gpc->uhva, + .allow_non_refcounted_struct_page = false, }; lockdep_assert_held(&gpc->refresh_lock);

[v11,4/8] KVM: mmu: Improve handling of non-refcounted pfns

Commit Message

Patch