Message ID | 20210125064234.2078146-1-stevensd@google.com (mailing list archive) |
---|---|
State | New, archived |
Series | KVM: x86/mmu: consider the hva in mmu_notifier retry |
+Cc the other architectures, I'm guessing this would be a helpful optimization for all archs. Quite a few comments, but they're all little more than nits. Nice! On Mon, Jan 25, 2021, David Stevens wrote: > From: David Stevens <stevensd@chromium.org> > > Use the range passed to mmu_notifer's invalidate_range_start to prevent s/mmu_notifer/mmu_notifier. And maybe avoid calling out invalidate_range_start() by name? It took me a few reads to understand it's referring to the function, i.e. the start of the invalidation, not the start of the range. > spurious page fault retries due to changes in unrelated host virtual > addresses. This needs to elaborate on the exact scenario this is handling, as is it sounds like KVM is tracking the history of invalidations or something. Understanding this patch requires a priori knowledge of mmu_notifier_count. Something like: Track the range being invalidated by mmu_notifier and skip page fault retries if the fault address is not affected by the in-progress invalidation. Disable the optimization if multiple invalidations are in-progress to keep things simple, as tracking multiple ranges has diminishing returns. > This has the secondary effect of greatly reducing the likelihood of extreme Out of curiosity, is this really the _secondary_ effect? I would expect this change to primarily benefit scenarios where the invalidation has gotten waylaid for whatever reason. > latency when handing a page fault due to another thread having been preempted > while modifying host virtual addresses. > > Signed-off-by: David Stevens <stevensd@chromium.org> > --- ... > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > index 6d16481aa29d..79166288ed03 100644 > --- a/arch/x86/kvm/mmu/mmu.c > +++ b/arch/x86/kvm/mmu/mmu.c > @@ -3658,8 +3658,8 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, > } > > static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, > - gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, > - bool *writable) > + gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva, > + bool write, bool *writable) Side topic, I'm all for creating a 'struct kvm_page_fault' or whatever to hold all these variables. The helper functions stacks are getting unwieldy. Definitely doesn't need to be addressed here, this just reminded of how ugly these stacks are. 
> { > struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); > bool async; > @@ -3672,7 +3672,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, > } > > async = false; > - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); > + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, > + write, writable, hva); > if (!async) > return false; /* *pfn has correct page already */ > > @@ -3686,7 +3687,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, > return true; > } > > - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable); > + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, > + write, writable, hva); > return false; > } > > @@ -3699,6 +3701,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, > gfn_t gfn = gpa >> PAGE_SHIFT; > unsigned long mmu_seq; > kvm_pfn_t pfn; > + hva_t hva; > int r; > > if (page_fault_handle_page_track(vcpu, error_code, gfn)) > @@ -3717,7 +3720,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, > mmu_seq = vcpu->kvm->mmu_notifier_seq; > smp_rmb(); > > - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) > + if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva, > + write, &map_writable)) > return RET_PF_RETRY; > > if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r)) > @@ -3725,7 +3729,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, > > r = RET_PF_RETRY; > spin_lock(&vcpu->kvm->mmu_lock); > - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) > + if (mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) > goto out_unlock; > r = make_mmu_pages_available(vcpu); > if (r) > diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h > index 50e268eb8e1a..3171784139a4 100644 > --- a/arch/x86/kvm/mmu/paging_tmpl.h > +++ b/arch/x86/kvm/mmu/paging_tmpl.h > @@ -790,6 +790,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, > struct guest_walker walker; > int r; > kvm_pfn_t pfn; > + hva_t hva; > unsigned long mmu_seq; > bool map_writable, is_self_change_mapping; > int max_level; > @@ -840,8 +841,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, > mmu_seq = vcpu->kvm->mmu_notifier_seq; > smp_rmb(); > > - if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault, > - &map_writable)) > + if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva, > + write_fault, &map_writable)) > return RET_PF_RETRY; > > if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) > @@ -869,7 +870,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, > > r = RET_PF_RETRY; > spin_lock(&vcpu->kvm->mmu_lock); > - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) > + if (mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) > goto out_unlock; > > kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index f3b1013fb22c..b70097685249 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -502,6 +502,8 @@ struct kvm { > struct mmu_notifier mmu_notifier; > unsigned long mmu_notifier_seq; > long mmu_notifier_count; > + unsigned long mmu_notifier_range_start; > + unsigned long mmu_notifier_range_end; > #endif > long tlbs_dirty; > struct list_head devices; > @@ -729,7 +731,7 @@ kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn); > kvm_pfn_t 
gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn); > kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, > bool atomic, bool *async, bool write_fault, > - bool *writable); > + bool *writable, hva_t *hva); > > void kvm_release_pfn_clean(kvm_pfn_t pfn); > void kvm_release_pfn_dirty(kvm_pfn_t pfn); > @@ -1203,6 +1205,24 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) > return 1; > return 0; > } > + > +static inline int mmu_notifier_retry_hva(struct kvm *kvm, > + unsigned long mmu_seq, > + unsigned long hva) > +{ > + /* > + * Unlike mmu_notifier_retry, this function relies on > + * kvm->mmu_lock for consistency. mmu_notifier_retry is the outlier due to PPC behavior. Maybe just add a lockdep annonation and call it good? > + */ This needs a comment to explicitly state that 'count > 1' cannot be done at this time. My initial thought is that it would be more intuitive to check for 'count > 1' here, but that would potentially check the wrong wrange when count goes from 2->1. The comment about persistence in invalidate_range_start() is a good hint, but I think it's worth being explicit to avoid bad "cleanup" in the future. > + if (unlikely(kvm->mmu_notifier_count)) { > + if (kvm->mmu_notifier_range_start <= hva && > + hva < kvm->mmu_notifier_range_end) Combine these into a single statement? I think the result is easier to read? if (unlikely(kvm->mmu_notifier_count) && kvm->mmu_notifier_range_start <= hva && hva < kvm->mmu_notifier_range_end) > + return 1; > + } > + if (kvm->mmu_notifier_seq != mmu_seq) > + return 1; > + return 0; > +} > #endif > > #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index fa9e3614d30e..d6e1ef5cb184 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -483,6 +483,18 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, > * count is also read inside the mmu_lock critical section. > */ > kvm->mmu_notifier_count++; > + if (likely(kvm->mmu_notifier_count == 1)) { > + kvm->mmu_notifier_range_start = range->start; > + kvm->mmu_notifier_range_end = range->end; > + } else { > + /** > + * Tracking multiple concurrent ranges has diminishing returns, > + * so just use the maximum range. This persists until after all > + * outstanding invalidation operations complete. > + */ > + kvm->mmu_notifier_range_start = 0; > + kvm->mmu_notifier_range_end = ULONG_MAX; Hrm, I don't think there's a corner case in practice, but ULONG_MAX is a legal virtual address and range_end is exclusive. E.g. passing hva=-1ul would get a false negative in mmu_notifier_retry_hva(). It's not an issue as written because hva is generated from the gfn, and hva can't be a kernel address. I'm guessing mmu_notifier also doesn't fire on kernel addresses. I assume that all holds true for other architectures, and adding checks in mmu_notifier_retry_hva() feels like a waste of cycles, but it still bugs me. :-) Maybe zero out range_end and explicitly check for that, just to be paranoid? 
if (unlikely(kvm->mmu_notifier_count) && (!kvm->mmu_notifier_range_end || (kvm->mmu_notifier_range_start <= hva && hva < kvm->mmu_notifier_range_end)) > + } > need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end, > range->flags); > /* we've to flush the tlb before the pages can be freed */ > @@ -2010,9 +2022,11 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, > > kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, > bool atomic, bool *async, bool write_fault, > - bool *writable) > + bool *writable, hva_t *hva) Hrm, it feels like we should really split gfn->hva and hva->pfn into separate operations, but pretty much every arch needs the hva error handling. Splitting it would probably do more harm than good, at least not without a lot of additional refactoring. Bummer. > { > unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); Newline here. > + if (hva) > + *hva = addr; > > if (addr == KVM_HVA_ERR_RO_BAD) { > if (writable) > @@ -2041,19 +2055,19 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, > bool *writable) > { > return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, > - write_fault, writable); > + write_fault, writable, NULL); > } > EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); > > kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) > { > - return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); > + return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL); > } > EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); > > kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) > { > - return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); > + return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL); > } > EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); > > -- > 2.30.0.280.ga3ce27912f-goog >
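Folding those suggestions together (a lockdep assertion in place of the locking comment, one combined condition, and a zeroed range_end as the "no exact range" sentinel that invalidate_range_start() would set when invalidations overlap), the helper could read roughly as below. This is only a sketch of the review feedback, not the posted patch or what was eventually merged:

static inline int mmu_notifier_retry_hva(struct kvm *kvm,
					 unsigned long mmu_seq,
					 unsigned long hva)
{
	/* Unlike mmu_notifier_retry(), rely on mmu_lock for consistency. */
	lockdep_assert_held(&kvm->mmu_lock);

	/*
	 * A zeroed range_end means multiple invalidations are in flight, so
	 * treat every hva as affected; this also sidesteps the exclusive-end
	 * vs. ULONG_MAX corner case.
	 */
	if (unlikely(kvm->mmu_notifier_count) &&
	    (!kvm->mmu_notifier_range_end ||
	     (kvm->mmu_notifier_range_start <= hva &&
	      hva < kvm->mmu_notifier_range_end)))
		return 1;

	if (kvm->mmu_notifier_seq != mmu_seq)
		return 1;

	return 0;
}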
> > This has the secondary effect of greatly reducing the likelihood of extreme
>
> Out of curiosity, is this really the _secondary_ effect? I would expect this
> change to primarily benefit scenarios where the invalidation has gotten
> waylaid for whatever reason.

Yeah, this is the primary benefit. I was thinking about it as the reduction in
page fault retries is the direct effect, and that in turn leads to a secondary
effect of a reduction in the chance of extreme latency. But I guess that's not
a particularly important distinction to make. I'll reword this.

>
> This needs a comment to explicitly state that 'count > 1' cannot be done at
> this time. My initial thought is that it would be more intuitive to check for
> 'count > 1' here, but that would potentially check the wrong wrange when count
> goes from 2->1. The comment about persistence in invalidate_range_start() is a
> good hint, but I think it's worth being explicit to avoid bad "cleanup" in the
> future.
>
> > +	if (unlikely(kvm->mmu_notifier_count)) {
> > +		if (kvm->mmu_notifier_range_start <= hva &&
> > +		    hva < kvm->mmu_notifier_range_end)

I'm not sure I understand what you're suggesting here. How exactly would
'count > 1' be used incorrectly here? I'm fine with adding a comment, but I'm
not sure what the comment needs to clarify.

-David
On Tue, Jan 26, 2021, David Stevens wrote:
> > This needs a comment to explicitly state that 'count > 1' cannot be done at
> > this time. My initial thought is that it would be more intuitive to check for
> > 'count > 1' here, but that would potentially check the wrong wrange when count
> > goes from 2->1. The comment about persistence in invalidate_range_start() is a
> > good hint, but I think it's worth being explicit to avoid bad "cleanup" in the
> > future.
> >
> > > +	if (unlikely(kvm->mmu_notifier_count)) {
> > > +		if (kvm->mmu_notifier_range_start <= hva &&
> > > +		    hva < kvm->mmu_notifier_range_end)
>
> I'm not sure I understand what you're suggesting here. How exactly
> would 'count > 1' be used incorrectly here? I'm fine with adding a
> comment, but I'm not sure what the comment needs to clarify.

There's no guarantee that the remaining in-progress invalidation when the count
goes from 2->1 is the same invalidation call that set range_start/range_end.
E.g. given two invalidations, A and B, the order of calls could be:

  kvm_mmu_notifier_invalidate_range_start(A)
  kvm_mmu_notifier_invalidate_range_start(B)
  kvm_mmu_notifier_invalidate_range_end(A)
  kvm_mmu_notifier_invalidate_range_end(B)   <-- ???

or

  kvm_mmu_notifier_invalidate_range_start(A)
  kvm_mmu_notifier_invalidate_range_start(B)
  kvm_mmu_notifier_invalidate_range_end(B)
  kvm_mmu_notifier_invalidate_range_end(A)   <-- ???

In the first case, "B" is still in-progress when the count goes 2->1; in the
second case, "A" is still in-progress. Checking for "count > 1" in the consumer
instead of handling it in the producer (as you did) would lead to the consumer
checking against the wrong range. I don't see a way to solve that without
adding some amount of history, which I agree is unnecessary.
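Put differently, the comment being asked for belongs next to the consumer-side check. One possible wording, shown as a fragment of mmu_notifier_retry_hva(); the comment text here is illustrative, not taken from the posted patch:

	if (unlikely(kvm->mmu_notifier_count)) {
		/*
		 * Only a single in-progress range is tracked.  Do not try to
		 * narrow this to a "count > 1" check: when the count drops
		 * from 2 to 1 there is no way to know which invalidation is
		 * still outstanding, so the widened range installed by
		 * invalidate_range_start() must keep applying until the
		 * count reaches zero.
		 */
		if (kvm->mmu_notifier_range_start <= hva &&
		    hva < kvm->mmu_notifier_range_end)
			return 1;
	}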
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 38ea396a23d6..8e06cd3f759c 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -590,7 +590,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu, } else { /* Call KVM generic code to do the slow-path check */ pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, - writing, &write_ok); + writing, &write_ok, NULL); if (is_error_noslot_pfn(pfn)) return -EFAULT; page = NULL; diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index bb35490400e9..e603de7ade52 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -822,7 +822,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, /* Call KVM generic code to do the slow-path check */ pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, - writing, upgrade_p); + writing, upgrade_p, NULL); if (is_error_noslot_pfn(pfn)) return -EFAULT; page = NULL; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6d16481aa29d..79166288ed03 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3658,8 +3658,8 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, - bool *writable) + gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva, + bool write, bool *writable) { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); bool async; @@ -3672,7 +3672,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, } async = false; - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, + write, writable, hva); if (!async) return false; /* *pfn has correct page already */ @@ -3686,7 +3687,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, return true; } - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable); + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, + write, writable, hva); return false; } @@ -3699,6 +3701,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; kvm_pfn_t pfn; + hva_t hva; int r; if (page_fault_handle_page_track(vcpu, error_code, gfn)) @@ -3717,7 +3720,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) + if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva, + write, &map_writable)) return RET_PF_RETRY; if (handle_abnormal_pfn(vcpu, is_tdp ? 
0 : gpa, gfn, pfn, ACC_ALL, &r)) @@ -3725,7 +3729,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, r = RET_PF_RETRY; spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + if (mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) goto out_unlock; r = make_mmu_pages_available(vcpu); if (r) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 50e268eb8e1a..3171784139a4 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -790,6 +790,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, struct guest_walker walker; int r; kvm_pfn_t pfn; + hva_t hva; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; int max_level; @@ -840,8 +841,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault, - &map_writable)) + if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva, + write_fault, &map_writable)) return RET_PF_RETRY; if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) @@ -869,7 +870,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, r = RET_PF_RETRY; spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + if (mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f3b1013fb22c..b70097685249 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -502,6 +502,8 @@ struct kvm { struct mmu_notifier mmu_notifier; unsigned long mmu_notifier_seq; long mmu_notifier_count; + unsigned long mmu_notifier_range_start; + unsigned long mmu_notifier_range_end; #endif long tlbs_dirty; struct list_head devices; @@ -729,7 +731,7 @@ kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn); kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn); kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, bool *async, bool write_fault, - bool *writable); + bool *writable, hva_t *hva); void kvm_release_pfn_clean(kvm_pfn_t pfn); void kvm_release_pfn_dirty(kvm_pfn_t pfn); @@ -1203,6 +1205,24 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) return 1; return 0; } + +static inline int mmu_notifier_retry_hva(struct kvm *kvm, + unsigned long mmu_seq, + unsigned long hva) +{ + /* + * Unlike mmu_notifier_retry, this function relies on + * kvm->mmu_lock for consistency. + */ + if (unlikely(kvm->mmu_notifier_count)) { + if (kvm->mmu_notifier_range_start <= hva && + hva < kvm->mmu_notifier_range_end) + return 1; + } + if (kvm->mmu_notifier_seq != mmu_seq) + return 1; + return 0; +} #endif #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fa9e3614d30e..d6e1ef5cb184 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -483,6 +483,18 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, * count is also read inside the mmu_lock critical section. */ kvm->mmu_notifier_count++; + if (likely(kvm->mmu_notifier_count == 1)) { + kvm->mmu_notifier_range_start = range->start; + kvm->mmu_notifier_range_end = range->end; + } else { + /** + * Tracking multiple concurrent ranges has diminishing returns, + * so just use the maximum range. 
This persists until after all + * outstanding invalidation operations complete. + */ + kvm->mmu_notifier_range_start = 0; + kvm->mmu_notifier_range_end = ULONG_MAX; + } need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end, range->flags); /* we've to flush the tlb before the pages can be freed */ @@ -2010,9 +2022,11 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, bool *async, bool write_fault, - bool *writable) + bool *writable, hva_t *hva) { unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); + if (hva) + *hva = addr; if (addr == KVM_HVA_ERR_RO_BAD) { if (writable) @@ -2041,19 +2055,19 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable) { return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, - write_fault, writable); + write_fault, writable, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) { - return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); + return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) { - return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); + return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
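Circling back to the aside in the first review about the unwieldy fault-handler stacks: a purely hypothetical sketch of what bundling the fault state into a 'struct kvm_page_fault' could look like. Every field name below is illustrative and not something defined by this patch:

struct kvm_page_fault {
	/* Inputs describing the fault. */
	gpa_t cr2_or_gpa;
	gfn_t gfn;
	u32 error_code;
	bool write;
	bool prefault;

	/* Outputs filled in while resolving the fault. */
	kvm_pfn_t pfn;
	hva_t hva;		/* consumed by mmu_notifier_retry_hva() */
	bool map_writable;
};

/* try_async_pf() and friends would then take the vcpu plus one pointer. */
static bool try_async_pf(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);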