diff mbox series

[mm-unstable,v2,01/10] mm/kvm: add mmu_notifier_ops->test_clear_young()

Message ID 20230526234435.662652-2-yuzhao@google.com (mailing list archive)
State New, archived
Headers show
Series mm/kvm: locklessly clear the accessed bit | expand

Commit Message

Yu Zhao May 26, 2023, 11:44 p.m. UTC
Add mmu_notifier_ops->test_clear_young() to supersede test_young()
and clear_young().

test_clear_young() has a fast path, which if supported, allows its
callers to safely clear the accessed bit without taking
kvm->mmu_lock.

The fast path requires arch-specific code that generally relies on
RCU and CAS: the former protects KVM page tables from being freed
while the latter clears the accessed bit atomically against both the
hardware and other software page table walkers. If the fast path is
unsupported, test_clear_young() falls back to the existing slow path
where kvm->mmu_lock is then taken.

test_clear_young() can also operate on a range of KVM PTEs
individually according to a bitmap, if the caller provides it.

Signed-off-by: Yu Zhao <yuzhao@google.com>
---
 include/linux/kvm_host.h     | 22 +++++++++++
 include/linux/mmu_notifier.h | 52 ++++++++++++++++++++++++
 mm/mmu_notifier.c            | 24 ++++++++++++
 virt/kvm/kvm_main.c          | 76 +++++++++++++++++++++++++++++++++++-
 4 files changed, 173 insertions(+), 1 deletion(-)

Comments

Tzung-Bi Shih June 6, 2023, 8:34 a.m. UTC | #1
On Fri, May 26, 2023 at 05:44:26PM -0600, Yu Zhao wrote:
> +/*
> + * Architectures that implement kvm_arch_test_clear_young() should override
> + * kvm_arch_has_test_clear_young().
> + *
> + * kvm_arch_has_test_clear_young() is allowed to return false positive, i.e., it
> + * can return true if kvm_arch_test_clear_young() is supported but disabled due
> + * to some runtime constraint. In this case, kvm_arch_test_clear_young() should

Is it a typo here?  s/kvm_arch_test_clear_young/kvm_arch_has_test_clear_young/.

> +static inline int mmu_notifier_clear_young(struct mm_struct *mm,
> +					   unsigned long start,
> +					   unsigned long end)
> +{
> +	return 0;
> +}
> +

This looks irrelevant to the patch but a fix for commit 1d7715c676a1
("mmu-notifier: add clear_young callback") instead.
Yu Zhao June 9, 2023, 1 a.m. UTC | #2
On Tue, Jun 6, 2023 at 2:34 AM Tzung-Bi Shih <tzungbi@kernel.org> wrote:
>
> On Fri, May 26, 2023 at 05:44:26PM -0600, Yu Zhao wrote:
> > +/*
> > + * Architectures that implement kvm_arch_test_clear_young() should override
> > + * kvm_arch_has_test_clear_young().
> > + *
> > + * kvm_arch_has_test_clear_young() is allowed to return false positive, i.e., it
> > + * can return true if kvm_arch_test_clear_young() is supported but disabled due
> > + * to some runtime constraint. In this case, kvm_arch_test_clear_young() should
>
> Is it a typo here?  s/kvm_arch_test_clear_young/kvm_arch_has_test_clear_young/.

Not a typo.
Paolo Bonzini June 9, 2023, 9:04 a.m. UTC | #3
On 5/31/23 21:17, Jason Gunthorpe wrote:
>> +	int (*test_clear_young)(struct mmu_notifier *mn, struct mm_struct *mm,
>> +				unsigned long start, unsigned long end,
>> +				bool clear, unsigned long *bitmap);
>> +
> Why leave clear_young behind? Just make a NULL bitmap mean
> clear_young?

It goes away in patch 2, together with:

@@ -437,7 +412,7 @@ static inline int mmu_notifier_clear_young(struct mm_struct *mm,
  					   unsigned long end)
  {
  	if (mm_has_notifiers(mm))
-		return __mmu_notifier_clear_young(mm, start, end);
+		return __mmu_notifier_test_clear_young(mm, start, end, true, NULL);
  	return 0;
  }
  
@@ -445,7 +420,7 @@ static inline int mmu_notifier_test_young(struct mm_struct *mm,
  					  unsigned long address)
  {
  	if (mm_has_notifiers(mm))
-		return __mmu_notifier_test_young(mm, address);
+		return __mmu_notifier_test_clear_young(mm, address, address + 1, false, NULL);
  	return 0;
  }
  

Paolo
Sean Christopherson June 15, 2023, 5:42 p.m. UTC | #4
On Fri, May 26, 2023, Yu Zhao wrote:
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 0e571e973bc2..374262545f96 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -258,6 +258,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
>  #ifdef KVM_ARCH_WANT_MMU_NOTIFIER
>  struct kvm_gfn_range {
>  	struct kvm_memory_slot *slot;
> +	void *args;

There's no reason to make this "void *", just declare "struct test_clear_young_args"
in the header.  Arch code won't be able to use it regardless.  And I vote for
something more like "test_clear_young_metadata", as there's far more information
in there than just function arguments.

And to stave off the argument that "void *" would allow reuse, take this opportunity
to unionize the test_clear_young field with the change_pte field, e.g.

	/* comment about these fields being callback specific. */
	union {
		struct test_clear_young_metadata *metadata;
		pte_t pte;
		unsigned long callback_arg; /* needs a better name */
	};

> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 51e4882d0873..31ee58754b19 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -541,6 +541,7 @@ typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
>  typedef void (*on_unlock_fn_t)(struct kvm *kvm);
>  
>  struct kvm_hva_range {
> +	void *args;

Same feedback as kvm_gfn_range.

>  	unsigned long start;
>  	unsigned long end;
>  	pte_t pte;
> @@ -549,6 +550,7 @@ struct kvm_hva_range {
>  	on_unlock_fn_t on_unlock;
>  	bool flush_on_ret;
>  	bool may_block;
> +	bool lockless;
>  };
>  
>  /*
> @@ -602,6 +604,8 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
>  			hva_end = min(range->end, slot->userspace_addr +
>  						  (slot->npages << PAGE_SHIFT));
>  
> +			gfn_range.args = range->args;

And this goes away because the generic callback_arg is what gets propagated.

> +
>  			/*
>  			 * To optimize for the likely case where the address
>  			 * range is covered by zero or one memslots, don't
> @@ -619,7 +623,7 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
>  			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
>  			gfn_range.slot = slot;
>  
> -			if (!locked) {
> +			if (!range->lockless && !locked) {
>  				locked = true;
>  				KVM_MMU_LOCK(kvm);
>  				if (!IS_KVM_NULL_FN(range->on_lock))
> @@ -628,6 +632,9 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
>  					break;
>  			}
>  			ret |= range->handler(kvm, &gfn_range);
> +
> +			if (range->lockless && ret)

I don't like overloading "lockless" to also mean "stop on ret".  Just add another
flag, there's literally no cost for most callbacks as everything is constant at
compile time and gets optimized away.

> +		range.args = &args;
> +		range.lockless = true;

The lockless and stop_on_ret behavior needs comments.

> +		range.handler = kvm_arch_test_clear_young;
> +
> +		if (!__kvm_handle_hva_range(kvm, &range))
> +			return args.young ? MMU_NOTIFIER_RANGE_LOCKLESS : 0;
> +	}
> +
> +	if (bitmap)
> +		return 0;
> +
> +	range.args = NULL;
> +	range.lockless = false;

No need to manually clear these, they'll be zeroed by the initialization code.

E.g. all in all, something like so

---
 include/linux/kvm_host.h |  9 +++++++--
 virt/kvm/kvm_main.c      | 29 +++++++++++++++++------------
 2 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7a0922cbc36f..e04605061f5e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -256,12 +256,17 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
 #ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+struct test_clear_young_metadata;
+
 struct kvm_gfn_range {
 	struct kvm_memory_slot *slot;
-	void *args;
 	gfn_t start;
 	gfn_t end;
-	pte_t pte;
+	union {
+		struct test_clear_young_metadata *metadata;
+		pte_t pte;
+		unsigned long callback_arg;
+	};
 	bool may_block;
 };
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ac83cfb30771..8cf4fee9cd8b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -536,16 +536,20 @@ typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
 typedef void (*on_unlock_fn_t)(struct kvm *kvm);
 
 struct kvm_hva_range {
-	void *args;
 	unsigned long start;
 	unsigned long end;
-	pte_t pte;
 	hva_handler_t handler;
+	union {
+		struct test_clear_young_metadata *metadata;
+		pte_t pte;
+		unsigned long callback_arg;
+	};
 	on_lock_fn_t on_lock;
 	on_unlock_fn_t on_unlock;
 	bool flush_on_ret;
 	bool may_block;
 	bool lockless;
+	bool stop_on_ret;
 };
 
 /*
@@ -576,6 +580,9 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 	struct kvm_memslots *slots;
 	int i, idx;
 
+	BUILD_BUG_ON(sizeof(gfn_range.callback_arg) != sizeof(gfn_range.pte) ||
+		     sizeof(gfn_range.callback_arg) != sizeof(gfn_range.metadata));
+
 	if (WARN_ON_ONCE(range->end <= range->start))
 		return 0;
 
@@ -599,16 +606,14 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 			hva_end = min(range->end, slot->userspace_addr +
 						  (slot->npages << PAGE_SHIFT));
 
-			gfn_range.args = range->args;
-
 			/*
 			 * To optimize for the likely case where the address
 			 * range is covered by zero or one memslots, don't
 			 * bother making these conditional (to avoid writes on
 			 * the second or later invocation of the handler).
 			 */
-			gfn_range.pte = range->pte;
 			gfn_range.may_block = range->may_block;
+			gfn_range.callback_arg = range->callback_arg;
 
 			/*
 			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
@@ -628,7 +633,8 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 			}
 			ret |= range->handler(kvm, &gfn_range);
 
-			if (range->lockless && ret)
+			/* comment goes here. */
+			if (range->stop_on_ret && ret)
 				break;
 		}
 	}
@@ -830,7 +836,7 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 	return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
 }
 
-struct test_clear_young_args {
+struct test_clear_young_metadata {
 	unsigned long *bitmap;
 	unsigned long end;
 	bool clear;
@@ -839,7 +845,7 @@ struct test_clear_young_args {
 
 bool kvm_should_clear_young(struct kvm_gfn_range *range, gfn_t gfn)
 {
-	struct test_clear_young_args *args = range->args;
+	struct test_clear_young_metadata *args = range->metadata;
 
 	VM_WARN_ON_ONCE(gfn < range->start || gfn >= range->end);
 
@@ -880,14 +886,15 @@ static int kvm_mmu_notifier_test_clear_young(struct mmu_notifier *mn, struct mm_
 	trace_kvm_age_hva(start, end);
 
 	if (kvm_test_clear_young) {
-		struct test_clear_young_args args = {
+		struct test_clear_young_metadata args = {
 			.bitmap	= bitmap,
 			.end	= end,
 			.clear	= clear,
 		};
 
-		range.args = &args;
 		range.lockless = true;
+		range.stop_on_ret = true;
+		range.metadata = &args;
 		range.handler = kvm_test_clear_young;
 
 		if (!__kvm_handle_hva_range(kvm, &range))
@@ -897,8 +904,6 @@ static int kvm_mmu_notifier_test_clear_young(struct mmu_notifier *mn, struct mm_
 	if (bitmap)
 		return 0;
 
-	range.args = NULL;
-	range.lockless = false;
 	range.handler = clear ? kvm_age_gfn : kvm_test_age_gfn;
 
 	return __kvm_handle_hva_range(kvm, &range) ? MMU_NOTIFIER_RANGE_YOUNG : 0;

base-commit: 7a5d8be2c18415b73b9380741095f439d6983a40
Nicholas Piggin June 20, 2023, 7:30 a.m. UTC | #5
On Sat May 27, 2023 at 9:44 AM AEST, Yu Zhao wrote:
> Add mmu_notifier_ops->test_clear_young() to supersede test_young()
> and clear_young().
>
> test_clear_young() has a fast path, which if supported, allows its
> callers to safely clear the accessed bit without taking
> kvm->mmu_lock.
>
> The fast path requires arch-specific code that generally relies on
> RCU and CAS: the former protects KVM page tables from being freed
> while the latter clears the accessed bit atomically against both the
> hardware and other software page table walkers. If the fast path is
> unsupported, test_clear_young() falls back to the existing slow path
> where kvm->mmu_lock is then taken.
>
> test_clear_young() can also operate on a range of KVM PTEs
> individually according to a bitmap, if the caller provides it.

It would be better if you could do patch 1 that only touches the
mmu_notifier code and implements mmu_notifier_test_clear_young() in
terms of existing callbacks, and next patch swaps KVM to new callbacks
and remove the old ones.

> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> index 64a3e051c3c4..dfdbb370682d 100644
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -60,6 +60,8 @@ enum mmu_notifier_event {
>  };
>  
>  #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
> +#define MMU_NOTIFIER_RANGE_LOCKLESS	(1 << 1)
> +#define MMU_NOTIFIER_RANGE_YOUNG	(1 << 2)
>  
>  struct mmu_notifier_ops {
>  	/*
> @@ -122,6 +124,10 @@ struct mmu_notifier_ops {
>  			  struct mm_struct *mm,
>  			  unsigned long address);
>  
> +	int (*test_clear_young)(struct mmu_notifier *mn, struct mm_struct *mm,
> +				unsigned long start, unsigned long end,
> +				bool clear, unsigned long *bitmap);

This should have a comment like the others. Callback wants to know how
to implement it.

Could add a _range on it as well while you're here, to correct that
inconsistency.

> +
>  	/*
>  	 * change_pte is called in cases that pte mapping to page is changed:
>  	 * for example, when ksm remaps pte to point to a new shared page.
> @@ -392,6 +398,9 @@ extern int __mmu_notifier_clear_young(struct mm_struct *mm,
>  				      unsigned long end);
>  extern int __mmu_notifier_test_young(struct mm_struct *mm,
>  				     unsigned long address);
> +extern int __mmu_notifier_test_clear_young(struct mm_struct *mm,
> +					   unsigned long start, unsigned long end,
> +					   bool clear, unsigned long *bitmap);
>  extern void __mmu_notifier_change_pte(struct mm_struct *mm,
>  				      unsigned long address, pte_t pte);
>  extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r);
> @@ -440,6 +449,35 @@ static inline int mmu_notifier_test_young(struct mm_struct *mm,
>  	return 0;
>  }
>  
> +/*
> + * mmu_notifier_test_clear_young() returns nonzero if any of the KVM PTEs within
> + * a given range was young. Specifically, it returns MMU_NOTIFIER_RANGE_LOCKLESS
> + * if the fast path was successful, MMU_NOTIFIER_RANGE_YOUNG otherwise.
> + *
> + * The last parameter to the function is a bitmap and only the fast path
> + * supports it: if it is NULL, the function falls back to the slow path if the
> + * fast path was unsuccessful; otherwise, the function bails out.

Then if it was NULL, you would just not populate it. Minmize differences
and cases for the caller/implementations.

> + *
> + * The bitmap has the following specifications:
> + * 1. The number of bits should be at least (end-start)/PAGE_SIZE.
> + * 2. The offset of each bit should be relative to the end, i.e., the offset
> + *    corresponding to addr should be (end-addr)/PAGE_SIZE-1. This is convenient
> + *    for batching while forward looping.
> + *
> + * When testing, this function sets the corresponding bit in the bitmap for each
> + * young KVM PTE. When clearing, this function clears the accessed bit for each
> + * young KVM PTE whose corresponding bit in the bitmap is set.

I think this is over-designed as a first pass. The secondary MMU should
just implement the call always. If it can't do it locklessly, then just
do individual lookups. If the benefit is in the batching as you say then
the locked version will get similar benefit. Possibly more because locks
like some amount of batching when contended.

I think that would reduce some concerns about cases of secondary MMUs
that do not not support the lockless version yet, and avoid
proliferation of code paths by platform.

_If_ that was insufficient then I would like to see numbers and profiles
and incremental patch to expose more complexity like this.

Also mmu notifier code should say nothing about KVM PTEs or use kvm
names in any code or comments either. "if the page was accessed via the
secondary MMU" or similar is mutually understandable to KVM and mm
developers.

> @@ -880,6 +887,72 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
>  					     kvm_test_age_gfn);
>  }
>  
> +struct test_clear_young_args {
> +	unsigned long *bitmap;
> +	unsigned long end;
> +	bool clear;
> +	bool young;
> +};
> +
> +bool kvm_should_clear_young(struct kvm_gfn_range *range, gfn_t gfn)
> +{
> +	struct test_clear_young_args *args = range->args;
> +
> +	VM_WARN_ON_ONCE(gfn < range->start || gfn >= range->end);
> +
> +	args->young = true;
> +
> +	if (args->bitmap) {
> +		int offset = hva_to_gfn_memslot(args->end - 1, range->slot) - gfn;
> +
> +		if (args->clear)
> +			return test_bit(offset, args->bitmap);
> +
> +		__set_bit(offset, args->bitmap);
> +	}
> +
> +	return args->clear;
> +}

I don't quite understnd what's going on here. This is actually the
function that notes the young pte, despite its name suggesting it is
only a query.

Shouldn't it set the bitmap bit even in the clear case? And why is it
testing at all? Oh, it seems to be some strange mix of test *or* clear
young. With the bitmap being a predicate in some cases for the clear
case.

This is a fairly confusing multi-modal API then. I think it should
take 2 bitmaps, one is the young bitmap and the other is the predicate
bitmap, and either/or can be NULL.

Also this kvm_should_clear_young helper is clunky and misnamed. If you
just provided an inline helper to get test_clear_young bitmap offset
from gfn, then set/clear bit in the caller is quite trivial.

> +
> +static int kvm_mmu_notifier_test_clear_young(struct mmu_notifier *mn, struct mm_struct *mm,
> +					     unsigned long start, unsigned long end,
> +					     bool clear, unsigned long *bitmap)
> +{
> +	struct kvm *kvm = mmu_notifier_to_kvm(mn);
> +	struct kvm_hva_range range = {
> +		.start		= start,
> +		.end		= end,
> +		.on_lock	= (void *)kvm_null_fn,
> +		.on_unlock	= (void *)kvm_null_fn,
> +	};
> +
> +	trace_kvm_age_hva(start, end);
> +
> +	if (kvm_arch_has_test_clear_young()) {
> +		struct test_clear_young_args args = {
> +			.bitmap	= bitmap,
> +			.end	= end,
> +			.clear	= clear,
> +		};
> +
> +		range.args = &args;
> +		range.lockless = true;
> +		range.handler = kvm_arch_test_clear_young;
> +
> +		if (!__kvm_handle_hva_range(kvm, &range))
> +			return args.young ? MMU_NOTIFIER_RANGE_LOCKLESS : 0;
> +	}
> +
> +	if (bitmap)
> +		return 0;
> +
> +	range.args = NULL;
> +	range.lockless = false;
> +	range.handler = clear ? kvm_age_gfn : kvm_test_age_gfn;

Minor thing, but KVM's "young" handling has been called "age" until now.
Any reason not to stick with that theme?

Thanks,
Nick
diff mbox series

Patch

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0e571e973bc2..374262545f96 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -258,6 +258,7 @@  int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #ifdef KVM_ARCH_WANT_MMU_NOTIFIER
 struct kvm_gfn_range {
 	struct kvm_memory_slot *slot;
+	void *args;
 	gfn_t start;
 	gfn_t end;
 	pte_t pte;
@@ -267,6 +268,27 @@  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_should_clear_young(struct kvm_gfn_range *range, gfn_t gfn);
+bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range);
+#endif
+
+/*
+ * Architectures that implement kvm_arch_test_clear_young() should override
+ * kvm_arch_has_test_clear_young().
+ *
+ * kvm_arch_has_test_clear_young() is allowed to return false positive, i.e., it
+ * can return true if kvm_arch_test_clear_young() is supported but disabled due
+ * to some runtime constraint. In this case, kvm_arch_test_clear_young() should
+ * return true; otherwise, it should return false.
+ *
+ * For each young KVM PTE, kvm_arch_test_clear_young() should call
+ * kvm_should_clear_young() to decide whether to clear the accessed bit.
+ */
+#ifndef kvm_arch_has_test_clear_young
+static inline bool kvm_arch_has_test_clear_young(void)
+{
+	return false;
+}
 #endif
 
 enum {
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 64a3e051c3c4..dfdbb370682d 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -60,6 +60,8 @@  enum mmu_notifier_event {
 };
 
 #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
+#define MMU_NOTIFIER_RANGE_LOCKLESS	(1 << 1)
+#define MMU_NOTIFIER_RANGE_YOUNG	(1 << 2)
 
 struct mmu_notifier_ops {
 	/*
@@ -122,6 +124,10 @@  struct mmu_notifier_ops {
 			  struct mm_struct *mm,
 			  unsigned long address);
 
+	int (*test_clear_young)(struct mmu_notifier *mn, struct mm_struct *mm,
+				unsigned long start, unsigned long end,
+				bool clear, unsigned long *bitmap);
+
 	/*
 	 * change_pte is called in cases that pte mapping to page is changed:
 	 * for example, when ksm remaps pte to point to a new shared page.
@@ -392,6 +398,9 @@  extern int __mmu_notifier_clear_young(struct mm_struct *mm,
 				      unsigned long end);
 extern int __mmu_notifier_test_young(struct mm_struct *mm,
 				     unsigned long address);
+extern int __mmu_notifier_test_clear_young(struct mm_struct *mm,
+					   unsigned long start, unsigned long end,
+					   bool clear, unsigned long *bitmap);
 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
 				      unsigned long address, pte_t pte);
 extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r);
@@ -440,6 +449,35 @@  static inline int mmu_notifier_test_young(struct mm_struct *mm,
 	return 0;
 }
 
+/*
+ * mmu_notifier_test_clear_young() returns nonzero if any of the KVM PTEs within
+ * a given range was young. Specifically, it returns MMU_NOTIFIER_RANGE_LOCKLESS
+ * if the fast path was successful, MMU_NOTIFIER_RANGE_YOUNG otherwise.
+ *
+ * The last parameter to the function is a bitmap and only the fast path
+ * supports it: if it is NULL, the function falls back to the slow path if the
+ * fast path was unsuccessful; otherwise, the function bails out.
+ *
+ * The bitmap has the following specifications:
+ * 1. The number of bits should be at least (end-start)/PAGE_SIZE.
+ * 2. The offset of each bit should be relative to the end, i.e., the offset
+ *    corresponding to addr should be (end-addr)/PAGE_SIZE-1. This is convenient
+ *    for batching while forward looping.
+ *
+ * When testing, this function sets the corresponding bit in the bitmap for each
+ * young KVM PTE. When clearing, this function clears the accessed bit for each
+ * young KVM PTE whose corresponding bit in the bitmap is set.
+ */
+static inline int mmu_notifier_test_clear_young(struct mm_struct *mm,
+						unsigned long start, unsigned long end,
+						bool clear, unsigned long *bitmap)
+{
+	if (mm_has_notifiers(mm))
+		return __mmu_notifier_test_clear_young(mm, start, end, clear, bitmap);
+
+	return 0;
+}
+
 static inline void mmu_notifier_change_pte(struct mm_struct *mm,
 					   unsigned long address, pte_t pte)
 {
@@ -684,12 +722,26 @@  static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
 	return 0;
 }
 
+static inline int mmu_notifier_clear_young(struct mm_struct *mm,
+					   unsigned long start,
+					   unsigned long end)
+{
+	return 0;
+}
+
 static inline int mmu_notifier_test_young(struct mm_struct *mm,
 					  unsigned long address)
 {
 	return 0;
 }
 
+static inline int mmu_notifier_test_clear_young(struct mm_struct *mm,
+						unsigned long start, unsigned long end,
+						bool clear, unsigned long *bitmap)
+{
+	return 0;
+}
+
 static inline void mmu_notifier_change_pte(struct mm_struct *mm,
 					   unsigned long address, pte_t pte)
 {
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 50c0dde1354f..7e6aba4bddcb 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -424,6 +424,30 @@  int __mmu_notifier_test_young(struct mm_struct *mm,
 	return young;
 }
 
+int __mmu_notifier_test_clear_young(struct mm_struct *mm,
+				    unsigned long start, unsigned long end,
+				    bool clear, unsigned long *bitmap)
+{
+	int idx;
+	struct mmu_notifier *mn;
+	int young = 0;
+
+	idx = srcu_read_lock(&srcu);
+
+	hlist_for_each_entry_srcu(mn, &mm->notifier_subscriptions->list, hlist,
+				  srcu_read_lock_held(&srcu)) {
+		if (mn->ops->test_clear_young)
+			young |= mn->ops->test_clear_young(mn, mm, start, end, clear, bitmap);
+
+		if (young && !clear)
+			break;
+	}
+
+	srcu_read_unlock(&srcu, idx);
+
+	return young;
+}
+
 void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
 			       pte_t pte)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 51e4882d0873..31ee58754b19 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -541,6 +541,7 @@  typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
 typedef void (*on_unlock_fn_t)(struct kvm *kvm);
 
 struct kvm_hva_range {
+	void *args;
 	unsigned long start;
 	unsigned long end;
 	pte_t pte;
@@ -549,6 +550,7 @@  struct kvm_hva_range {
 	on_unlock_fn_t on_unlock;
 	bool flush_on_ret;
 	bool may_block;
+	bool lockless;
 };
 
 /*
@@ -602,6 +604,8 @@  static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 			hva_end = min(range->end, slot->userspace_addr +
 						  (slot->npages << PAGE_SHIFT));
 
+			gfn_range.args = range->args;
+
 			/*
 			 * To optimize for the likely case where the address
 			 * range is covered by zero or one memslots, don't
@@ -619,7 +623,7 @@  static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
 			gfn_range.slot = slot;
 
-			if (!locked) {
+			if (!range->lockless && !locked) {
 				locked = true;
 				KVM_MMU_LOCK(kvm);
 				if (!IS_KVM_NULL_FN(range->on_lock))
@@ -628,6 +632,9 @@  static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 					break;
 			}
 			ret |= range->handler(kvm, &gfn_range);
+
+			if (range->lockless && ret)
+				break;
 		}
 	}
 
@@ -880,6 +887,72 @@  static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 					     kvm_test_age_gfn);
 }
 
+struct test_clear_young_args {
+	unsigned long *bitmap;
+	unsigned long end;
+	bool clear;
+	bool young;
+};
+
+bool kvm_should_clear_young(struct kvm_gfn_range *range, gfn_t gfn)
+{
+	struct test_clear_young_args *args = range->args;
+
+	VM_WARN_ON_ONCE(gfn < range->start || gfn >= range->end);
+
+	args->young = true;
+
+	if (args->bitmap) {
+		int offset = hva_to_gfn_memslot(args->end - 1, range->slot) - gfn;
+
+		if (args->clear)
+			return test_bit(offset, args->bitmap);
+
+		__set_bit(offset, args->bitmap);
+	}
+
+	return args->clear;
+}
+
+static int kvm_mmu_notifier_test_clear_young(struct mmu_notifier *mn, struct mm_struct *mm,
+					     unsigned long start, unsigned long end,
+					     bool clear, unsigned long *bitmap)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	struct kvm_hva_range range = {
+		.start		= start,
+		.end		= end,
+		.on_lock	= (void *)kvm_null_fn,
+		.on_unlock	= (void *)kvm_null_fn,
+	};
+
+	trace_kvm_age_hva(start, end);
+
+	if (kvm_arch_has_test_clear_young()) {
+		struct test_clear_young_args args = {
+			.bitmap	= bitmap,
+			.end	= end,
+			.clear	= clear,
+		};
+
+		range.args = &args;
+		range.lockless = true;
+		range.handler = kvm_arch_test_clear_young;
+
+		if (!__kvm_handle_hva_range(kvm, &range))
+			return args.young ? MMU_NOTIFIER_RANGE_LOCKLESS : 0;
+	}
+
+	if (bitmap)
+		return 0;
+
+	range.args = NULL;
+	range.lockless = false;
+	range.handler = clear ? kvm_age_gfn : kvm_test_age_gfn;
+
+	return __kvm_handle_hva_range(kvm, &range) ? MMU_NOTIFIER_RANGE_YOUNG : 0;
+}
+
 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 				     struct mm_struct *mm)
 {
@@ -898,6 +971,7 @@  static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
 	.clear_young		= kvm_mmu_notifier_clear_young,
 	.test_young		= kvm_mmu_notifier_test_young,
+	.test_clear_young	= kvm_mmu_notifier_test_clear_young,
 	.change_pte		= kvm_mmu_notifier_change_pte,
 	.release		= kvm_mmu_notifier_release,
 };