diff mbox series

[2/5] KVM: selftests: access_tracking_perf_test: Add option to skip the sanity check

Message ID 20250327012350.1135621-3-jthoughton@google.com (mailing list archive)
State New
Headers show
Series KVM: selftests: access_tracking_perf_test fixes for NUMA balancing and MGLRU | expand

Commit Message

James Houghton March 27, 2025, 1:23 a.m. UTC
From: Maxim Levitsky <mlevitsk@redhat.com>

Add an option to skip the sanity check of the number of still-idle
pages, and make it skip by default when a hypervisor or NUMA balancing
is detected.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
Co-developed-by: James Houghton <jthoughton@google.com>
Signed-off-by: James Houghton <jthoughton@google.com>
---
 .../selftests/kvm/access_tracking_perf_test.c | 61 ++++++++++++++++---
 .../testing/selftests/kvm/include/test_util.h |  1 +
 tools/testing/selftests/kvm/lib/test_util.c   |  7 +++
 3 files changed, 60 insertions(+), 9 deletions(-)

Comments

Maxim Levitsky March 28, 2025, 7:32 p.m. UTC | #1
On Thu, 2025-03-27 at 01:23 +0000, James Houghton wrote:
> From: Maxim Levitsky <mlevitsk@redhat.com>
> 
> Add an option to skip sanity check of number of still idle pages,
> and set it by default to skip, in case hypervisor or NUMA balancing
> is detected.
> 
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> Co-developed-by: James Houghton <jthoughton@google.com>
> Signed-off-by: James Houghton <jthoughton@google.com>
> ---
>  .../selftests/kvm/access_tracking_perf_test.c | 61 ++++++++++++++++---
>  .../testing/selftests/kvm/include/test_util.h |  1 +
>  tools/testing/selftests/kvm/lib/test_util.c   |  7 +++
>  3 files changed, 60 insertions(+), 9 deletions(-)
> 
> diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c
> index 447e619cf856e..0e594883ec13e 100644
> --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c
> +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c
> @@ -65,6 +65,16 @@ static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];
>  /* Whether to overlap the regions of memory vCPUs access. */
>  static bool overlap_memory_access;
>  
> +/*
> + * If the test should only warn if there are too many idle pages (i.e., it is
> + * expected).
> + * -1: Not yet set.
> + *  0: We do not expect too many idle pages, so FAIL if too many idle pages.
> + *  1: Having too many idle pages is expected, so merely print a warning if
> + *     too many idle pages are found.
> + */
> +static int idle_pages_warn_only = -1;
> +
>  struct test_params {
>  	/* The backing source for the region of memory. */
>  	enum vm_mem_backing_src_type backing_src;
> @@ -177,18 +187,12 @@ static void mark_vcpu_memory_idle(struct kvm_vm *vm,
>  	 * arbitrary; high enough that we ensure most memory access went through
>  	 * access tracking but low enough as to not make the test too brittle
>  	 * over time and across architectures.
> -	 *
> -	 * When running the guest as a nested VM, "warn" instead of asserting
> -	 * as the TLB size is effectively unlimited and the KVM doesn't
> -	 * explicitly flush the TLB when aging SPTEs.  As a result, more pages
> -	 * are cached and the guest won't see the "idle" bit cleared.
>  	 */
>  	if (still_idle >= pages / 10) {
> -#ifdef __x86_64__
> -		TEST_ASSERT(this_cpu_has(X86_FEATURE_HYPERVISOR),
> +		TEST_ASSERT(idle_pages_warn_only,
>  			    "vCPU%d: Too many pages still idle (%lu out of %lu)",
>  			    vcpu_idx, still_idle, pages);
> -#endif
> +
>  		printf("WARNING: vCPU%d: Too many pages still idle (%lu out of %lu), "
>  		       "this will affect performance results.\n",
>  		       vcpu_idx, still_idle, pages);
> @@ -328,6 +332,31 @@ static void run_test(enum vm_guest_mode mode, void *arg)
>  	memstress_destroy_vm(vm);
>  }
>  
> +static int access_tracking_unreliable(void)
> +{
> +#ifdef __x86_64__
> +	/*
> +	 * When running nested, the TLB size is effectively unlimited and the
> +	 * KVM doesn't explicitly flush the TLB when aging SPTEs.  As a result,
> +	 * more pages are cached and the guest won't see the "idle" bit cleared.
> +	 */
Tiny nitpick: say "nested on KVM", because on other hypervisors it might work
differently, but overall most of them probably suffer from the same problem,
so it's probably better to say something like this:

'When running nested, the TLB size might be effectively unlimited,
for example this is the case when running on top of KVM L0'


> +	if (this_cpu_has(X86_FEATURE_HYPERVISOR)) {
> +		puts("Skipping idle page count sanity check, because the test is run nested");
> +		return 1;
> +	}
> +#endif
> +	/*
> +	 * When NUMA balancing is enabled, guest memory can be mapped
> +	 * PROT_NONE, and the Accessed bits won't be queriable.

Tiny nitpick: the accessed bits in this case are lost, because KVM no longer
propagates them from the secondary to the primary paging structures, and the
secondary mapping might be lost due to zapping; the biggest offender here is
NUMA balancing.

> +	 */
> +	if (is_numa_balancing_enabled()) {
> +		puts("Skipping idle page count sanity check, because NUMA balancing is enabled");
> +		return 1;
> +	}
> +
> +	return 0;
> +}

Very good idea of extracting this logic into a function and documenting it.

> +
>  static void help(char *name)
>  {
>  	puts("");
> @@ -342,6 +371,12 @@ static void help(char *name)
>  	printf(" -v: specify the number of vCPUs to run.\n");
>  	printf(" -o: Overlap guest memory accesses instead of partitioning\n"
>  	       "     them into a separate region of memory for each vCPU.\n");
> +	printf(" -w: Control whether the test warns or fails if more than 10%\n"
> +	       "     of pages are still seen as idle/old after accessing guest\n"
> +	       "     memory.  >0 == warn only, 0 == fail, <0 == auto.  For auto\n"
> +	       "     mode, the test fails by default, but switches to warn only\n"
> +	       "     if NUMA balancing is enabled or the test detects it's running\n"
> +	       "     in a VM.\n");
>  	backing_src_help("-s");
>  	puts("");
>  	exit(0);
> @@ -359,7 +394,7 @@ int main(int argc, char *argv[])
>  
>  	guest_modes_append_default();
>  
> -	while ((opt = getopt(argc, argv, "hm:b:v:os:")) != -1) {
> +	while ((opt = getopt(argc, argv, "hm:b:v:os:w:")) != -1) {
>  		switch (opt) {
>  		case 'm':
>  			guest_modes_cmdline(optarg);
> @@ -376,6 +411,11 @@ int main(int argc, char *argv[])
>  		case 's':
>  			params.backing_src = parse_backing_src_type(optarg);
>  			break;
> +		case 'w':
> +			idle_pages_warn_only =
> +				atoi_non_negative("Idle pages warning",
> +						  optarg);
> +			break;
>  		case 'h':
>  		default:
>  			help(argv[0]);
> @@ -388,6 +428,9 @@ int main(int argc, char *argv[])
>  		       "CONFIG_IDLE_PAGE_TRACKING is not enabled");
>  	close(page_idle_fd);
>  
> +	if (idle_pages_warn_only == -1)
> +		idle_pages_warn_only = access_tracking_unreliable();
> +
>  	for_each_guest_mode(run_test, &params);
>  
>  	return 0;
> diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
> index 77d13d7920cb8..c6ef895fbd9ab 100644
> --- a/tools/testing/selftests/kvm/include/test_util.h
> +++ b/tools/testing/selftests/kvm/include/test_util.h
> @@ -153,6 +153,7 @@ bool is_backing_src_hugetlb(uint32_t i);
>  void backing_src_help(const char *flag);
>  enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
>  long get_run_delay(void);
> +bool is_numa_balancing_enabled(void);
>  
>  /*
>   * Whether or not the given source type is shared memory (as opposed to
> diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
> index 3dc8538f5d696..03eb99af9b8de 100644
> --- a/tools/testing/selftests/kvm/lib/test_util.c
> +++ b/tools/testing/selftests/kvm/lib/test_util.c
> @@ -176,6 +176,13 @@ size_t get_trans_hugepagesz(void)
>  	return get_sysfs_val("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size");
>  }
>  
> +bool is_numa_balancing_enabled(void)
> +{
> +	if (!test_sysfs_path("/proc/sys/kernel/numa_balancing"))
> +		return false;
> +	return get_sysfs_val("/proc/sys/kernel/numa_balancing") == 1;
> +}
> +
>  size_t get_def_hugetlb_pagesz(void)
>  {
>  	char buf[64];



Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>

Best regards,
	Maxim Levitsky
James Houghton March 28, 2025, 9:26 p.m. UTC | #2
On Fri, Mar 28, 2025 at 12:32 PM Maxim Levitsky <mlevitsk@redhat.com> wrote:
>
> On Thu, 2025-03-27 at 01:23 +0000, James Houghton wrote:
> > From: Maxim Levitsky <mlevitsk@redhat.com>
> >
> > Add an option to skip sanity check of number of still idle pages,
> > and set it by default to skip, in case hypervisor or NUMA balancing
> > is detected.
> >
> > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> > Co-developed-by: James Houghton <jthoughton@google.com>
> > Signed-off-by: James Houghton <jthoughton@google.com>
> > ---
> >  .../selftests/kvm/access_tracking_perf_test.c | 61 ++++++++++++++++---
> >  .../testing/selftests/kvm/include/test_util.h |  1 +
> >  tools/testing/selftests/kvm/lib/test_util.c   |  7 +++
> >  3 files changed, 60 insertions(+), 9 deletions(-)
> >
> > diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c
> > index 447e619cf856e..0e594883ec13e 100644
> > --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c
> > +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c
> > @@ -65,6 +65,16 @@ static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];
> >  /* Whether to overlap the regions of memory vCPUs access. */
> >  static bool overlap_memory_access;
> > 
> > +/*
> > + * If the test should only warn if there are too many idle pages (i.e., it is
> > + * expected).
> > + * -1: Not yet set.
> > + *  0: We do not expect too many idle pages, so FAIL if too many idle pages.
> > + *  1: Having too many idle pages is expected, so merely print a warning if
> > + *     too many idle pages are found.
> > + */
> > +static int idle_pages_warn_only = -1;
> > +
> >  struct test_params {
> >       /* The backing source for the region of memory. */
> >       enum vm_mem_backing_src_type backing_src;
> > @@ -177,18 +187,12 @@ static void mark_vcpu_memory_idle(struct kvm_vm *vm,
> >        * arbitrary; high enough that we ensure most memory access went through
> >        * access tracking but low enough as to not make the test too brittle
> >        * over time and across architectures.
> > -      *
> > -      * When running the guest as a nested VM, "warn" instead of asserting
> > -      * as the TLB size is effectively unlimited and the KVM doesn't
> > -      * explicitly flush the TLB when aging SPTEs.  As a result, more pages
> > -      * are cached and the guest won't see the "idle" bit cleared.
> >        */
> >       if (still_idle >= pages / 10) {
> > -#ifdef __x86_64__
> > -             TEST_ASSERT(this_cpu_has(X86_FEATURE_HYPERVISOR),
> > +             TEST_ASSERT(idle_pages_warn_only,
> >                           "vCPU%d: Too many pages still idle (%lu out of %lu)",
> >                           vcpu_idx, still_idle, pages);
> > -#endif
> > +
> >               printf("WARNING: vCPU%d: Too many pages still idle (%lu out of %lu), "
> >                      "this will affect performance results.\n",
> >                      vcpu_idx, still_idle, pages);
> > @@ -328,6 +332,31 @@ static void run_test(enum vm_guest_mode mode, void *arg)
> >       memstress_destroy_vm(vm);
> >  }
> > 
> > +static int access_tracking_unreliable(void)
> > +{
> > +#ifdef __x86_64__
> > +     /*
> > +      * When running nested, the TLB size is effectively unlimited and the
> > +      * KVM doesn't explicitly flush the TLB when aging SPTEs.  As a result,
> > +      * more pages are cached and the guest won't see the "idle" bit cleared.
> > +      */
> Tiny nitpick: nested on KVM, because on other hypervisors it might work differently,
> but overall most of them probably suffer from the same problem,
> so its probably better to say something like that:
>
> 'When running nested, the TLB size might be effectively unlimited,
> for example this is the case when running on top of KVM L0'

Added this clarification (see below), thanks!

This wording was left as-is from before, but I agree that it could be better.

>
>
> > +     if (this_cpu_has(X86_FEATURE_HYPERVISOR)) {
> > +             puts("Skipping idle page count sanity check, because the test is run nested");
> > +             return 1;
> > +     }
> > +#endif
> > +     /*
> > +      * When NUMA balancing is enabled, guest memory can be mapped
> > +      * PROT_NONE, and the Accessed bits won't be queriable.
>
> Tiny nitpick: the accessed bit in this case are lost, because KVM no longer propagates
> it from secondary to primary paging, and the secondary mapping might be lost due to zapping,
> and the biggest offender of this is NUMA balancing.

Reworded this bit. The current wording is indeed misleading.

> > +      */
> > +     if (is_numa_balancing_enabled()) {
> > +             puts("Skipping idle page count sanity check, because NUMA balancing is enabled");
> > +             return 1;
> > +     }
> > +
> > +     return 0;
> > +}
>
> Very good idea of extracting this logic into a function and documenting it.

:)

> Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>

Thanks, though I've already included your Signed-off-by. I'll just keep your
Signed-off-by and From:.

I'm applying the following diff for the next version:

diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c
index 0e594883ec13e..1770998c7675b 100644
--- a/tools/testing/selftests/kvm/access_tracking_perf_test.c
+++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c
@@ -336,9 +336,10 @@ static int access_tracking_unreliable(void)
 {
 #ifdef __x86_64__
 	/*
-	 * When running nested, the TLB size is effectively unlimited and the
-	 * KVM doesn't explicitly flush the TLB when aging SPTEs.  As a result,
-	 * more pages are cached and the guest won't see the "idle" bit cleared.
+	 * When running nested, the TLB size may be effectively unlimited (for
+	 * example, this is the case when running on KVM L0), and KVM doesn't
+	 * explicitly flush the TLB when aging SPTEs.  As a result, more pages
+	 * are cached and the guest won't see the "idle" bit cleared.
 	 */
 	if (this_cpu_has(X86_FEATURE_HYPERVISOR)) {
 		puts("Skipping idle page count sanity check, because the test is run nested");
@@ -346,8 +347,8 @@ static int access_tracking_unreliable(void)
 	}
 #endif
 	/*
-	 * When NUMA balancing is enabled, guest memory can be mapped
-	 * PROT_NONE, and the Accessed bits won't be queriable.
+	 * When NUMA balancing is enabled, guest memory will be unmapped to get
+	 * NUMA faults, dropping the Accessed bits.
 	 */
 	if (is_numa_balancing_enabled()) {
 		puts("Skipping idle page count sanity check, because NUMA balancing is enabled");
diff mbox series

Patch

diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c
index 447e619cf856e..0e594883ec13e 100644
--- a/tools/testing/selftests/kvm/access_tracking_perf_test.c
+++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c
@@ -65,6 +65,16 @@  static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];
 /* Whether to overlap the regions of memory vCPUs access. */
 static bool overlap_memory_access;
 
+/*
+ * If the test should only warn if there are too many idle pages (i.e., it is
+ * expected).
+ * -1: Not yet set.
+ *  0: We do not expect too many idle pages, so FAIL if too many idle pages.
+ *  1: Having too many idle pages is expected, so merely print a warning if
+ *     too many idle pages are found.
+ */
+static int idle_pages_warn_only = -1;
+
 struct test_params {
 	/* The backing source for the region of memory. */
 	enum vm_mem_backing_src_type backing_src;
@@ -177,18 +187,12 @@  static void mark_vcpu_memory_idle(struct kvm_vm *vm,
 	 * arbitrary; high enough that we ensure most memory access went through
 	 * access tracking but low enough as to not make the test too brittle
 	 * over time and across architectures.
-	 *
-	 * When running the guest as a nested VM, "warn" instead of asserting
-	 * as the TLB size is effectively unlimited and the KVM doesn't
-	 * explicitly flush the TLB when aging SPTEs.  As a result, more pages
-	 * are cached and the guest won't see the "idle" bit cleared.
 	 */
 	if (still_idle >= pages / 10) {
-#ifdef __x86_64__
-		TEST_ASSERT(this_cpu_has(X86_FEATURE_HYPERVISOR),
+		TEST_ASSERT(idle_pages_warn_only,
 			    "vCPU%d: Too many pages still idle (%lu out of %lu)",
 			    vcpu_idx, still_idle, pages);
-#endif
+
 		printf("WARNING: vCPU%d: Too many pages still idle (%lu out of %lu), "
 		       "this will affect performance results.\n",
 		       vcpu_idx, still_idle, pages);
@@ -328,6 +332,31 @@  static void run_test(enum vm_guest_mode mode, void *arg)
 	memstress_destroy_vm(vm);
 }
 
+static int access_tracking_unreliable(void)
+{
+#ifdef __x86_64__
+	/*
+	 * When running nested, the TLB size is effectively unlimited and the
+	 * KVM doesn't explicitly flush the TLB when aging SPTEs.  As a result,
+	 * more pages are cached and the guest won't see the "idle" bit cleared.
+	 */
+	if (this_cpu_has(X86_FEATURE_HYPERVISOR)) {
+		puts("Skipping idle page count sanity check, because the test is run nested");
+		return 1;
+	}
+#endif
+	/*
+	 * When NUMA balancing is enabled, guest memory can be mapped
+	 * PROT_NONE, and the Accessed bits won't be queriable.
+	 */
+	if (is_numa_balancing_enabled()) {
+		puts("Skipping idle page count sanity check, because NUMA balancing is enabled");
+		return 1;
+	}
+
+	return 0;
+}
+
 static void help(char *name)
 {
 	puts("");
@@ -342,6 +371,12 @@  static void help(char *name)
 	printf(" -v: specify the number of vCPUs to run.\n");
 	printf(" -o: Overlap guest memory accesses instead of partitioning\n"
 	       "     them into a separate region of memory for each vCPU.\n");
+	printf(" -w: Control whether the test warns or fails if more than 10%\n"
+	       "     of pages are still seen as idle/old after accessing guest\n"
+	       "     memory.  >0 == warn only, 0 == fail, <0 == auto.  For auto\n"
+	       "     mode, the test fails by default, but switches to warn only\n"
+	       "     if NUMA balancing is enabled or the test detects it's running\n"
+	       "     in a VM.\n");
 	backing_src_help("-s");
 	puts("");
 	exit(0);
@@ -359,7 +394,7 @@  int main(int argc, char *argv[])
 
 	guest_modes_append_default();
 
-	while ((opt = getopt(argc, argv, "hm:b:v:os:")) != -1) {
+	while ((opt = getopt(argc, argv, "hm:b:v:os:w:")) != -1) {
 		switch (opt) {
 		case 'm':
 			guest_modes_cmdline(optarg);
@@ -376,6 +411,11 @@  int main(int argc, char *argv[])
 		case 's':
 			params.backing_src = parse_backing_src_type(optarg);
 			break;
+		case 'w':
+			idle_pages_warn_only =
+				atoi_non_negative("Idle pages warning",
+						  optarg);
+			break;
 		case 'h':
 		default:
 			help(argv[0]);
@@ -388,6 +428,9 @@  int main(int argc, char *argv[])
 		       "CONFIG_IDLE_PAGE_TRACKING is not enabled");
 	close(page_idle_fd);
 
+	if (idle_pages_warn_only == -1)
+		idle_pages_warn_only = access_tracking_unreliable();
+
 	for_each_guest_mode(run_test, &params);
 
 	return 0;
diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
index 77d13d7920cb8..c6ef895fbd9ab 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -153,6 +153,7 @@  bool is_backing_src_hugetlb(uint32_t i);
 void backing_src_help(const char *flag);
 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
 long get_run_delay(void);
+bool is_numa_balancing_enabled(void);
 
 /*
  * Whether or not the given source type is shared memory (as opposed to
diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
index 3dc8538f5d696..03eb99af9b8de 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -176,6 +176,13 @@  size_t get_trans_hugepagesz(void)
 	return get_sysfs_val("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size");
 }
 
+bool is_numa_balancing_enabled(void)
+{
+	if (!test_sysfs_path("/proc/sys/kernel/numa_balancing"))
+		return false;
+	return get_sysfs_val("/proc/sys/kernel/numa_balancing") == 1;
+}
+
 size_t get_def_hugetlb_pagesz(void)
 {
 	char buf[64];