
[2/3] KVM: selftests: Collect memory access latency samples

Message ID 20221115173258.2530923-3-coltonlewis@google.com (mailing list archive)
State New, archived
Series Calculate memory access latency stats

Commit Message

Colton Lewis Nov. 15, 2022, 5:32 p.m. UTC
Collect memory access latency measured in clock cycles.

This introduces a dependency on the timers for ARM and x86. No other
architectures are implemented and their samples will all be 0.

Because keeping all samples is impractical due to the space required
in some cases (pooled memory w/ 64 vcpus would be 64 GB/vcpu * 64
vcpus * 250,000 samples/GB * 8 bytes/sample ~8 GB extra memory just
for samples), reservoir sampling is used to only keep a small number
of samples per vcpu (1000 samples in this patch).

Reservoir sampling means that despite keeping only a small number of
samples, each sample has an equal chance of making it into the
reservoir. Simple proofs of this can be found online. This makes the
reservoir a good representation of the distribution of samples and
enables calculation of reasonably accurate percentiles.
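
For illustration, the core of the technique looks like this (a
hypothetical helper sketching the algorithm, not the code in this
patch):

	/*
	 * Keep a uniformly random subset of up to nr_kept samples from a
	 * stream, where i counts the samples seen so far (0-based).
	 */
	static void reservoir_add(uint64_t *reservoir, uint32_t nr_kept,
				  uint64_t sample, uint32_t i,
				  struct guest_random_state *rs)
	{
		uint32_t slot;

		if (i < nr_kept) {
			/* The first nr_kept samples are always kept. */
			reservoir[i] = sample;
			return;
		}
		/* Later samples replace a slot with probability nr_kept / (i + 1). */
		slot = guest_random_u32(rs) % (i + 1);
		if (slot < nr_kept)
			reservoir[slot] = sample;
	}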

All samples are stored in a statically allocated flat array for ease
of combining them later. Samples are stored at an offset in this array
calculated from the vcpu index (so vcpu 5, sample 10 is stored at
address latency_samples + SAMPLES_PER_VCPU * 5 + 10).
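
In code terms the layout is just (a sketch using the patch's names;
cycles stands in for the measured delta):

	uint64_t *vcpu_samples = latency_samples + SAMPLES_PER_VCPU * vcpu_idx;

	vcpu_samples[10] = cycles;	/* sample 10 of vcpu 5 when vcpu_idx == 5 */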

Signed-off-by: Colton Lewis <coltonlewis@google.com>
---
 .../selftests/kvm/lib/perf_test_util.c        | 34 +++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

Comments

Ricardo Koller Jan. 17, 2023, 8:43 p.m. UTC | #1
On Tue, Nov 15, 2022 at 05:32:57PM +0000, Colton Lewis wrote:
> Collect memory access latency measured in clock cycles.
> 
> This introduces a dependency on the timers for ARM and x86. No other
> architectures are implemented and their samples will all be 0.
> 
> Because keeping all samples is impractical due to the space required
> in some cases (pooled memory w/ 64 vcpus would be 64 GB/vcpu * 64
> vcpus * 250,000 samples/GB * 8 bytes/sample ~ 8 Gb extra memory just
> for samples), resevior sampling is used to only keep a small number of

nit: reservoir

> samples per vcpu (1000 samples in this patch).

Didn't see this before my previous comment. But I guess it still
applies: isn't it possible to know the number of events to store, to
avoid the "100" obtained via trial and error?

> 
> Resevoir sampling means despite keeping only a small number of
> samples, each sample has an equal chance of making it to the
> resevoir. Simple proofs of this can be found online. This makes the
> resevoir a good representation of the distribution of samples and

reservoir

> enables calculation of reasonably accurate percentiles.
> 
> All samples are stored in a statically allocated flat array for ease
> of combining them later. Samples are stored at an offset in this array
> calculated by the vcpu index (so vcpu 5 sample 10 would be stored at
> address sample_times + 5 * vcpu_idx + 10).
> 
> Signed-off-by: Colton Lewis <coltonlewis@google.com>
> ---
>  .../selftests/kvm/lib/perf_test_util.c        | 34 +++++++++++++++++--
>  1 file changed, 32 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c
> index a48904b64e19..0311da76bae0 100644
> --- a/tools/testing/selftests/kvm/lib/perf_test_util.c
> +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
> @@ -4,6 +4,9 @@
>   */
>  #include <inttypes.h>
>  
> +#if defined(__aarch64__)
> +#include "aarch64/arch_timer.h"
> +#endif
>  #include "kvm_util.h"
>  #include "perf_test_util.h"
>  #include "processor.h"
> @@ -44,6 +47,18 @@ static struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
>  /* Store all samples in a flat array so they can be easily sorted later. */
>  uint64_t latency_samples[SAMPLE_CAPACITY];
>  
> +static uint64_t perf_test_timer_read(void)
> +{
> +#if defined(__aarch64__)
> +	return timer_get_cntct(VIRTUAL);
> +#elif defined(__x86_64__)
> +	return rdtsc();
> +#else
> +#warning __func__ " is not implemented for this architecture, will return 0"
> +	return 0;
> +#endif
> +}
> +
>  /*
>   * Continuously write to the first 8 bytes of each page in the
>   * specified region.
> @@ -59,6 +74,10 @@ void perf_test_guest_code(uint32_t vcpu_idx)
>  	int i;
>  	struct guest_random_state rand_state =
>  		new_guest_random_state(pta->random_seed + vcpu_idx);
> +	uint64_t *latency_samples_offset = latency_samples + SAMPLES_PER_VCPU * vcpu_idx;
> +	uint64_t count_before;
> +	uint64_t count_after;
> +	uint32_t maybe_sample;
>  
>  	gva = vcpu_args->gva;
>  	pages = vcpu_args->pages;
> @@ -75,10 +94,21 @@ void perf_test_guest_code(uint32_t vcpu_idx)
>  
>  			addr = gva + (page * pta->guest_page_size);
>  
> -			if (guest_random_u32(&rand_state) % 100 < pta->write_percent)
> +			if (guest_random_u32(&rand_state) % 100 < pta->write_percent) {
> +				count_before = perf_test_timer_read();
>  				*(uint64_t *)addr = 0x0123456789ABCDEF;
> -			else
> +				count_after = perf_test_timer_read();
> +			} else {
> +				count_before = perf_test_timer_read();
>  				READ_ONCE(*(uint64_t *)addr);
> +				count_after = perf_test_timer_read();

"count_before ... ACCESS count_after" could be moved to some macro,
e.g.,:
	t = MEASURE(READ_ONCE(*(uint64_t *)addr));

> +			}
> +
> +			maybe_sample = guest_random_u32(&rand_state) % (i + 1);
> +			if (i < SAMPLES_PER_VCPU)
> +				latency_samples_offset[i] = count_after - count_before;
> +			else if (maybe_sample < SAMPLES_PER_VCPU)
> +				latency_samples_offset[maybe_sample] = count_after - count_before;

I would prefer these reservoir sampling details to be in a helper, 
e.g.,:
	reservoir_sample_record(t, i);

>  		}
>  
>  		GUEST_SYNC(1);
> -- 
> 2.38.1.431.g37b22c650d-goog
>
Ricardo Koller Jan. 17, 2023, 8:48 p.m. UTC | #2
On Tue, Nov 15, 2022 at 05:32:57PM +0000, Colton Lewis wrote:
> Collect memory access latency measured in clock cycles.
> 
> This introduces a dependency on the timers for ARM and x86. No other
> architectures are implemented and their samples will all be 0.
> 
> Because keeping all samples is impractical due to the space required
> in some cases (pooled memory w/ 64 vcpus would be 64 GB/vcpu * 64
> vcpus * 250,000 samples/GB * 8 bytes/sample ~ 8 Gb extra memory just
> for samples), resevior sampling is used to only keep a small number of
> samples per vcpu (1000 samples in this patch).
> 
> Resevoir sampling means despite keeping only a small number of

Could you add a reference? I'm trying to understand the algorithm, but
I'm not even sure which version is implemented ("Optimal: Algorithm
L"?).

> samples, each sample has an equal chance of making it to the
> resevoir. Simple proofs of this can be found online. This makes the
> resevoir a good representation of the distribution of samples and
> enables calculation of reasonably accurate percentiles.
> 
> All samples are stored in a statically allocated flat array for ease
> of combining them later. Samples are stored at an offset in this array
> calculated by the vcpu index (so vcpu 5 sample 10 would be stored at
> address sample_times + 5 * vcpu_idx + 10).
> 
> Signed-off-by: Colton Lewis <coltonlewis@google.com>
> ---
>  .../selftests/kvm/lib/perf_test_util.c        | 34 +++++++++++++++++--
>  1 file changed, 32 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c
> index a48904b64e19..0311da76bae0 100644
> --- a/tools/testing/selftests/kvm/lib/perf_test_util.c
> +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
> @@ -4,6 +4,9 @@
>   */
>  #include <inttypes.h>
>  
> +#if defined(__aarch64__)
> +#include "aarch64/arch_timer.h"
> +#endif
>  #include "kvm_util.h"
>  #include "perf_test_util.h"
>  #include "processor.h"
> @@ -44,6 +47,18 @@ static struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
>  /* Store all samples in a flat array so they can be easily sorted later. */
>  uint64_t latency_samples[SAMPLE_CAPACITY];
>  
> +static uint64_t perf_test_timer_read(void)
> +{
> +#if defined(__aarch64__)
> +	return timer_get_cntct(VIRTUAL);
> +#elif defined(__x86_64__)
> +	return rdtsc();
> +#else
> +#warning __func__ " is not implemented for this architecture, will return 0"
> +	return 0;
> +#endif
> +}
> +
>  /*
>   * Continuously write to the first 8 bytes of each page in the
>   * specified region.
> @@ -59,6 +74,10 @@ void perf_test_guest_code(uint32_t vcpu_idx)
>  	int i;
>  	struct guest_random_state rand_state =
>  		new_guest_random_state(pta->random_seed + vcpu_idx);
> +	uint64_t *latency_samples_offset = latency_samples + SAMPLES_PER_VCPU * vcpu_idx;
> +	uint64_t count_before;
> +	uint64_t count_after;
> +	uint32_t maybe_sample;
>  
>  	gva = vcpu_args->gva;
>  	pages = vcpu_args->pages;
> @@ -75,10 +94,21 @@ void perf_test_guest_code(uint32_t vcpu_idx)
>  
>  			addr = gva + (page * pta->guest_page_size);
>  
> -			if (guest_random_u32(&rand_state) % 100 < pta->write_percent)
> +			if (guest_random_u32(&rand_state) % 100 < pta->write_percent) {
> +				count_before = perf_test_timer_read();
>  				*(uint64_t *)addr = 0x0123456789ABCDEF;
> -			else
> +				count_after = perf_test_timer_read();
> +			} else {
> +				count_before = perf_test_timer_read();
>  				READ_ONCE(*(uint64_t *)addr);
> +				count_after = perf_test_timer_read();
> +			}
> +
> +			maybe_sample = guest_random_u32(&rand_state) % (i + 1);
> +			if (i < SAMPLES_PER_VCPU)
> +				latency_samples_offset[i] = count_after - count_before;
> +			else if (maybe_sample < SAMPLES_PER_VCPU)
> +				latency_samples_offset[maybe_sample] = count_after - count_before;
>  		}
>  
>  		GUEST_SYNC(1);
> -- 
> 2.38.1.431.g37b22c650d-goog
>
Sean Christopherson Jan. 18, 2023, 4:32 p.m. UTC | #3
On Tue, Jan 17, 2023, Ricardo Koller wrote:
> On Tue, Nov 15, 2022 at 05:32:57PM +0000, Colton Lewis wrote:
> > @@ -44,6 +47,18 @@ static struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
> >  /* Store all samples in a flat array so they can be easily sorted later. */
> >  uint64_t latency_samples[SAMPLE_CAPACITY];
> >  
> > +static uint64_t perf_test_timer_read(void)
> > +{
> > +#if defined(__aarch64__)
> > +	return timer_get_cntct(VIRTUAL);
> > +#elif defined(__x86_64__)
> > +	return rdtsc();
> > +#else
> > +#warning __func__ " is not implemented for this architecture, will return 0"
> > +	return 0;
> > +#endif
> > +}

I would prefer to put the guest-side timer helpers into common code, e.g. as
guest_read_system_counter(), replacing system_counter_offset_test.c's one-off
version.
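
Something along these lines, perhaps (a sketch of the suggestion above;
the exact header and the arch #includes are elided):

	static inline uint64_t guest_read_system_counter(void)
	{
	#if defined(__aarch64__)
		return timer_get_cntct(VIRTUAL);
	#elif defined(__x86_64__)
		return rdtsc();
	#else
		/* Not implemented; callers will see all-zero deltas. */
		return 0;
	#endif
	}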

> >  /*
> >   * Continuously write to the first 8 bytes of each page in the
> >   * specified region.
> > @@ -59,6 +74,10 @@ void perf_test_guest_code(uint32_t vcpu_idx)
> >  	int i;
> >  	struct guest_random_state rand_state =
> >  		new_guest_random_state(pta->random_seed + vcpu_idx);
> > +	uint64_t *latency_samples_offset = latency_samples + SAMPLES_PER_VCPU * vcpu_idx;

"offset" is confusing because the system counter (TSC in x86) has an offset for
the guest-perceived value.  Maybe just "latencies"?

> > +	uint64_t count_before;
> > +	uint64_t count_after;

Maybe s/count/time?  Yeah, it's technically wrong to call it "time", but "count"
is too generic.

> > +	uint32_t maybe_sample;
> >  
> >  	gva = vcpu_args->gva;
> >  	pages = vcpu_args->pages;
> > @@ -75,10 +94,21 @@ void perf_test_guest_code(uint32_t vcpu_idx)
> >  
> >  			addr = gva + (page * pta->guest_page_size);
> >  
> > -			if (guest_random_u32(&rand_state) % 100 < pta->write_percent)
> > +			if (guest_random_u32(&rand_state) % 100 < pta->write_percent) {
> > +				count_before = perf_test_timer_read();
> >  				*(uint64_t *)addr = 0x0123456789ABCDEF;
> > -			else
> > +				count_after = perf_test_timer_read();
> > +			} else {
> > +				count_before = perf_test_timer_read();
> >  				READ_ONCE(*(uint64_t *)addr);
> > +				count_after = perf_test_timer_read();
> 
> "count_before ... ACCESS count_after" could be moved to some macro,
> e.g.,:
> 	t = MEASURE(READ_ONCE(*(uint64_t *)addr));

Even better, capture the read vs. write in a local variable to self-document the
use of the RNG, then the motivation for reading the system counter inside the
if/else-statements goes away.  That way we don't need to come up with a name
that documents what MEASURE() measures.

			write = guest_random_u32(&rand_state) % 100 < args->write_percent;

			time_before = guest_system_counter_read();
			if (write)
				*(uint64_t *)addr = 0x0123456789ABCDEF;
			else
				READ_ONCE(*(uint64_t *)addr);
			time_after = guest_system_counter_read();

> > +			}
> > +
> > +			maybe_sample = guest_random_u32(&rand_state) % (i + 1);

No need to generate a random number for iterations that always sample.  And I
think it will make the code easier to follow if there is a single write to the
array.  The derivation of the index is what's interesting and different, we should
use code to highlight that.

			/*
			 * Always sample early iterations to ensure at least the
			 * number of requested samples is collected.  Once the
			 * array has been filled, <here is a comment from Colton
			 * briefly explaining the math>.
			 */
			if (i < SAMPLES_PER_VCPU)
				idx = i;
			else
				idx = guest_random_u32(&rand_state) % (i + 1);

			if (idx < SAMPLES_PER_VCPU)
				latencies[idx] = time_after - time_before;

> > +			if (i < SAMPLES_PER_VCPU)

Would it make sense to let the user configure the number of samples?  Seems easy
enough and would let the user ultimately decide how much memory to burn on samples.

> > +				latency_samples_offset[i] = count_after - count_before;
> > +			else if (maybe_sample < SAMPLES_PER_VCPU)
> > +				latency_samples_offset[maybe_sample] = count_after - count_before;
> 
> I would prefer these reservoir sampling details to be in a helper, 
> e.g.,:
> 	reservoir_sample_record(t, i);

Heh, I vote to open code the behavior.  I dislike fancy names that hide relatively
simple logic.  IMO, readers won't care how the modulo math provides even distribution,
just that it does and that the early iterations always sample to ensure every bucket
is filled.

In this case, I can pretty much guarantee that I'd end up spending more time
digging into what "reservoir" means than I would to understand the basic flow.
Ricardo Koller Jan. 26, 2023, 6:30 p.m. UTC | #4
On Thu, Jan 26, 2023 at 9:58 AM Colton Lewis <coltonlewis@google.com> wrote:
>
> Ricardo Koller <ricarkol@google.com> writes:
>
>
> > nit: reservoir
>
>
> Will fix
>
>
> > Didn't see this before my previous comment. But, I guess it still
> > applies: isn't it possible to know the number of events to store?  to
> > avoid the "100" obtained via trial and error.
>
>
> That's what I thought I was calculating with sample_pages =
> sizeof(latency_samples) / pta->guest_page_size. Both values are in bytes
> so that should give the number of guest pages I need to allocate to hold
> latency_samples. The 100 is a fudge factor added to the calculation
> after encountering crashes. Any idea why the math above is incorrect?
>

Mm, I don't know to be honest. But I do think it's required to calculate
the exact number of bytes needed. Otherwise, 100 might not be enough for
a bigger VM with more samples.
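
FWIW, rounding the division up instead of adding a fudge factor should
make it exact (a sketch; untested):

	/* Pages needed to back the sample array, rounding up. */
	sample_pages = (sizeof(latency_samples) + pta->guest_page_size - 1) /
		       pta->guest_page_size;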

>
> > reservoir
>
>
> Will fix.
Sean Christopherson Jan. 26, 2023, 7:07 p.m. UTC | #5
On Thu, Jan 26, 2023, Colton Lewis wrote:
> Sean Christopherson <seanjc@google.com> writes:
> > Maybe s/count/time?  Yeah, it's technically wrong to call it "time", but
> > "count" is too generic.
> 
> I could say "cycles".

Works for me.

> > > > +	uint32_t maybe_sample;
> > > >
> > > >  	gva = vcpu_args->gva;
> > > >  	pages = vcpu_args->pages;
> > > > @@ -75,10 +94,21 @@ void perf_test_guest_code(uint32_t vcpu_idx)
> > > >
> > > >  			addr = gva + (page * pta->guest_page_size);
> > > >
> > > > -			if (guest_random_u32(&rand_state) % 100 < pta->write_percent)
> > > > +			if (guest_random_u32(&rand_state) % 100 < pta->write_percent) {
> > > > +				count_before = perf_test_timer_read();
> > > >  				*(uint64_t *)addr = 0x0123456789ABCDEF;
> > > > -			else
> > > > +				count_after = perf_test_timer_read();
> > > > +			} else {
> > > > +				count_before = perf_test_timer_read();
> > > >  				READ_ONCE(*(uint64_t *)addr);
> > > > +				count_after = perf_test_timer_read();
> 
> > > "count_before ... ACCESS count_after" could be moved to some macro,
> > > e.g.,:
> > > 	t = MEASURE(READ_ONCE(*(uint64_t *)addr));
> 
> > Even better, capture the read vs. write in a local variable to
> > self-document the
> > use of the RNG, then the motivation for reading the system counter
> > inside the
> > if/else-statements goes away.  That way we don't need to come up with a
> > name
> > that documents what MEASURE() measures.
> 
> > 			write = guest_random_u32(&rand_state) % 100 < args->write_percent;
> 
> > 			time_before = guest_system_counter_read();
> > 			if (write)
> > 				*(uint64_t *)addr = 0x0123456789ABCDEF;
> > 			else
> > 				READ_ONCE(*(uint64_t *)addr);
> > 			time_after = guest_system_counter_read();
> 
> Couldn't timing before and after the if statement produce bad
> measurements? We might be including a branch mispredict in our memory
> access latency and this could happen a lot because it's random so no way
> for the CPU to predict.

Hmm, I was assuming the latency due to a (mispredicted) branch would be in the
noise compared to the latency of a VM-Exit needed to handle the fault.

On the other hand, adding a common macro would be trivial, it's only the naming
that's hard.  What if we keep with the "cycles" theme and do as Ricardo suggested?
E.g. minus backslashes, this doesn't look awful.

#define MEASURE_CYCLES(x)
({
	uint64_t start;

	start = guest_system_counter_read();
	x;
	guest_system_counter_read() - start;
})
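
At the call sites that would reduce to something like:

	write = guest_random_u32(&rand_state) % 100 < pta->write_percent;

	if (write)
		cycles = MEASURE_CYCLES(*(uint64_t *)addr = 0x0123456789ABCDEF);
	else
		cycles = MEASURE_CYCLES(READ_ONCE(*(uint64_t *)addr));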



> > > > +			if (i < SAMPLES_PER_VCPU)
> 
> > Would it make sense to let the user configure the number of samples?
> > Seems easy enough and would let the user ultimately decide how much memory
> > to burn on samples.
> 
> Theoretically users may wish to tweak the accuracy vs memory use
> tradeoff. Seemed like a shaky value proposition to me because of
> diminishing returns to increased accuracy, but I will include an option
> if you insist.

It's not so much that I expect users to want better accuracy, it's that I dislike
arbitrary magic numbers.  E.g. why 1000 and not 500 or 2000?  Especially since
the number of accesses _is_ controllable by the user.  Hmm, the other thing to
consider is that, as proposed, the vast majority of the capacity will go unused,
e.g. default is 4, max is 512 (and that should really be tied to KVM's max of 4096).

What if we invert the calculation and define the capacity in bytes, and derive
the number of samples based on capacity / nr_vcpus?  And then push the capacity
as high as possible, e.g. I assume there's a threshold where the test will fail
because of selftests poor page table management.  That way the sampling is as
efficient as possible, and the arbitrary capacity comes from limitations within
the framework, as opposed to a human defined magic number.
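
Concretely, the inversion might look something like this (a sketch; the
capacity constant and its value are made up):

	/* Byte budget for the sample array; derive per-vCPU samples from it. */
	#define SAMPLE_CAPACITY_BYTES	(64UL << 20)

	uint64_t samples_per_vcpu = SAMPLE_CAPACITY_BYTES /
				    (nr_vcpus * sizeof(uint64_t));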

Patch

diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c
index a48904b64e19..0311da76bae0 100644
--- a/tools/testing/selftests/kvm/lib/perf_test_util.c
+++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
@@ -4,6 +4,9 @@ 
  */
 #include <inttypes.h>
 
+#if defined(__aarch64__)
+#include "aarch64/arch_timer.h"
+#endif
 #include "kvm_util.h"
 #include "perf_test_util.h"
 #include "processor.h"
@@ -44,6 +47,18 @@  static struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
 /* Store all samples in a flat array so they can be easily sorted later. */
 uint64_t latency_samples[SAMPLE_CAPACITY];
 
+static uint64_t perf_test_timer_read(void)
+{
+#if defined(__aarch64__)
+	return timer_get_cntct(VIRTUAL);
+#elif defined(__x86_64__)
+	return rdtsc();
+#else
+#warning __func__ " is not implemented for this architecture, will return 0"
+	return 0;
+#endif
+}
+
 /*
  * Continuously write to the first 8 bytes of each page in the
  * specified region.
@@ -59,6 +74,10 @@  void perf_test_guest_code(uint32_t vcpu_idx)
 	int i;
 	struct guest_random_state rand_state =
 		new_guest_random_state(pta->random_seed + vcpu_idx);
+	uint64_t *latency_samples_offset = latency_samples + SAMPLES_PER_VCPU * vcpu_idx;
+	uint64_t count_before;
+	uint64_t count_after;
+	uint32_t maybe_sample;
 
 	gva = vcpu_args->gva;
 	pages = vcpu_args->pages;
@@ -75,10 +94,21 @@  void perf_test_guest_code(uint32_t vcpu_idx)
 
 			addr = gva + (page * pta->guest_page_size);
 
-			if (guest_random_u32(&rand_state) % 100 < pta->write_percent)
+			if (guest_random_u32(&rand_state) % 100 < pta->write_percent) {
+				count_before = perf_test_timer_read();
 				*(uint64_t *)addr = 0x0123456789ABCDEF;
-			else
+				count_after = perf_test_timer_read();
+			} else {
+				count_before = perf_test_timer_read();
 				READ_ONCE(*(uint64_t *)addr);
+				count_after = perf_test_timer_read();
+			}
+
+			maybe_sample = guest_random_u32(&rand_state) % (i + 1);
+			if (i < SAMPLES_PER_VCPU)
+				latency_samples_offset[i] = count_after - count_before;
+			else if (maybe_sample < SAMPLES_PER_VCPU)
+				latency_samples_offset[maybe_sample] = count_after - count_before;
 		}
 
 		GUEST_SYNC(1);