diff mbox series

[v2,1/2] KVM: selftests: Provide generic way to read system counter

Message ID 20230316222752.1911001-2-coltonlewis@google.com (mailing list archive)
State New, archived
Headers show
Series Calculate memory access latency stats | expand

Commit Message

Colton Lewis March 16, 2023, 10:27 p.m. UTC
Provide a generic function to read the system counter from the guest
for timing purposes. A common and important way to measure guest
performance is to measure the amount of time different actions take in
the guest. Provide also a mathematical conversion from cycles to
nanoseconds and a macro for timing individual statements.

Substitute the previous custom implementation of a similar function in
system_counter_offset_test with this new implementation.

Signed-off-by: Colton Lewis <coltonlewis@google.com>
---
 .../testing/selftests/kvm/include/kvm_util.h  | 15 ++++++++++
 tools/testing/selftests/kvm/lib/kvm_util.c    | 30 +++++++++++++++++++
 .../kvm/system_counter_offset_test.c          | 10 ++-----
 3 files changed, 47 insertions(+), 8 deletions(-)

--
2.40.0.rc1.284.g88254d51c5-goog

Comments

Andrew Jones March 17, 2023, 9:26 a.m. UTC | #1
On Thu, Mar 16, 2023 at 10:27:51PM +0000, Colton Lewis wrote:
> Provide a generic function to read the system counter from the guest
> for timing purposes. A common and important way to measure guest
> performance is to measure the amount of time different actions take in
> the guest. Provide also a mathematical conversion from cycles to
> nanoseconds and a macro for timing individual statements.
> 
> Substitute the previous custom implementation of a similar function in
> system_counter_offset_test with this new implementation.
> 
> Signed-off-by: Colton Lewis <coltonlewis@google.com>
> ---
>  .../testing/selftests/kvm/include/kvm_util.h  | 15 ++++++++++
>  tools/testing/selftests/kvm/lib/kvm_util.c    | 30 +++++++++++++++++++
>  .../kvm/system_counter_offset_test.c          | 10 ++-----
>  3 files changed, 47 insertions(+), 8 deletions(-)
> 
> diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
> index c9286811a4cb..8b478eabee4c 100644
> --- a/tools/testing/selftests/kvm/include/kvm_util.h
> +++ b/tools/testing/selftests/kvm/include/kvm_util.h
> @@ -10,4 +10,19 @@
>  #include "kvm_util_base.h"
>  #include "ucall_common.h"
> 
> +#if defined(__aarch64__) || defined(__x86_64__)
> +
> +uint64_t cycles_read(void);
> +double cycles_to_ns(struct kvm_vcpu *vcpu, double cycles);
> +
> +#define MEASURE_CYCLES(x)			\
> +	({					\
> +		uint64_t start;			\
> +		start = cycles_read();		\
> +		x;				\
> +		cycles_read() - start;		\
> +	})
> +
> +#endif
> +
>  #endif /* SELFTEST_KVM_UTIL_H */
> diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
> index 3ea24a5f4c43..780481a92efe 100644
> --- a/tools/testing/selftests/kvm/lib/kvm_util.c
> +++ b/tools/testing/selftests/kvm/lib/kvm_util.c
> @@ -2135,3 +2135,34 @@ void __attribute((constructor)) kvm_selftest_init(void)
> 
>  	kvm_selftest_arch_init();
>  }
> +
> +#if defined(__aarch64__)
> +
> +#include "arch_timer.h"
> +
> +uint64_t cycles_read(void)
> +{
> +	return timer_get_cntct(VIRTUAL);
> +}
> +
> +double cycles_to_ns(struct kvm_vcpu *vcpu, double cycles)
> +{
> +	return cycles * (1e9 / timer_get_cntfrq());
> +}
> +
> +#elif defined(__x86_64__)
> +
> +#include "processor.h"
> +
> +uint64_t cycles_read(void)
> +{
> +	return rdtsc();
> +}
> +
> +double cycles_to_ns(struct kvm_vcpu *vcpu, double cycles)
> +{
> +	uint64_t tsc_khz = __vcpu_ioctl(vcpu, KVM_GET_TSC_KHZ, NULL);
> +
> +	return cycles * (1e9 / (tsc_khz * 1000));
> +}
> +#endif

Instead of the #ifdef's why not must put these functions in
lib/ARCH/processor.c?

Thanks,
drew
Vipin Sharma March 17, 2023, 5:04 p.m. UTC | #2
On Thu, Mar 16, 2023 at 3:29 PM Colton Lewis <coltonlewis@google.com> wrote:
>
> Provide a generic function to read the system counter from the guest
> for timing purposes. A common and important way to measure guest
> performance is to measure the amount of time different actions take in
> the guest. Provide also a mathematical conversion from cycles to
> nanoseconds and a macro for timing individual statements.
>
> Substitute the previous custom implementation of a similar function in

May be specify specific name:  guest_read_system_counter()

> system_counter_offset_test with this new implementation.
>
> Signed-off-by: Colton Lewis <coltonlewis@google.com>
> ---
>  .../testing/selftests/kvm/include/kvm_util.h  | 15 ++++++++++
>  tools/testing/selftests/kvm/lib/kvm_util.c    | 30 +++++++++++++++++++
>  .../kvm/system_counter_offset_test.c          | 10 ++-----
>  3 files changed, 47 insertions(+), 8 deletions(-)
>
> diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
> index c9286811a4cb..8b478eabee4c 100644
> --- a/tools/testing/selftests/kvm/include/kvm_util.h
> +++ b/tools/testing/selftests/kvm/include/kvm_util.h
> @@ -10,4 +10,19 @@
>  #include "kvm_util_base.h"
>  #include "ucall_common.h"
>
> +#if defined(__aarch64__) || defined(__x86_64__)
> +
> +uint64_t cycles_read(void);
> +double cycles_to_ns(struct kvm_vcpu *vcpu, double cycles);
> +
> +#define MEASURE_CYCLES(x)                      \
> +       ({                                      \
> +               uint64_t start;                 \
> +               start = cycles_read();          \
> +               x;                              \
> +               cycles_read() - start;          \
> +       })
> +

MEASURE_CYCLES should be moved to the next patch where it is getting
used. Does it have to be macro or can it be replaced with a function?

> +#endif
> +
>  #endif /* SELFTEST_KVM_UTIL_H */
> diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
> index 3ea24a5f4c43..780481a92efe 100644
> --- a/tools/testing/selftests/kvm/lib/kvm_util.c
> +++ b/tools/testing/selftests/kvm/lib/kvm_util.c
> @@ -2135,3 +2135,34 @@ void __attribute((constructor)) kvm_selftest_init(void)
>
>         kvm_selftest_arch_init();
>  }
> +
> +#if defined(__aarch64__)
> +
> +#include "arch_timer.h"
> +
> +uint64_t cycles_read(void)
> +{
> +       return timer_get_cntct(VIRTUAL);
> +}
> +
> +double cycles_to_ns(struct kvm_vcpu *vcpu, double cycles)
> +{
> +       return cycles * (1e9 / timer_get_cntfrq());
> +}
> +
> +#elif defined(__x86_64__)
> +
> +#include "processor.h"
> +
> +uint64_t cycles_read(void)
> +{
> +       return rdtsc();
> +}
> +
> +double cycles_to_ns(struct kvm_vcpu *vcpu, double cycles)
> +{
> +       uint64_t tsc_khz = __vcpu_ioctl(vcpu, KVM_GET_TSC_KHZ, NULL);
> +
> +       return cycles * (1e9 / (tsc_khz * 1000));
> +}
> +#endif

As Andrew noted,  these should be in the respective processor files.
Marc Zyngier March 17, 2023, 5:09 p.m. UTC | #3
On Thu, 16 Mar 2023 22:27:51 +0000,
Colton Lewis <coltonlewis@google.com> wrote:
> 
> Provide a generic function to read the system counter from the guest
> for timing purposes. A common and important way to measure guest
> performance is to measure the amount of time different actions take in
> the guest. Provide also a mathematical conversion from cycles to
> nanoseconds and a macro for timing individual statements.
> 
> Substitute the previous custom implementation of a similar function in
> system_counter_offset_test with this new implementation.
> 
> Signed-off-by: Colton Lewis <coltonlewis@google.com>
> ---
>  .../testing/selftests/kvm/include/kvm_util.h  | 15 ++++++++++
>  tools/testing/selftests/kvm/lib/kvm_util.c    | 30 +++++++++++++++++++
>  .../kvm/system_counter_offset_test.c          | 10 ++-----
>  3 files changed, 47 insertions(+), 8 deletions(-)
> 
> diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
> index c9286811a4cb..8b478eabee4c 100644
> --- a/tools/testing/selftests/kvm/include/kvm_util.h
> +++ b/tools/testing/selftests/kvm/include/kvm_util.h
> @@ -10,4 +10,19 @@
>  #include "kvm_util_base.h"
>  #include "ucall_common.h"
> 
> +#if defined(__aarch64__) || defined(__x86_64__)
> +
> +uint64_t cycles_read(void);
> +double cycles_to_ns(struct kvm_vcpu *vcpu, double cycles);
> +
> +#define MEASURE_CYCLES(x)			\
> +	({					\
> +		uint64_t start;			\
> +		start = cycles_read();		\
> +		x;				\

You insert memory accesses inside a sequence that has no dependency
with it. On a weakly ordered memory system, there is absolutely no
reason why the memory access shouldn't be moved around. What do you
exactly measure in that case?

> +		cycles_read() - start;		\

I also question the usefulness of this exercise. You're comparing the
time it takes for a multi-GHz system to put a write in a store buffer
(assuming it didn't miss in the TLBs) vs a counter that gets updated
at a frequency of a few tens of MHz.

My guts feeling is that this results in a big fat zero most of the
time, but I'm happy to be explained otherwise.

> +	})
> +
> +#endif
> +
>  #endif /* SELFTEST_KVM_UTIL_H */
> diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
> index 3ea24a5f4c43..780481a92efe 100644
> --- a/tools/testing/selftests/kvm/lib/kvm_util.c
> +++ b/tools/testing/selftests/kvm/lib/kvm_util.c
> @@ -2135,3 +2135,34 @@ void __attribute((constructor)) kvm_selftest_init(void)
> 
>  	kvm_selftest_arch_init();
>  }
> +
> +#if defined(__aarch64__)
> +
> +#include "arch_timer.h"
> +
> +uint64_t cycles_read(void)
> +{
> +	return timer_get_cntct(VIRTUAL);
> +}
> +
> +double cycles_to_ns(struct kvm_vcpu *vcpu, double cycles)
> +{
> +	return cycles * (1e9 / timer_get_cntfrq());

We already have all the required code to deal with ns conversions
using a multiplier and a shift, avoiding floating point like the
plague it is. Please reuse the kernel code for this, as you're quite
likely to only measure the time it takes for KVM to trap the FP
registers and perform a FP/SIMD switch...

	M.
Marc Zyngier March 28, 2023, 10:09 a.m. UTC | #4
On Tue, 21 Mar 2023 19:10:04 +0000,
Colton Lewis <coltonlewis@google.com> wrote:
> 
> Marc Zyngier <maz@kernel.org> writes:
> 
> >> +#define MEASURE_CYCLES(x)			\
> >> +	({					\
> >> +		uint64_t start;			\
> >> +		start = cycles_read();		\
> >> +		x;				\
> 
> > You insert memory accesses inside a sequence that has no dependency
> > with it. On a weakly ordered memory system, there is absolutely no
> > reason why the memory access shouldn't be moved around. What do you
> > exactly measure in that case?
> 
> cycles_read is built on another function timer_get_cntct which includes
> its own barriers. Stripped of some abstraction, the sequence is:
> 
> timer_get_cntct (isb+read timer)
> whatever is being measured
> timer_get_cntct
> 
> I hadn't looked at it too closely before but on review of the manual
> I think you are correct. Borrowing from example D7-2 in the manual, it
> should be:
> 
> timer_get_cntct
> isb
> whatever is being measured
> dsb
> timer_get_cntct

That's better, but also very heavy handed. You'd be better off
constructing an address dependency from the timer value, and feed that
into a load-acquire/store-release pair wrapping your payload.

> 
> >> +		cycles_read() - start;		\
> 
> > I also question the usefulness of this exercise. You're comparing the
> > time it takes for a multi-GHz system to put a write in a store buffer
> > (assuming it didn't miss in the TLBs) vs a counter that gets updated
> > at a frequency of a few tens of MHz.
> 
> > My guts feeling is that this results in a big fat zero most of the
> > time, but I'm happy to be explained otherwise.
> 
> 
> In context, I'm trying to measure the time it takes to write to a buffer
> *with dirty memory logging enabled*. What do you mean by zero? I can
> confirm from running this code I am not measuring zero time.

See my earlier point: the counter tick is a few MHz, and the CPU
multiple GHz. So unless "whatever" is something that takes a
significant time (several thousands of CPU cycles), you'll measure
nothing using the counter. Page faults will probably show, but not a
normal access.

The right tool for this job is to use PMU events, as they count at the
CPU frequency.

Thanks,

	M.
Sean Christopherson March 28, 2023, 3:38 p.m. UTC | #5
On Tue, Mar 28, 2023, Marc Zyngier wrote:
> On Tue, 21 Mar 2023 19:10:04 +0000,
> Colton Lewis <coltonlewis@google.com> wrote:
> > In context, I'm trying to measure the time it takes to write to a buffer
> > *with dirty memory logging enabled*. What do you mean by zero? I can
> > confirm from running this code I am not measuring zero time.
> 
> See my earlier point: the counter tick is a few MHz, and the CPU
> multiple GHz.

On x86, the system counter (TSC) counts at multiple GHz, so we should be able to
continue with that approach for x86.

> So unless "whatever" is something that takes a significant time (several
> thousands of CPU cycles), you'll measure nothing using the counter. Page
> faults will probably show, but not a normal access.
> 
> The right tool for this job is to use PMU events, as they count at the CPU
> frequency.

Out of curiosity, what does the kernel end up using for things like ndelay()?  I
tried to follow the breadcrumbs for ARM and got as far as arm_arch_timer.c, but
after that I'm more than a bit lost.
Marc Zyngier March 28, 2023, 3:50 p.m. UTC | #6
On Tue, 28 Mar 2023 16:38:37 +0100,
Sean Christopherson <seanjc@google.com> wrote:
> 
> On Tue, Mar 28, 2023, Marc Zyngier wrote:
> > On Tue, 21 Mar 2023 19:10:04 +0000,
> > Colton Lewis <coltonlewis@google.com> wrote:
> > > In context, I'm trying to measure the time it takes to write to a buffer
> > > *with dirty memory logging enabled*. What do you mean by zero? I can
> > > confirm from running this code I am not measuring zero time.
> > 
> > See my earlier point: the counter tick is a few MHz, and the CPU
> > multiple GHz.
> 
> On x86, the system counter (TSC) counts at multiple GHz, so we
> should be able to continue with that approach for x86.
> 
> > So unless "whatever" is something that takes a significant time (several
> > thousands of CPU cycles), you'll measure nothing using the counter. Page
> > faults will probably show, but not a normal access.
> > 
> > The right tool for this job is to use PMU events, as they count at the CPU
> > frequency.
> 
> Out of curiosity, what does the kernel end up using for things like ndelay()?  I
> tried to follow the breadcrumbs for ARM and got as far as arm_arch_timer.c, but
> after that I'm more than a bit lost.

That's where it ends. We use the counter for everything. Even on
ARMv8.6+ HW that is supposed to give you a 1GHz counter,
implementations are allowed to perform "lumpy" increments (50MHz
counter with increments of 20 per tick, for example).

	M.
diff mbox series

Patch

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index c9286811a4cb..8b478eabee4c 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -10,4 +10,19 @@ 
 #include "kvm_util_base.h"
 #include "ucall_common.h"

+#if defined(__aarch64__) || defined(__x86_64__)
+
+uint64_t cycles_read(void);
+double cycles_to_ns(struct kvm_vcpu *vcpu, double cycles);
+
+#define MEASURE_CYCLES(x)			\
+	({					\
+		uint64_t start;			\
+		start = cycles_read();		\
+		x;				\
+		cycles_read() - start;		\
+	})
+
+#endif
+
 #endif /* SELFTEST_KVM_UTIL_H */
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 3ea24a5f4c43..780481a92efe 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -2135,3 +2135,34 @@  void __attribute((constructor)) kvm_selftest_init(void)

 	kvm_selftest_arch_init();
 }
+
+#if defined(__aarch64__)
+
+#include "arch_timer.h"
+
+uint64_t cycles_read(void)
+{
+	return timer_get_cntct(VIRTUAL);
+}
+
+double cycles_to_ns(struct kvm_vcpu *vcpu, double cycles)
+{
+	return cycles * (1e9 / timer_get_cntfrq());
+}
+
+#elif defined(__x86_64__)
+
+#include "processor.h"
+
+uint64_t cycles_read(void)
+{
+	return rdtsc();
+}
+
+double cycles_to_ns(struct kvm_vcpu *vcpu, double cycles)
+{
+	uint64_t tsc_khz = __vcpu_ioctl(vcpu, KVM_GET_TSC_KHZ, NULL);
+
+	return cycles * (1e9 / (tsc_khz * 1000));
+}
+#endif
diff --git a/tools/testing/selftests/kvm/system_counter_offset_test.c b/tools/testing/selftests/kvm/system_counter_offset_test.c
index 7f5b330b6a1b..44101d0fcb48 100644
--- a/tools/testing/selftests/kvm/system_counter_offset_test.c
+++ b/tools/testing/selftests/kvm/system_counter_offset_test.c
@@ -39,14 +39,9 @@  static void setup_system_counter(struct kvm_vcpu *vcpu, struct test_case *test)
 			     &test->tsc_offset);
 }

-static uint64_t guest_read_system_counter(struct test_case *test)
-{
-	return rdtsc();
-}
-
 static uint64_t host_read_guest_system_counter(struct test_case *test)
 {
-	return rdtsc() + test->tsc_offset;
+	return cycles_read() + test->tsc_offset;
 }

 #else /* __x86_64__ */
@@ -63,9 +58,8 @@  static void guest_main(void)
 	int i;

 	for (i = 0; i < ARRAY_SIZE(test_cases); i++) {
-		struct test_case *test = &test_cases[i];

-		GUEST_SYNC_CLOCK(i, guest_read_system_counter(test));
+		GUEST_SYNC_CLOCK(i, cycles_read());
 	}
 }