diff mbox series

[kvm-unit-tests,RFC] Inlining in PMU Test

Message ID CAGG=3QXUfFksVLF=gzU3EYkyf7RQKvr5_FU6Ea5enf39vinY3A@mail.gmail.com (mailing list archive)
State New, archived
Headers show
Series [kvm-unit-tests,RFC] Inlining in PMU Test | expand

Commit Message

Bill Wendling May 27, 2022, 1:32 a.m. UTC
I'm into an issue when I compile kvm-unit-tests with a new-ish Clang
version. It results in a failure similar to this:

Serial contents after VMM exited:
SeaBIOS (version 1.8.2-20160510_123855-google)
Total RAM Size = 0x0000000100000000 = 4096 MiB
CPU Mhz=2000
CPUs found: 1     Max CPUs supported: 1
Booting from ROM...
enabling apic
paging enabled
cr0 = 80010011
cr3 = bfefc000
cr4 = 20
PMU version:         4
GP counters:         3
GP counter width:    48
Mask length:         7
Fixed counters:      3
Fixed counter width: 48
 ---8<---
PASS: all counters
FAIL: overflow: cntr-0
PASS: overflow: status-0
PASS: overflow: status clear-0
PASS: overflow: irq-0
FAIL: overflow: cntr-1
PASS: overflow: status-1
PASS: overflow: status clear-1
PASS: overflow: irq-1
FAIL: overflow: cntr-2
PASS: overflow: status-2
PASS: overflow: status clear-2
PASS: overflow: irq-2
FAIL: overflow: cntr-3
PASS: overflow: status-3
PASS: overflow: status clear-3
PASS: overflow: irq-3
 ---8<---

It turns out that newer Clangs are much more aggressive at inlining
than GCC. I could replicate this failure with GCC with the patch
below[1] (the patch probably isn't minimal). If I add the "noinline"
attribute "measure()" in the patch below, the test passes.

Is there a subtle assumption being made by the test that breaks with
aggressive inlining? If so, is adding the "noinline" attribute to
"measure()" the correct fix, or should the test be made more robust?

-bw

[1]

@@ -166,14 +166,14 @@ static void global_enable(pmu_counter_t *cnt)
                        (1ull << cnt->idx));
 }

-static void global_disable(pmu_counter_t *cnt)
+static __always_inline void global_disable(pmu_counter_t *cnt)
 {
        wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, rdmsr(MSR_CORE_PERF_GLOBAL_CTRL) &
                        ~(1ull << cnt->idx));
 }


-static void start_event(pmu_counter_t *evt)
+static __always_inline void start_event(pmu_counter_t *evt)
 {
     wrmsr(evt->ctr, evt->count);
     if (is_gp(evt))
@@ -197,7 +197,7 @@ static void start_event(pmu_counter_t *evt)
     apic_write(APIC_LVTPC, PC_VECTOR);
 }

-static void stop_event(pmu_counter_t *evt)
+static __always_inline void stop_event(pmu_counter_t *evt)
 {
        global_disable(evt);
        if (is_gp(evt))
@@ -211,7 +211,7 @@ static void stop_event(pmu_counter_t *evt)
        evt->count = rdmsr(evt->ctr);
 }

-static void measure(pmu_counter_t *evt, int count)
+static __always_inline void measure(pmu_counter_t *evt, int count)
 {
        int i;
        for (i = 0; i < count; i++)

Comments

Bill Wendling May 31, 2022, 5:20 p.m. UTC | #1
Bump for exposure.

-bw

On Thu, May 26, 2022 at 6:32 PM Bill Wendling <morbo@google.com> wrote:
>
> I'm into an issue when I compile kvm-unit-tests with a new-ish Clang
> version. It results in a failure similar to this:
>
> Serial contents after VMM exited:
> SeaBIOS (version 1.8.2-20160510_123855-google)
> Total RAM Size = 0x0000000100000000 = 4096 MiB
> CPU Mhz=2000
> CPUs found: 1     Max CPUs supported: 1
> Booting from ROM...
> enabling apic
> paging enabled
> cr0 = 80010011
> cr3 = bfefc000
> cr4 = 20
> PMU version:         4
> GP counters:         3
> GP counter width:    48
> Mask length:         7
> Fixed counters:      3
> Fixed counter width: 48
>  ---8<---
> PASS: all counters
> FAIL: overflow: cntr-0
> PASS: overflow: status-0
> PASS: overflow: status clear-0
> PASS: overflow: irq-0
> FAIL: overflow: cntr-1
> PASS: overflow: status-1
> PASS: overflow: status clear-1
> PASS: overflow: irq-1
> FAIL: overflow: cntr-2
> PASS: overflow: status-2
> PASS: overflow: status clear-2
> PASS: overflow: irq-2
> FAIL: overflow: cntr-3
> PASS: overflow: status-3
> PASS: overflow: status clear-3
> PASS: overflow: irq-3
>  ---8<---
>
> It turns out that newer Clangs are much more aggressive at inlining
> than GCC. I could replicate this failure with GCC with the patch
> below[1] (the patch probably isn't minimal). If I add the "noinline"
> attribute "measure()" in the patch below, the test passes.
>
> Is there a subtle assumption being made by the test that breaks with
> aggressive inlining? If so, is adding the "noinline" attribute to
> "measure()" the correct fix, or should the test be made more robust?
>
> -bw
>
> [1]
> diff --git a/x86/pmu.c b/x86/pmu.c
> index a46bdbf4788c..4295e0c83aa0 100644
> --- a/x86/pmu.c
> +++ b/x86/pmu.c
> @@ -104,7 +104,7 @@ static int num_counters;
>
>  char *buf;
>
> -static inline void loop(void)
> +static __always_inline void loop(void)
>  {
>         unsigned long tmp, tmp2, tmp3;
>
> @@ -144,7 +144,7 @@ static int event_to_global_idx(pmu_counter_t *cnt)
>                 (MSR_CORE_PERF_FIXED_CTR0 - FIXED_CNT_INDEX));
>  }
>
> -static struct pmu_event* get_counter_event(pmu_counter_t *cnt)
> +static __always_inline struct pmu_event* get_counter_event(pmu_counter_t *cnt)
>  {
>         if (is_gp(cnt)) {
>                 int i;
> @@ -158,7 +158,7 @@ static struct pmu_event*
> get_counter_event(pmu_counter_t *cnt)
>         return (void*)0;
>  }
>
> -static void global_enable(pmu_counter_t *cnt)
> +static __always_inline void global_enable(pmu_counter_t *cnt)
>  {
>         cnt->idx = event_to_global_idx(cnt);
>
> @@ -166,14 +166,14 @@ static void global_enable(pmu_counter_t *cnt)
>                         (1ull << cnt->idx));
>  }
>
> -static void global_disable(pmu_counter_t *cnt)
> +static __always_inline void global_disable(pmu_counter_t *cnt)
>  {
>         wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, rdmsr(MSR_CORE_PERF_GLOBAL_CTRL) &
>                         ~(1ull << cnt->idx));
>  }
>
>
> -static void start_event(pmu_counter_t *evt)
> +static __always_inline void start_event(pmu_counter_t *evt)
>  {
>      wrmsr(evt->ctr, evt->count);
>      if (is_gp(evt))
> @@ -197,7 +197,7 @@ static void start_event(pmu_counter_t *evt)
>      apic_write(APIC_LVTPC, PC_VECTOR);
>  }
>
> -static void stop_event(pmu_counter_t *evt)
> +static __always_inline void stop_event(pmu_counter_t *evt)
>  {
>         global_disable(evt);
>         if (is_gp(evt))
> @@ -211,7 +211,7 @@ static void stop_event(pmu_counter_t *evt)
>         evt->count = rdmsr(evt->ctr);
>  }
>
> -static void measure(pmu_counter_t *evt, int count)
> +static __always_inline void measure(pmu_counter_t *evt, int count)
>  {
>         int i;
>         for (i = 0; i < count; i++)
Jim Mattson May 31, 2022, 11 p.m. UTC | #2
On Thu, May 26, 2022 at 6:32 PM Bill Wendling <morbo@google.com> wrote:
>
> I'm into an issue when I compile kvm-unit-tests with a new-ish Clang
> version. It results in a failure similar to this:
>
> Serial contents after VMM exited:
> SeaBIOS (version 1.8.2-20160510_123855-google)
> Total RAM Size = 0x0000000100000000 = 4096 MiB
> CPU Mhz=2000
> CPUs found: 1     Max CPUs supported: 1
> Booting from ROM...
> enabling apic
> paging enabled
> cr0 = 80010011
> cr3 = bfefc000
> cr4 = 20
> PMU version:         4
> GP counters:         3
> GP counter width:    48
> Mask length:         7
> Fixed counters:      3
> Fixed counter width: 48
>  ---8<---
> PASS: all counters
> FAIL: overflow: cntr-0
> PASS: overflow: status-0
> PASS: overflow: status clear-0
> PASS: overflow: irq-0
> FAIL: overflow: cntr-1
> PASS: overflow: status-1
> PASS: overflow: status clear-1
> PASS: overflow: irq-1
> FAIL: overflow: cntr-2
> PASS: overflow: status-2
> PASS: overflow: status clear-2
> PASS: overflow: irq-2
> FAIL: overflow: cntr-3
> PASS: overflow: status-3
> PASS: overflow: status clear-3
> PASS: overflow: irq-3
>  ---8<---
>
> It turns out that newer Clangs are much more aggressive at inlining
> than GCC. I could replicate this failure with GCC with the patch
> below[1] (the patch probably isn't minimal). If I add the "noinline"
> attribute "measure()" in the patch below, the test passes.
>
> Is there a subtle assumption being made by the test that breaks with
> aggressive inlining? If so, is adding the "noinline" attribute to
> "measure()" the correct fix, or should the test be made more robust?

It's not all that subtle. :-)

The test assumes that every invocation of measure() will retire the
same number of instructions over the part of measure() where a PMC is
programmed to count instructions retired.

To set up PMC overflow, check_counter_overflow() first records the
number of instructions retired in an invocation of measure(). That
value is stored in 'count.' Then, it initializes a PMC to (1 - count),
and it invokes measure() again. It expects that 'count' instructions
will have been retired, and the PMC will now have the value '1.'

If the first measure() and the second measure() are different code
sequences, this doesn't work.

Adding 'noinline' to measure() is probably the right thing to do.
Bill Wendling June 1, 2022, 4:08 p.m. UTC | #3
On Tue, May 31, 2022 at 4:00 PM Jim Mattson <jmattson@google.com> wrote:
>
> On Thu, May 26, 2022 at 6:32 PM Bill Wendling <morbo@google.com> wrote:
> >
> > I'm into an issue when I compile kvm-unit-tests with a new-ish Clang
> > version. It results in a failure similar to this:
> >
> > Serial contents after VMM exited:
> > SeaBIOS (version 1.8.2-20160510_123855-google)
> > Total RAM Size = 0x0000000100000000 = 4096 MiB
> > CPU Mhz=2000
> > CPUs found: 1     Max CPUs supported: 1
> > Booting from ROM...
> > enabling apic
> > paging enabled
> > cr0 = 80010011
> > cr3 = bfefc000
> > cr4 = 20
> > PMU version:         4
> > GP counters:         3
> > GP counter width:    48
> > Mask length:         7
> > Fixed counters:      3
> > Fixed counter width: 48
> >  ---8<---
> > PASS: all counters
> > FAIL: overflow: cntr-0
> > PASS: overflow: status-0
> > PASS: overflow: status clear-0
> > PASS: overflow: irq-0
> > FAIL: overflow: cntr-1
> > PASS: overflow: status-1
> > PASS: overflow: status clear-1
> > PASS: overflow: irq-1
> > FAIL: overflow: cntr-2
> > PASS: overflow: status-2
> > PASS: overflow: status clear-2
> > PASS: overflow: irq-2
> > FAIL: overflow: cntr-3
> > PASS: overflow: status-3
> > PASS: overflow: status clear-3
> > PASS: overflow: irq-3
> >  ---8<---
> >
> > It turns out that newer Clangs are much more aggressive at inlining
> > than GCC. I could replicate this failure with GCC with the patch
> > below[1] (the patch probably isn't minimal). If I add the "noinline"
> > attribute "measure()" in the patch below, the test passes.
> >
> > Is there a subtle assumption being made by the test that breaks with
> > aggressive inlining? If so, is adding the "noinline" attribute to
> > "measure()" the correct fix, or should the test be made more robust?
>
> It's not all that subtle. :-)
>
> The test assumes that every invocation of measure() will retire the
> same number of instructions over the part of measure() where a PMC is
> programmed to count instructions retired.
>
> To set up PMC overflow, check_counter_overflow() first records the
> number of instructions retired in an invocation of measure(). That
> value is stored in 'count.' Then, it initializes a PMC to (1 - count),
> and it invokes measure() again. It expects that 'count' instructions
> will have been retired, and the PMC will now have the value '1.'
>
> If the first measure() and the second measure() are different code
> sequences, this doesn't work.
>
> Adding 'noinline' to measure() is probably the right thing to do.

Woo! Thanks. :-)

-bw
diff mbox series

Patch

diff --git a/x86/pmu.c b/x86/pmu.c
index a46bdbf4788c..4295e0c83aa0 100644
--- a/x86/pmu.c
+++ b/x86/pmu.c
@@ -104,7 +104,7 @@  static int num_counters;

 char *buf;

-static inline void loop(void)
+static __always_inline void loop(void)
 {
        unsigned long tmp, tmp2, tmp3;

@@ -144,7 +144,7 @@  static int event_to_global_idx(pmu_counter_t *cnt)
                (MSR_CORE_PERF_FIXED_CTR0 - FIXED_CNT_INDEX));
 }

-static struct pmu_event* get_counter_event(pmu_counter_t *cnt)
+static __always_inline struct pmu_event* get_counter_event(pmu_counter_t *cnt)
 {
        if (is_gp(cnt)) {
                int i;
@@ -158,7 +158,7 @@  static struct pmu_event*
get_counter_event(pmu_counter_t *cnt)
        return (void*)0;
 }

-static void global_enable(pmu_counter_t *cnt)
+static __always_inline void global_enable(pmu_counter_t *cnt)
 {
        cnt->idx = event_to_global_idx(cnt);