
perf/amd: Implement errata #1292 workaround for F19h M00-0Fh

Message ID 20220202042838.6532-1-ravi.bangoria@amd.com (mailing list archive)
State New, archived
Series perf/amd: Implement errata #1292 workaround for F19h M00-0Fh

Commit Message

Ravi Bangoria Feb. 2, 2022, 4:28 a.m. UTC
Perf counters may overcount for a list of Retire Based Events. Implement the
workaround for Zen3 Family 19h Model 00-0Fh processors as suggested in the
Revision Guide[1]:

  To count the non-FP affected PMC events correctly:
    o Use Core::X86::Msr::PERF_CTL2 to count the events, and
    o Program Core::X86::Msr::PERF_CTL2[43] to 1b, and
    o Program Core::X86::Msr::PERF_CTL2[20] to 0b.

The above workaround suggests clearing PERF_CTL2[20], but that would disable
sampling mode. Given that there is already a skew between the actual counter
overflow and the PMI, sampling events do not produce an accurate count anyway.
Also, using PMC2 with both bit 43 and bit 20 set can result in additional
issues. Hence the Linux implementation of the workaround uses a non-PMC2
counter for sampling events.

Although the issue exists on all previous Zen revisions, the workaround
is different and thus not included in this patch.

This patch needs Like's patch[2] to work on KVM guests.

[1] https://bugzilla.kernel.org/attachment.cgi?id=298241
[2] https://lore.kernel.org/lkml/20220117055703.52020-1-likexu@tencent.com

Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
---
 arch/x86/events/amd/core.c | 75 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 74 insertions(+), 1 deletion(-)

Comments

Stephane Eranian Feb. 2, 2022, 5:27 a.m. UTC | #1
On Tue, Feb 1, 2022 at 8:29 PM Ravi Bangoria <ravi.bangoria@amd.com> wrote:
>
> Perf counter may overcount for a list of Retire Based Events. Implement
> workaround for Zen3 Family 19 Model 00-0F processors as suggested in
> Revision Guide[1]:
>
>   To count the non-FP affected PMC events correctly:
>     o Use Core::X86::Msr::PERF_CTL2 to count the events, and
>     o Program Core::X86::Msr::PERF_CTL2[43] to 1b, and
>     o Program Core::X86::Msr::PERF_CTL2[20] to 0b.
>
> Above workaround suggests to clear PERF_CTL2[20], but that will disable
> sampling mode. Given the fact that, there is already a skew between
> actual counter overflow vs PMI hit, we are anyway not getting accurate
> count for sampling events. Also, using PMC2 with both bit43 and bit20
> set can result in additional issues. Hence Linux implementation of
> workaround uses non-PMC2 counter for sampling events.
>
Something is missing from your description here. If you are not clearing
bit[20] and not setting bit[43], then how does running on CTL2 by itself
improve the count? Is that enough to make the counter count correctly?

For sampling events, your patch makes CTL2 not available. That seems to
contradict the workaround. Are you doing this to free CTL2 for counting-mode
events instead? If you are not using CTL2, then you are not correcting the
count. Are you saying this is okay in sampling mode because of the skid,
anyway?

> Although the issue exists on all previous Zen revisions, the workaround
> is different and thus not included in this patch.
>
> This patch needs Like's patch[2] to make it work on kvm guest.
>
> [1] https://bugzilla.kernel.org/attachment.cgi?id=298241
> [2] https://lore.kernel.org/lkml/20220117055703.52020-1-likexu@tencent.com
>
> Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
> ---
>  arch/x86/events/amd/core.c | 75 +++++++++++++++++++++++++++++++++++++-
>  1 file changed, 74 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
> index 9687a8aef01c..e2f172e75ce8 100644
> --- a/arch/x86/events/amd/core.c
> +++ b/arch/x86/events/amd/core.c
> @@ -874,8 +874,78 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx,
>         }
>  }
>
> +/* Errata 1292: Overcounting of Retire Based Events */
> +static struct event_constraint retire_event_count_constraints[] __read_mostly = {
> +       EVENT_CONSTRAINT(0xC0, 0x4, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0xC1, 0x4, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0xC2, 0x4, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0xC3, 0x4, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0xC4, 0x4, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0xC5, 0x4, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0xC8, 0x4, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0xC9, 0x4, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0xCA, 0x4, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0xCC, 0x4, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0xD1, 0x4, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0x1000000C7, 0x4, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0x1000000D0, 0x4, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT_END
> +};
> +
> +#define SAMPLE_IDX_MASK        (((1ULL << AMD64_NUM_COUNTERS_CORE) - 1) & ~0x4ULL)
> +
> +static struct event_constraint retire_event_sample_constraints[] __read_mostly = {
> +       EVENT_CONSTRAINT(0xC0, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0xC0, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0xC1, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0xC2, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0xC3, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0xC4, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0xC5, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0xC8, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0xC9, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0xCA, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0xCC, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0xD1, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0x1000000C7, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT(0x1000000D0, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
> +       EVENT_CONSTRAINT_END
> +};
> +
>  static struct event_constraint pair_constraint;
>
> +/*
> + * Although 'Overcounting of Retire Based Events' errata exists
> + * for older generation cpus, workaround to set bit 43 works only
> + * for Family 19h Model 00-0Fh as per the Revision Guide.
> + */
> +static struct event_constraint *
> +amd_get_event_constraints_f19h_m00_0fh(struct cpu_hw_events *cpuc, int idx,
> +                                      struct perf_event *event)
> +{
> +       struct event_constraint *c;
> +
> +       if (amd_is_pair_event_code(&event->hw))
> +               return &pair_constraint;
> +
> +       if (is_sampling_event(event)) {
> +               for_each_event_constraint(c, retire_event_sample_constraints) {
> +                       if (constraint_match(c, event->hw.config))
> +                               return c;
> +               }
> +       } else {
> +               for_each_event_constraint(c, retire_event_count_constraints) {
> +                       if (constraint_match(c, event->hw.config)) {
> +                               event->hw.config |= (1ULL << 43);
> +                               event->hw.config &= ~(1ULL << 20);
> +                               return c;
> +                       }
> +               }
> +       }
> +
> +       return &unconstrained;
> +}
> +
>  static struct event_constraint *
>  amd_get_event_constraints_f17h(struct cpu_hw_events *cpuc, int idx,
>                                struct perf_event *event)
> @@ -983,7 +1053,10 @@ static int __init amd_core_pmu_init(void)
>                                     x86_pmu.num_counters / 2, 0,
>                                     PERF_X86_EVENT_PAIR);
>
> -               x86_pmu.get_event_constraints = amd_get_event_constraints_f17h;
> +               if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model <= 0xf)
> +                       x86_pmu.get_event_constraints = amd_get_event_constraints_f19h_m00_0fh;
> +               else
> +                       x86_pmu.get_event_constraints = amd_get_event_constraints_f17h;
>                 x86_pmu.put_event_constraints = amd_put_event_constraints_f17h;
>                 x86_pmu.perf_ctr_pair_en = AMD_MERGE_EVENT_ENABLE;
>                 x86_pmu.flags |= PMU_FL_PAIR;
> --
> 2.27.0
>
Ravi Bangoria Feb. 2, 2022, 6:02 a.m. UTC | #2
Hi Stephane,

On 02-Feb-22 10:57 AM, Stephane Eranian wrote:
> On Tue, Feb 1, 2022 at 8:29 PM Ravi Bangoria <ravi.bangoria@amd.com> wrote:
>>
>> Perf counter may overcount for a list of Retire Based Events. Implement
>> workaround for Zen3 Family 19 Model 00-0F processors as suggested in
>> Revision Guide[1]:
>>
>>   To count the non-FP affected PMC events correctly:
>>     o Use Core::X86::Msr::PERF_CTL2 to count the events, and
>>     o Program Core::X86::Msr::PERF_CTL2[43] to 1b, and
>>     o Program Core::X86::Msr::PERF_CTL2[20] to 0b.
>>
>> Above workaround suggests to clear PERF_CTL2[20], but that will disable
>> sampling mode. Given the fact that, there is already a skew between
>> actual counter overflow vs PMI hit, we are anyway not getting accurate
>> count for sampling events. Also, using PMC2 with both bit43 and bit20
>> set can result in additional issues. Hence Linux implementation of
>> workaround uses non-PMC2 counter for sampling events.
>>
> Something is missing from your description here. If you are not
> clearing bit[20] and
> not setting bit[43], then how does running on CTL2 by itself improve
> the count. Is that
> enough to make the counter count correctly?

Yes. For counting retire based events, we need PMC2[43] set and
PMC2[20] clear so that it will not overcount.

> 
> For sampling events, your patch makes CTL2 not available. That seems
> to contradict the
> workaround. Are you doing this to free CTL2 for counting mode events
> instead? If you are
> not using CTL2, then you are not correcting the count. Are you saying
> this is okay in sampling mode
> because of the skid, anyway?

Correct. The constraint I am placing is to count retire events on
PMC2 and sample retire events on other counters.

Thanks,
Ravi
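
For reference, the two counter-index masks that implement this split in the
patch work out as follows, assuming AMD64_NUM_COUNTERS_CORE is 6 (core
counters PMC0-PMC5). This is just a standalone sketch that prints the masks,
not kernel code:

  #include <stdio.h>

  #define AMD64_NUM_COUNTERS_CORE	6	/* assumed: six core counters */
  #define SAMPLE_IDX_MASK	(((1ULL << AMD64_NUM_COUNTERS_CORE) - 1) & ~0x4ULL)

  int main(void)
  {
  	/* Counting-mode retire events: PMC2 only. */
  	printf("count mask:  0x%llx\n", 0x4ULL);		/* 0x04 */
  	/* Sampling retire events: any counter except PMC2. */
  	printf("sample mask: 0x%llx\n", SAMPLE_IDX_MASK);	/* 0x3b */
  	return 0;
  }
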
Stephane Eranian Feb. 2, 2022, 6:16 a.m. UTC | #3
On Tue, Feb 1, 2022 at 10:03 PM Ravi Bangoria <ravi.bangoria@amd.com> wrote:
>
> Hi Stephane,
>
> On 02-Feb-22 10:57 AM, Stephane Eranian wrote:
> > On Tue, Feb 1, 2022 at 8:29 PM Ravi Bangoria <ravi.bangoria@amd.com> wrote:
> >>
> >> Perf counter may overcount for a list of Retire Based Events. Implement
> >> workaround for Zen3 Family 19 Model 00-0F processors as suggested in
> >> Revision Guide[1]:
> >>
> >>   To count the non-FP affected PMC events correctly:
> >>     o Use Core::X86::Msr::PERF_CTL2 to count the events, and
> >>     o Program Core::X86::Msr::PERF_CTL2[43] to 1b, and
> >>     o Program Core::X86::Msr::PERF_CTL2[20] to 0b.
> >>
> >> Above workaround suggests to clear PERF_CTL2[20], but that will disable
> >> sampling mode. Given the fact that, there is already a skew between
> >> actual counter overflow vs PMI hit, we are anyway not getting accurate
> >> count for sampling events. Also, using PMC2 with both bit43 and bit20
> >> set can result in additional issues. Hence Linux implementation of
> >> workaround uses non-PMC2 counter for sampling events.
> >>
> > Something is missing from your description here. If you are not
> > clearing bit[20] and
> > not setting bit[43], then how does running on CTL2 by itself improve
> > the count. Is that
> > enough to make the counter count correctly?
>
> Yes. For counting retire based events, we need PMC2[43] set and
> PMC2[20] clear so that it will not overcount.
>
Ok, I get that part now. You are forcing the bits in the get_constraint()
function.

> >
> > For sampling events, your patch makes CTL2 not available. That seems
> > to contradict the
> > workaround. Are you doing this to free CTL2 for counting mode events
> > instead? If you are
> > not using CTL2, then you are not correcting the count. Are you saying
> > this is okay in sampling mode
> > because of the skid, anyway?
>
> Correct. The constraint I am placing is to count retire events on
> PMC2 and sample retire events on other counters.
>
Why do you need to permanently exclude CTL2 for retired events, given that you
are forcing the bits in get_constraints() for the counting events' config only,
i.e., as opposed to in CTL2 itself? If the sampling retired events are
unconstrained, they can use any counter. If a counting retired event is added,
it has a "stronger" constraint and will be scheduled before the unconstrained
events, yielding the same behavior you wanted, except on demand, which is
preferable.

> Thanks,
> Ravi
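
A rough sketch of what the change suggested above could look like, reusing the
constraint table, helpers, and workaround bits from the posted patch (this is
not the actual respin): counting-mode retire events keep the PMC2-only
constraint with the bits forced, while sampling and unaffected events stay
unconstrained and can be scheduled on any free counter.

  static struct event_constraint *
  amd_get_event_constraints_f19h_m00_0fh(struct cpu_hw_events *cpuc, int idx,
  				       struct perf_event *event)
  {
  	struct event_constraint *c;

  	if (amd_is_pair_event_code(&event->hw))
  		return &pair_constraint;

  	/* Only counting-mode events take the PMC2-only workaround path. */
  	if (!is_sampling_event(event)) {
  		for_each_event_constraint(c, retire_event_count_constraints) {
  			if (constraint_match(c, event->hw.config)) {
  				event->hw.config |= (1ULL << 43);
  				event->hw.config &= ~(1ULL << 20);
  				return c;
  			}
  		}
  	}

  	/* Sampling and unaffected events can use any counter. */
  	return &unconstrained;
  }
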
Ravi Bangoria Feb. 2, 2022, 6:32 a.m. UTC | #4
Hi Stephane,

On 02-Feb-22 11:46 AM, Stephane Eranian wrote:
> On Tue, Feb 1, 2022 at 10:03 PM Ravi Bangoria <ravi.bangoria@amd.com> wrote:
>>
>> Hi Stephane,
>>
>> On 02-Feb-22 10:57 AM, Stephane Eranian wrote:
>>> On Tue, Feb 1, 2022 at 8:29 PM Ravi Bangoria <ravi.bangoria@amd.com> wrote:
>>>>
>>>> Perf counter may overcount for a list of Retire Based Events. Implement
>>>> workaround for Zen3 Family 19 Model 00-0F processors as suggested in
>>>> Revision Guide[1]:
>>>>
>>>>   To count the non-FP affected PMC events correctly:
>>>>     o Use Core::X86::Msr::PERF_CTL2 to count the events, and
>>>>     o Program Core::X86::Msr::PERF_CTL2[43] to 1b, and
>>>>     o Program Core::X86::Msr::PERF_CTL2[20] to 0b.
>>>>
>>>> Above workaround suggests to clear PERF_CTL2[20], but that will disable
>>>> sampling mode. Given the fact that, there is already a skew between
>>>> actual counter overflow vs PMI hit, we are anyway not getting accurate
>>>> count for sampling events. Also, using PMC2 with both bit43 and bit20
>>>> set can result in additional issues. Hence Linux implementation of
>>>> workaround uses non-PMC2 counter for sampling events.
>>>>
>>> Something is missing from your description here. If you are not
>>> clearing bit[20] and
>>> not setting bit[43], then how does running on CTL2 by itself improve
>>> the count. Is that
>>> enough to make the counter count correctly?
>>
>> Yes. For counting retire based events, we need PMC2[43] set and
>> PMC2[20] clear so that it will not overcount.
>>
> Ok, I get that part now. You are forcing the bits in the
> get_constraint() function.
> 
>>>
>>> For sampling events, your patch makes CTL2 not available. That seems
>>> to contradict the
>>> workaround. Are you doing this to free CTL2 for counting mode events
>>> instead? If you are
>>> not using CTL2, then you are not correcting the count. Are you saying
>>> this is okay in sampling mode
>>> because of the skid, anyway?
>>
>> Correct. The constraint I am placing is to count retire events on
>> PMC2 and sample retire events on other counters.
>>
> Why do you need to permanently exclude CTL2 for retired events given
> you are forcing the bits
> in the get_constraints() for counting events config only, i.e., as
> opposed to in CTL2 itself.
> If the sampling retired events are unconstrained, they can use any
> counters. If a counting retired
> event is added, it has a "stronger" constraints and will be scheduled
> before the unconstrained events,
> yield the same behavior you wanted, except on demand which is preferable.

Got it. Let me respin.

Thanks,
Ravi

Patch

diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 9687a8aef01c..e2f172e75ce8 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -874,8 +874,78 @@  amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx,
 	}
 }
 
+/* Errata 1292: Overcounting of Retire Based Events */
+static struct event_constraint retire_event_count_constraints[] __read_mostly = {
+	EVENT_CONSTRAINT(0xC0, 0x4, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0xC1, 0x4, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0xC2, 0x4, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0xC3, 0x4, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0xC4, 0x4, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0xC5, 0x4, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0xC8, 0x4, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0xC9, 0x4, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0xCA, 0x4, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0xCC, 0x4, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0xD1, 0x4, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0x1000000C7, 0x4, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0x1000000D0, 0x4, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT_END
+};
+
+#define SAMPLE_IDX_MASK	(((1ULL << AMD64_NUM_COUNTERS_CORE) - 1) & ~0x4ULL)
+
+static struct event_constraint retire_event_sample_constraints[] __read_mostly = {
+	EVENT_CONSTRAINT(0xC0, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0xC0, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0xC1, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0xC2, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0xC3, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0xC4, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0xC5, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0xC8, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0xC9, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0xCA, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0xCC, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0xD1, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0x1000000C7, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT(0x1000000D0, SAMPLE_IDX_MASK, AMD64_EVENTSEL_EVENT),
+	EVENT_CONSTRAINT_END
+};
+
 static struct event_constraint pair_constraint;
 
+/*
+ * Although 'Overcounting of Retire Based Events' errata exists
+ * for older generation cpus, workaround to set bit 43 works only
+ * for Family 19h Model 00-0Fh as per the Revision Guide.
+ */
+static struct event_constraint *
+amd_get_event_constraints_f19h_m00_0fh(struct cpu_hw_events *cpuc, int idx,
+				       struct perf_event *event)
+{
+	struct event_constraint *c;
+
+	if (amd_is_pair_event_code(&event->hw))
+		return &pair_constraint;
+
+	if (is_sampling_event(event)) {
+		for_each_event_constraint(c, retire_event_sample_constraints) {
+			if (constraint_match(c, event->hw.config))
+				return c;
+		}
+	} else {
+		for_each_event_constraint(c, retire_event_count_constraints) {
+			if (constraint_match(c, event->hw.config)) {
+				event->hw.config |= (1ULL << 43);
+				event->hw.config &= ~(1ULL << 20);
+				return c;
+			}
+		}
+	}
+
+	return &unconstrained;
+}
+
 static struct event_constraint *
 amd_get_event_constraints_f17h(struct cpu_hw_events *cpuc, int idx,
 			       struct perf_event *event)
@@ -983,7 +1053,10 @@  static int __init amd_core_pmu_init(void)
 				    x86_pmu.num_counters / 2, 0,
 				    PERF_X86_EVENT_PAIR);
 
-		x86_pmu.get_event_constraints = amd_get_event_constraints_f17h;
+		if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model <= 0xf)
+			x86_pmu.get_event_constraints = amd_get_event_constraints_f19h_m00_0fh;
+		else
+			x86_pmu.get_event_constraints = amd_get_event_constraints_f17h;
 		x86_pmu.put_event_constraints = amd_put_event_constraints_f17h;
 		x86_pmu.perf_ctr_pair_en = AMD_MERGE_EVENT_ENABLE;
 		x86_pmu.flags |= PMU_FL_PAIR;