diff mbox series

KVM: x86/pmu: Isolate TSX specific perf_event_attr.attr logic for AMD

Message ID 20220307063805.65030-1-likexu@tencent.com (mailing list archive)
State New, archived
Headers show
Series KVM: x86/pmu: Isolate TSX specific perf_event_attr.attr logic for AMD | expand

Commit Message

Like Xu March 7, 2022, 6:38 a.m. UTC
From: Like Xu <likexu@tencent.com>

HSW_IN_TX* bits are used in generic code which are not supported on
AMD. Worse, these bits overlap with AMD EventSelect[11:8] and hence
using HSW_IN_TX* bits unconditionally in generic code is resulting in
unintentional pmu behavior on AMD. For example, if EventSelect[11:8]
is 0x2, pmc_reprogram_counter() wrongly assumes that
HSW_IN_TX_CHECKPOINTED is set and thus forces sampling period to be 0.

Opportunistically remove two TSX specific incoming parameters for
the generic interface reprogram_counter().

Fixes: 103af0a98788 ("perf, kvm: Support the in_tx/in_tx_cp modifiers in KVM arch perfmon emulation v5")
Co-developed-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Like Xu <likexu@tencent.com>
---
Note: this patch is based on [1] which is considered to be a necessary cornerstone.
[1] https://lore.kernel.org/kvm/20220302111334.12689-1-likexu@tencent.com/

 arch/x86/kvm/pmu.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

Comments

Jim Mattson March 7, 2022, 9:39 p.m. UTC | #1
On Sun, Mar 6, 2022 at 10:38 PM Like Xu <like.xu.linux@gmail.com> wrote:
>
> From: Like Xu <likexu@tencent.com>
>
> HSW_IN_TX* bits are used in generic code which are not supported on
> AMD. Worse, these bits overlap with AMD EventSelect[11:8] and hence
> using HSW_IN_TX* bits unconditionally in generic code is resulting in
> unintentional pmu behavior on AMD. For example, if EventSelect[11:8]
> is 0x2, pmc_reprogram_counter() wrongly assumes that
> HSW_IN_TX_CHECKPOINTED is set and thus forces sampling period to be 0.
>
> Opportunistically remove two TSX specific incoming parameters for
> the generic interface reprogram_counter().
>
> Fixes: 103af0a98788 ("perf, kvm: Support the in_tx/in_tx_cp modifiers in KVM arch perfmon emulation v5")
> Co-developed-by: Ravi Bangoria <ravi.bangoria@amd.com>
> Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
> Signed-off-by: Like Xu <likexu@tencent.com>
> ---
> Note: this patch is based on [1] which is considered to be a necessary cornerstone.
> [1] https://lore.kernel.org/kvm/20220302111334.12689-1-likexu@tencent.com/
>
>  arch/x86/kvm/pmu.c | 29 ++++++++++++++---------------
>  1 file changed, 14 insertions(+), 15 deletions(-)
>
> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> index 17c61c990282..d0f9515c37dd 100644
> --- a/arch/x86/kvm/pmu.c
> +++ b/arch/x86/kvm/pmu.c
> @@ -99,8 +99,7 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
>
>  static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
>                                   u64 config, bool exclude_user,
> -                                 bool exclude_kernel, bool intr,
> -                                 bool in_tx, bool in_tx_cp)
> +                                 bool exclude_kernel, bool intr)
>  {
>         struct perf_event *event;
>         struct perf_event_attr attr = {
> @@ -116,16 +115,18 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
>
>         attr.sample_period = get_sample_period(pmc, pmc->counter);
>
> -       if (in_tx)
> -               attr.config |= HSW_IN_TX;
> -       if (in_tx_cp) {
> -               /*
> -                * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
> -                * period. Just clear the sample period so at least
> -                * allocating the counter doesn't fail.
> -                */
> -               attr.sample_period = 0;
> -               attr.config |= HSW_IN_TX_CHECKPOINTED;
> +       if (guest_cpuid_is_intel(pmc->vcpu)) {

This is not the right condition to check. Per the SDM, both bits 32
and 33 "may only be set if the processor supports HLE or RTM." On
other Intel processors, this bit is reserved and any attempts to set
them result in a #GP.

> +               if (pmc->eventsel & HSW_IN_TX)
> +                       attr.config |= HSW_IN_TX;

This statement does nothing. If HSW_IN_TX is set in pmc->eventsel, it
is set in attr.config already.

> +               if (pmc->eventsel & HSW_IN_TX_CHECKPOINTED) {
> +                       /*
> +                        * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
> +                        * period. Just clear the sample period so at least
> +                        * allocating the counter doesn't fail.
> +                        */
> +                       attr.sample_period = 0;
> +                       attr.config |= HSW_IN_TX_CHECKPOINTED;

As above, this statement does nothing. We should just set
attr.sample_period to 0. Note, however, that the SDM documents an
additional constraint which is ignored here: "This bit may only be set
for IA32_PERFEVTSEL2." I have confirmed that a #GP is raised for an
attempt to set bit 33 in any PerfEvtSeln other than PerfEvtSel2 on a
Broadwell Xeon E5.

> +               }
>         }
>
>         event = perf_event_create_kernel_counter(&attr, -1, current,
> @@ -268,9 +269,7 @@ void reprogram_counter(struct kvm_pmc *pmc)
>                         (eventsel & AMD64_RAW_EVENT_MASK),
>                         !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
>                         !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
> -                       eventsel & ARCH_PERFMON_EVENTSEL_INT,
> -                       (eventsel & HSW_IN_TX),
> -                       (eventsel & HSW_IN_TX_CHECKPOINTED));
> +                       eventsel & ARCH_PERFMON_EVENTSEL_INT);
>  }
>  EXPORT_SYMBOL_GPL(reprogram_counter);
>
> --
> 2.35.1
>
Like Xu March 8, 2022, 11:59 a.m. UTC | #2
On 8/3/2022 5:39 am, Jim Mattson wrote:
> On Sun, Mar 6, 2022 at 10:38 PM Like Xu <like.xu.linux@gmail.com> wrote:
>>
>> From: Like Xu <likexu@tencent.com>
>>
>> HSW_IN_TX* bits are used in generic code which are not supported on
>> AMD. Worse, these bits overlap with AMD EventSelect[11:8] and hence
>> using HSW_IN_TX* bits unconditionally in generic code is resulting in
>> unintentional pmu behavior on AMD. For example, if EventSelect[11:8]
>> is 0x2, pmc_reprogram_counter() wrongly assumes that
>> HSW_IN_TX_CHECKPOINTED is set and thus forces sampling period to be 0.
>>
>> Opportunistically remove two TSX specific incoming parameters for
>> the generic interface reprogram_counter().
>>
>> Fixes: 103af0a98788 ("perf, kvm: Support the in_tx/in_tx_cp modifiers in KVM arch perfmon emulation v5")
>> Co-developed-by: Ravi Bangoria <ravi.bangoria@amd.com>
>> Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
>> Signed-off-by: Like Xu <likexu@tencent.com>
>> ---
>> Note: this patch is based on [1] which is considered to be a necessary cornerstone.
>> [1] https://lore.kernel.org/kvm/20220302111334.12689-1-likexu@tencent.com/
>>
>>   arch/x86/kvm/pmu.c | 29 ++++++++++++++---------------
>>   1 file changed, 14 insertions(+), 15 deletions(-)
>>
>> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
>> index 17c61c990282..d0f9515c37dd 100644
>> --- a/arch/x86/kvm/pmu.c
>> +++ b/arch/x86/kvm/pmu.c
>> @@ -99,8 +99,7 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
>>
>>   static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
>>                                    u64 config, bool exclude_user,
>> -                                 bool exclude_kernel, bool intr,
>> -                                 bool in_tx, bool in_tx_cp)
>> +                                 bool exclude_kernel, bool intr)
>>   {
>>          struct perf_event *event;
>>          struct perf_event_attr attr = {
>> @@ -116,16 +115,18 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
>>
>>          attr.sample_period = get_sample_period(pmc, pmc->counter);
>>
>> -       if (in_tx)
>> -               attr.config |= HSW_IN_TX;
>> -       if (in_tx_cp) {
>> -               /*
>> -                * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
>> -                * period. Just clear the sample period so at least
>> -                * allocating the counter doesn't fail.
>> -                */
>> -               attr.sample_period = 0;
>> -               attr.config |= HSW_IN_TX_CHECKPOINTED;
>> +       if (guest_cpuid_is_intel(pmc->vcpu)) {
> 
> This is not the right condition to check. Per the SDM, both bits 32
> and 33 "may only be set if the processor supports HLE or RTM." On
> other Intel processors, this bit is reserved and any attempts to set
> them result in a #GP.

We already have this part of the code:

	entry = kvm_find_cpuid_entry(vcpu, 7, 0);
	if (entry &&
	    (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
	    (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
		pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;

> 
>> +               if (pmc->eventsel & HSW_IN_TX)
>> +                       attr.config |= HSW_IN_TX;
> 
> This statement does nothing. If HSW_IN_TX is set in pmc->eventsel, it
> is set in attr.config already.

Agree for the redundancy, since attr.config is "(eventsel & AMD64_RAW_EVENT_MASK)".

> 
>> +               if (pmc->eventsel & HSW_IN_TX_CHECKPOINTED) {
>> +                       /*
>> +                        * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
>> +                        * period. Just clear the sample period so at least
>> +                        * allocating the counter doesn't fail.
>> +                        */
>> +                       attr.sample_period = 0;
>> +                       attr.config |= HSW_IN_TX_CHECKPOINTED;
> 
> As above, this statement does nothing. We should just set
> attr.sample_period to 0. Note, however, that the SDM documents an

Thanks and applied.

> additional constraint which is ignored here: "This bit may only be set
> for IA32_PERFEVTSEL2." I have confirmed that a #GP is raised for an
> attempt to set bit 33 in any PerfEvtSeln other than PerfEvtSel2 on a
> Broadwell Xeon E5.

Yes, "19.3.6.5 Performance Monitoring and IntelĀ® TSX".

I'm not sure if the host perf scheduler indicate this restriction.

cc Kan.

> 
>> +               }
>>          }
>>
>>          event = perf_event_create_kernel_counter(&attr, -1, current,
>> @@ -268,9 +269,7 @@ void reprogram_counter(struct kvm_pmc *pmc)
>>                          (eventsel & AMD64_RAW_EVENT_MASK),
>>                          !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
>>                          !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
>> -                       eventsel & ARCH_PERFMON_EVENTSEL_INT,
>> -                       (eventsel & HSW_IN_TX),
>> -                       (eventsel & HSW_IN_TX_CHECKPOINTED));
>> +                       eventsel & ARCH_PERFMON_EVENTSEL_INT);
>>   }
>>   EXPORT_SYMBOL_GPL(reprogram_counter);
>>
>> --
>> 2.35.1
>>
Jim Mattson March 8, 2022, 4:09 p.m. UTC | #3
On Tue, Mar 8, 2022 at 3:59 AM Like Xu <like.xu.linux@gmail.com> wrote:
>
> On 8/3/2022 5:39 am, Jim Mattson wrote:
> > On Sun, Mar 6, 2022 at 10:38 PM Like Xu <like.xu.linux@gmail.com> wrote:
> >>
> >> From: Like Xu <likexu@tencent.com>
> >>
> >> HSW_IN_TX* bits are used in generic code which are not supported on
> >> AMD. Worse, these bits overlap with AMD EventSelect[11:8] and hence
> >> using HSW_IN_TX* bits unconditionally in generic code is resulting in
> >> unintentional pmu behavior on AMD. For example, if EventSelect[11:8]
> >> is 0x2, pmc_reprogram_counter() wrongly assumes that
> >> HSW_IN_TX_CHECKPOINTED is set and thus forces sampling period to be 0.
> >>
> >> Opportunistically remove two TSX specific incoming parameters for
> >> the generic interface reprogram_counter().
> >>
> >> Fixes: 103af0a98788 ("perf, kvm: Support the in_tx/in_tx_cp modifiers in KVM arch perfmon emulation v5")
> >> Co-developed-by: Ravi Bangoria <ravi.bangoria@amd.com>
> >> Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
> >> Signed-off-by: Like Xu <likexu@tencent.com>
> >> ---
> >> Note: this patch is based on [1] which is considered to be a necessary cornerstone.
> >> [1] https://lore.kernel.org/kvm/20220302111334.12689-1-likexu@tencent.com/
> >>
> >>   arch/x86/kvm/pmu.c | 29 ++++++++++++++---------------
> >>   1 file changed, 14 insertions(+), 15 deletions(-)
> >>
> >> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> >> index 17c61c990282..d0f9515c37dd 100644
> >> --- a/arch/x86/kvm/pmu.c
> >> +++ b/arch/x86/kvm/pmu.c
> >> @@ -99,8 +99,7 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
> >>
> >>   static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
> >>                                    u64 config, bool exclude_user,
> >> -                                 bool exclude_kernel, bool intr,
> >> -                                 bool in_tx, bool in_tx_cp)
> >> +                                 bool exclude_kernel, bool intr)
> >>   {
> >>          struct perf_event *event;
> >>          struct perf_event_attr attr = {
> >> @@ -116,16 +115,18 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
> >>
> >>          attr.sample_period = get_sample_period(pmc, pmc->counter);
> >>
> >> -       if (in_tx)
> >> -               attr.config |= HSW_IN_TX;
> >> -       if (in_tx_cp) {
> >> -               /*
> >> -                * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
> >> -                * period. Just clear the sample period so at least
> >> -                * allocating the counter doesn't fail.
> >> -                */
> >> -               attr.sample_period = 0;
> >> -               attr.config |= HSW_IN_TX_CHECKPOINTED;
> >> +       if (guest_cpuid_is_intel(pmc->vcpu)) {
> >
> > This is not the right condition to check. Per the SDM, both bits 32
> > and 33 "may only be set if the processor supports HLE or RTM." On
> > other Intel processors, this bit is reserved and any attempts to set
> > them result in a #GP.
>
> We already have this part of the code:
>
>         entry = kvm_find_cpuid_entry(vcpu, 7, 0);
>         if (entry &&
>             (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
>             (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
>                 pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;

I stand corrected.

> > additional constraint which is ignored here: "This bit may only be set
> > for IA32_PERFEVTSEL2." I have confirmed that a #GP is raised for an
> > attempt to set bit 33 in any PerfEvtSeln other than PerfEvtSel2 on a
> > Broadwell Xeon E5.
>
> Yes, "19.3.6.5 Performance Monitoring and IntelĀ® TSX".
>
> I'm not sure if the host perf scheduler indicate this restriction.

Whether it does or it doesn't, it's KVM's responsibility to synthesize
the #GP if IN_TXCP is set for any PerfEvtSeln other than PerfEvtSel2.
diff mbox series

Patch

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 17c61c990282..d0f9515c37dd 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -99,8 +99,7 @@  static void kvm_perf_overflow(struct perf_event *perf_event,
 
 static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
 				  u64 config, bool exclude_user,
-				  bool exclude_kernel, bool intr,
-				  bool in_tx, bool in_tx_cp)
+				  bool exclude_kernel, bool intr)
 {
 	struct perf_event *event;
 	struct perf_event_attr attr = {
@@ -116,16 +115,18 @@  static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
 
 	attr.sample_period = get_sample_period(pmc, pmc->counter);
 
-	if (in_tx)
-		attr.config |= HSW_IN_TX;
-	if (in_tx_cp) {
-		/*
-		 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
-		 * period. Just clear the sample period so at least
-		 * allocating the counter doesn't fail.
-		 */
-		attr.sample_period = 0;
-		attr.config |= HSW_IN_TX_CHECKPOINTED;
+	if (guest_cpuid_is_intel(pmc->vcpu)) {
+		if (pmc->eventsel & HSW_IN_TX)
+			attr.config |= HSW_IN_TX;
+		if (pmc->eventsel & HSW_IN_TX_CHECKPOINTED) {
+			/*
+			 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
+			 * period. Just clear the sample period so at least
+			 * allocating the counter doesn't fail.
+			 */
+			attr.sample_period = 0;
+			attr.config |= HSW_IN_TX_CHECKPOINTED;
+		}
 	}
 
 	event = perf_event_create_kernel_counter(&attr, -1, current,
@@ -268,9 +269,7 @@  void reprogram_counter(struct kvm_pmc *pmc)
 			(eventsel & AMD64_RAW_EVENT_MASK),
 			!(eventsel & ARCH_PERFMON_EVENTSEL_USR),
 			!(eventsel & ARCH_PERFMON_EVENTSEL_OS),
-			eventsel & ARCH_PERFMON_EVENTSEL_INT,
-			(eventsel & HSW_IN_TX),
-			(eventsel & HSW_IN_TX_CHECKPOINTED));
+			eventsel & ARCH_PERFMON_EVENTSEL_INT);
 }
 EXPORT_SYMBOL_GPL(reprogram_counter);