
[08/13] perf/x86/amd: Add framework to save/restore host IBS state

Message ID 20230904095347.14994-9-manali.shukla@amd.com
State New, archived
Series Implement support for IBS virtualization

Commit Message

Manali Shukla Sept. 4, 2023, 9:53 a.m. UTC
Since IBS registers fall under swap type C [1], only the guest state
is saved and restored automatically by the hardware. The host state
needs to be saved and restored manually by the hypervisor. Note that
saving and restoring of the host IBS state happens only when IBS is
active on the host, to avoid unnecessary RDMSRs/WRMSRs.

Also, the hypervisor needs to disable host IBS before VMRUN and
re-enable it after VMEXIT [2]. However, disabling and enabling IBS
leads to subtle races between software and hardware, since the
IBS_*_CTL registers contain both control and result bits in the same
MSR.

Consider the following scenario: the hypervisor reads an IBS control
MSR and finds enable=1 (control bit) and valid=0 (result bit). While
the kernel is clearing the enable bit in its local copy, the IBS
hardware sets the valid bit to 1 in the MSR. The software, unaware of
the change made by the hardware, overwrites the IBS MSR with enable=0
and valid=0. Note that this situation occurs while NMIs are disabled,
so the CPU will receive the IBS NMI only after STGI. However, the IBS
driver won't handle that NMI because the valid bit is 0, and since the
real source of the NMI was IBS, nobody else will handle it either,
which results in unknown NMIs.
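
To make the lossy read-modify-write concrete, here is a minimal
sketch of the sequence (illustrative only, not code from this patch;
it uses the kernel's rdmsrl()/wrmsrl() helpers and the existing
IBS_OP_ENABLE/IBS_OP_VAL bit definitions):

	u64 ctl;

	rdmsrl(MSR_AMD64_IBSOPCTL, ctl);	/* sees enable=1, valid=0 */
	ctl &= ~IBS_OP_ENABLE;			/* clear enable in the local copy */

	/*
	 * Meanwhile, the IBS hardware completes a sample and sets
	 * IBS_OP_VAL (valid=1) in the MSR. The local copy never
	 * observes this.
	 */

	wrmsrl(MSR_AMD64_IBSOPCTL, ctl);	/* writes back enable=0, valid=0 */

The pending NMI is then delivered after STGI with valid=0, and no
handler claims it.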

Handle the above-mentioned race by keeping track of the different
actions performed by KVM on IBS (a sketch of the resulting call
sequence follows the list):

  WINDOW_START: After CLGI and before VMRUN. KVM informs the IBS driver
                about its intention to enable IBS for the guest. Thus,
                IBS should be disabled on the host and the host IBS
                register state should be saved.
  WINDOW_STOPPING: After VMEXIT and before STGI. KVM informs the IBS
                driver that it is done using IBS inside the guest, so
                the host IBS state should be restored and then IBS
                re-enabled on the host.
  WINDOW_STOPPED: After STGI. The CPU will receive any pending NMI that
                was raised between CLGI and STGI. The NMI will be marked
                as handled by the IBS driver if the WINDOW_STOPPED
                action has _not_ been performed, the valid bit is _not_
                set and a valid IBS event exists. However, no IBS sample
                will be generated.
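
As a rough sketch of how a hypervisor would drive these actions
around VMRUN (the call site and variable names below are assumptions
for illustration, not code from this series):

	u64 f_ctl, o_ctl;

	/* CLGI has been executed; NMIs are blocked. */
	amd_vibs_window(WINDOW_START, &f_ctl, &o_ctl);

	/* ... VMRUN, guest runs with its own IBS state, VMEXIT ... */

	amd_vibs_window(WINDOW_STOPPING, &f_ctl, &o_ctl);

	/*
	 * STGI executed: a pending IBS NMI, if any, is delivered now
	 * and ignored while the window is still marked active. Then
	 * the window is closed; f_ctl/o_ctl are passed as NULL here.
	 */
	amd_vibs_window(WINDOW_STOPPED, NULL, NULL);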

[1]: https://bugzilla.kernel.org/attachment.cgi?id=304653
     AMD64 Architecture Programmer’s Manual, Vol 2, Appendix B Layout
     of VMCB, Table B-3 Swap Types.

[2]: https://bugzilla.kernel.org/attachment.cgi?id=304653
     AMD64 Architecture Programmer’s Manual, Vol 2, Section 15.38
     Instruction-Based Sampling Virtualization.

Signed-off-by: Manali Shukla <manali.shukla@amd.com>
---
 arch/x86/events/amd/Makefile      |   2 +-
 arch/x86/events/amd/ibs.c         |  23 +++++++
 arch/x86/events/amd/vibs.c        | 101 ++++++++++++++++++++++++++++++
 arch/x86/include/asm/perf_event.h |  27 ++++++++
 4 files changed, 152 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/events/amd/vibs.c

Comments

Tom Lendacky Sept. 5, 2023, 2:54 p.m. UTC | #1
On 9/4/23 04:53, Manali Shukla wrote:
> Since IBS registers falls under swap type C [1], only the guest state

s/falls/fall/

> is saved and restored automatically by the hardware. Host state needs
> to be saved and restored manually by the hypervisor. Note that, saving
> and restoring of host IBS state happens only when IBS is active on
> host to avoid unnecessary rdmsrs/wrmsrs.
> 
> Also, hypervisor needs to disable host IBS before VMRUN and re-enable
> it after VMEXIT [2]. However, disabling and enabling of IBS leads to
> subtle races between software and hardware since IBS_*_CTL registers
> contain both control and result bits in the same MSR.
> 
> Consider the following scenario, hypervisor reads IBS control MSR and
> finds enable=1 (control bit) and valid=0 (result bit). While kernel is
> clearing enable bit in its local copy, IBS hardware sets valid bit to
> 1 in the MSR. Software, who is unaware of the change done by IBS
> hardware, overwrites IBS MSR with enable=0 and valid=0. Note that,
> this situation occurs while NMIs are disabled. So CPU will receive IBS
> NMI only after STGI. However, the IBS driver won't handle NMI because
> of the valid bit being 0. Since the real source of NMI was IBS, nobody
> else will also handle it which will result in the unknown NMIs.
> 
> Handle the above mentioned race by keeping track of different actions
> performed by KVM on IBS:
> 
>    WINDOW_START: After CLGI and before VMRUN. KVM informs IBS driver
>                  about its intention to enable IBS for the guest. Thus
> 		IBS should be disabled on host and IBS host register
> 		state should be saved.
>    WINDOW_STOPPING: After VMEXIT and before STGI. KVM informs IBS driver
>                  that it's done using IBS inside the guest and thus host
> 		IBS state should be restored followed by re-enabling
> 		IBS for host.
>    WINDOW_STOPPED: After STGI. CPU will receive any pending NMI if it
>                  was raised between CLGI and STGI. NMI will be marked
> 		as handled by IBS driver if WINDOW_STOPPED action is
>                  _not performed, valid bit is _not_ set and a valid
>                  IBS event exists. However, IBS sample won't be generated.
> 
> [1]: https://bugzilla.kernel.org/attachment.cgi?id=304653
>       AMD64 Architecture Programmer’s Manual, Vol 2, Appendix B Layout
>       of VMCB, Table B-3 Swap Types.
> 
> [2]: https://bugzilla.kernel.org/attachment.cgi?id=304653
>       AMD64 Architecture Programmer’s Manual, Vol 2, Section 15.38
>       Instruction-Based Sampling Virtualization.
> 
> Signed-off-by: Manali Shukla <manali.shukla@amd.com>
> ---
>   arch/x86/events/amd/Makefile      |   2 +-
>   arch/x86/events/amd/ibs.c         |  23 +++++++
>   arch/x86/events/amd/vibs.c        | 101 ++++++++++++++++++++++++++++++
>   arch/x86/include/asm/perf_event.h |  27 ++++++++
>   4 files changed, 152 insertions(+), 1 deletion(-)
>   create mode 100644 arch/x86/events/amd/vibs.c
> 
> diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile
> index 527d947eb76b..13c2980db9a7 100644
> --- a/arch/x86/events/amd/Makefile
> +++ b/arch/x86/events/amd/Makefile
> @@ -2,7 +2,7 @@
>   obj-$(CONFIG_CPU_SUP_AMD)		+= core.o lbr.o
>   obj-$(CONFIG_PERF_EVENTS_AMD_BRS)	+= brs.o
>   obj-$(CONFIG_PERF_EVENTS_AMD_POWER)	+= power.o
> -obj-$(CONFIG_X86_LOCAL_APIC)		+= ibs.o
> +obj-$(CONFIG_X86_LOCAL_APIC)		+= ibs.o vibs.o
>   obj-$(CONFIG_PERF_EVENTS_AMD_UNCORE)	+= amd-uncore.o
>   amd-uncore-objs				:= uncore.o
>   ifdef CONFIG_AMD_IOMMU
> diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
> index 6911c5399d02..359464f2910d 100644
> --- a/arch/x86/events/amd/ibs.c
> +++ b/arch/x86/events/amd/ibs.c
> @@ -1039,6 +1039,16 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
>   		 */
>   		if (test_and_clear_bit(IBS_STOPPED, pcpu->state))
>   			return 1;
> +		/*
> +		 * Catch NMIs generated in an active IBS window: Incoming NMIs
> +		 * from an active IBS window might have the VALID bit cleared
> +		 * when it is supposed to be set due to a race. The reason for
> +		 * the race is ENABLE and VALID bits for MSR_AMD64_IBSFETCHCTL
> +		 * and MSR_AMD64_IBSOPCTL being in their same respective MSRs.
> +		 * Ignore all such NMIs and treat them as handled.
> +		 */
> +		if (amd_vibs_ignore_nmi())
> +			return 1;
>   
>   		return 0;
>   	}
> @@ -1542,3 +1552,16 @@ static __init int amd_ibs_init(void)
>   
>   /* Since we need the pci subsystem to init ibs we can't do this earlier: */
>   device_initcall(amd_ibs_init);
> +
> +static inline bool get_ibs_state(struct perf_ibs *perf_ibs)
> +{
> +	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
> +
> +	return test_bit(IBS_STARTED, pcpu->state);
> +}
> +
> +bool is_amd_ibs_started(void)
> +{
> +	return get_ibs_state(&perf_ibs_fetch) || get_ibs_state(&perf_ibs_op);
> +}
> +EXPORT_SYMBOL_GPL(is_amd_ibs_started);

If this is only used by the IBS support, it shouldn't have an 
EXPORT_SYMBOL_GPL.
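
(To illustrate the suggestion: since vibs.o is built in whenever
ibs.o is — both hang off CONFIG_X86_LOCAL_APIC, which is not tristate
— the export can simply be dropped and the declaration moved to an
internal header. The header choice below is an assumption:)

	/* e.g. in arch/x86/events/perf_event.h or a new internal header */
	bool is_amd_ibs_started(void);

	/* arch/x86/events/amd/ibs.c: definition unchanged, no export */
	bool is_amd_ibs_started(void)
	{
		return get_ibs_state(&perf_ibs_fetch) || get_ibs_state(&perf_ibs_op);
	}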

> diff --git a/arch/x86/events/amd/vibs.c b/arch/x86/events/amd/vibs.c
> new file mode 100644
> index 000000000000..273a60f1cb7f
> --- /dev/null
> +++ b/arch/x86/events/amd/vibs.c
> @@ -0,0 +1,101 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + *  Virtualized Performance events - AMD VIBS
> + *
> + *  Copyright (C) 2023 Advanced Micro Devices, Inc., Manali Shukla
> + *
> + *  For licencing details see kernel-base/COPYING
> + */
> +
> +#include <linux/perf_event.h>
> +
> +DEFINE_PER_CPU(bool, vibs_window_active);
> +
> +static bool amd_disable_ibs_fetch(u64 *ibs_fetch_ctl)
> +{
> +	*ibs_fetch_ctl = __rdmsr(MSR_AMD64_IBSFETCHCTL);
> +	if (!(*ibs_fetch_ctl & IBS_FETCH_ENABLE))
> +		return false;
> +
> +	native_wrmsrl(MSR_AMD64_IBSFETCHCTL, *ibs_fetch_ctl & ~IBS_FETCH_ENABLE);
> +
> +	return true;
> +}
> +
> +static bool amd_disable_ibs_op(u64 *ibs_op_ctl)
> +{
> +	*ibs_op_ctl = __rdmsr(MSR_AMD64_IBSOPCTL);
> +	if (!(*ibs_op_ctl & IBS_OP_ENABLE))
> +		return false;
> +
> +	native_wrmsrl(MSR_AMD64_IBSOPCTL, *ibs_op_ctl & ~IBS_OP_ENABLE);
> +
> +	return true;
> +}
> +
> +static void amd_restore_ibs_fetch(u64 ibs_fetch_ctl)
> +{
> +	native_wrmsrl(MSR_AMD64_IBSFETCHCTL, ibs_fetch_ctl);
> +}
> +
> +static void amd_restore_ibs_op(u64 ibs_op_ctl)
> +{
> +	native_wrmsrl(MSR_AMD64_IBSOPCTL, ibs_op_ctl);
> +}
> +
> +bool amd_vibs_ignore_nmi(void)
> +{
> +	return __this_cpu_read(vibs_window_active);
> +}
> +EXPORT_SYMBOL_GPL(amd_vibs_ignore_nmi);

If this is only used by the IBS support, it shouldn't have an 
EXPORT_SYMBOL_GPL.

> +
> +bool amd_vibs_window(enum amd_vibs_window_state state, u64 *f_ctl,
> +		     u64 *o_ctl)
> +{
> +	bool f_active, o_active;
> +
> +	switch (state) {
> +	case WINDOW_START:
> +		if (!f_ctl || !o_ctl)
> +			return false;
> +
> +		if (!is_amd_ibs_started())
> +			return false;
> +
> +		f_active = amd_disable_ibs_fetch(f_ctl);
> +		o_active = amd_disable_ibs_op(o_ctl);
> +		__this_cpu_write(vibs_window_active, (f_active || o_active));
> +		break;
> +
> +	case WINDOW_STOPPING:
> +		if (!f_ctl || !o_ctl)
> +			return false;
> +
> +		if (__this_cpu_read(vibs_window_active))

Shouldn't this be if (!__this_cpu_read(vibs_window_active)) ?

> +			return false;
> +
> +		if (*f_ctl & IBS_FETCH_ENABLE)
> +			amd_restore_ibs_fetch(*f_ctl);
> +		if (*o_ctl & IBS_OP_ENABLE)
> +			amd_restore_ibs_op(*o_ctl);

Nit, these if checks could go into the restore functions to make this look 
a bit cleaner, but that's just from my point of view.
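
(A sketch of that refactoring — moving the enable-bit checks into the
restore helpers so the WINDOW_STOPPING call sites stay unconditional:)

	static void amd_restore_ibs_fetch(u64 ibs_fetch_ctl)
	{
		if (ibs_fetch_ctl & IBS_FETCH_ENABLE)
			native_wrmsrl(MSR_AMD64_IBSFETCHCTL, ibs_fetch_ctl);
	}

	static void amd_restore_ibs_op(u64 ibs_op_ctl)
	{
		if (ibs_op_ctl & IBS_OP_ENABLE)
			native_wrmsrl(MSR_AMD64_IBSOPCTL, ibs_op_ctl);
	}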

> +
> +		break;
> +
> +	case WINDOW_STOPPED:
> +		/*
> +		 * This state is executed right after STGI (which is executed
> +		 * after VMEXIT).  By this time, host IBS states are already
> +		 * restored in WINDOW_STOPPING state, so f_ctl and o_ctl will
> +		 * be passed as NULL for this state.
> +		 */
> +		if (__this_cpu_read(vibs_window_active))
> +			__this_cpu_write(vibs_window_active, false);

Any reason not to just issue __this_cpu_write(vibs_window_active, false) 
without the if check?

> +		break;
> +
> +	default:
> +		return false;
> +	}
> +
> +	return __this_cpu_read(vibs_window_active);

You could just issue a return true or return false (depending on the case) 
instead of having the break statements, e.g.:

For WINDOW_START replace break with return (f_active || o_active)
For WINDOW_STOPPING replace break with return true
For WINDOW_STOPPED replace break with return false
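
(Putting the review suggestions together — the inverted
WINDOW_STOPPING check, the unconditional clear in WINDOW_STOPPED, the
enable checks folded into the restore helpers, and direct returns —
the function might look like this sketch, not the posted code:)

	bool amd_vibs_window(enum amd_vibs_window_state state, u64 *f_ctl,
			     u64 *o_ctl)
	{
		bool f_active, o_active;

		switch (state) {
		case WINDOW_START:
			if (!f_ctl || !o_ctl || !is_amd_ibs_started())
				return false;

			f_active = amd_disable_ibs_fetch(f_ctl);
			o_active = amd_disable_ibs_op(o_ctl);
			__this_cpu_write(vibs_window_active, f_active || o_active);
			return f_active || o_active;

		case WINDOW_STOPPING:
			if (!f_ctl || !o_ctl)
				return false;

			if (!__this_cpu_read(vibs_window_active))
				return false;

			/* Helpers check the ENABLE bit internally, as above. */
			amd_restore_ibs_fetch(*f_ctl);
			amd_restore_ibs_op(*o_ctl);
			return true;

		case WINDOW_STOPPED:
			__this_cpu_write(vibs_window_active, false);
			return false;

		default:
			return false;
		}
	}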

> +}
> +EXPORT_SYMBOL_GPL(amd_vibs_window);
> diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
> index 85a9fd5a3ec3..b87c235e0e1e 100644
> --- a/arch/x86/include/asm/perf_event.h
> +++ b/arch/x86/include/asm/perf_event.h
> @@ -486,6 +486,12 @@ struct pebs_xmm {
>   #define IBS_OP_MAX_CNT_EXT_MASK	(0x7FULL<<20)	/* separate upper 7 bits */
>   #define IBS_RIP_INVALID		(1ULL<<38)
>   
> +enum amd_vibs_window_state {
> +	WINDOW_START = 0,
> +	WINDOW_STOPPING,
> +	WINDOW_STOPPED,
> +};
> +
>   #ifdef CONFIG_X86_LOCAL_APIC
>   extern u32 get_ibs_caps(void);
>   extern int forward_event_to_ibs(struct perf_event *event);
> @@ -584,6 +590,27 @@ static inline void intel_pt_handle_vmx(int on)
>   }
>   #endif
>   
> +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_CPU_SUP_AMD)
> +extern bool amd_vibs_window(enum amd_vibs_window_state state, u64 *vibs_fetch_ctl,
> +			    u64 *vibs_op_ctl);
> +extern bool is_amd_ibs_started(void);
> +extern bool amd_vibs_ignore_nmi(void);

And if the two above functions aren't EXPORT_SYMBOL_GPL, then they could 
go somewhere else instead of here.

Thanks,
Tom

> +#else
> +static inline bool amd_vibs_window(enum amd_vibs_window_state state, u64 *vibs_fetch_ctl,
> +				  u64 *vibs_op_ctl)
> +{
> +	return false;
> +}
> +static inline bool is_amd_ibs_started(void)
> +{
> +	return false;
> +}
> +static inline bool amd_vibs_ignore_nmi(void)
> +{
> +	return false;
> +}
> +#endif
> +
>   #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
>    extern void amd_pmu_enable_virt(void);
>    extern void amd_pmu_disable_virt(void);

Patch

diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile
index 527d947eb76b..13c2980db9a7 100644
--- a/arch/x86/events/amd/Makefile
+++ b/arch/x86/events/amd/Makefile
@@ -2,7 +2,7 @@ 
 obj-$(CONFIG_CPU_SUP_AMD)		+= core.o lbr.o
 obj-$(CONFIG_PERF_EVENTS_AMD_BRS)	+= brs.o
 obj-$(CONFIG_PERF_EVENTS_AMD_POWER)	+= power.o
-obj-$(CONFIG_X86_LOCAL_APIC)		+= ibs.o
+obj-$(CONFIG_X86_LOCAL_APIC)		+= ibs.o vibs.o
 obj-$(CONFIG_PERF_EVENTS_AMD_UNCORE)	+= amd-uncore.o
 amd-uncore-objs				:= uncore.o
 ifdef CONFIG_AMD_IOMMU
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 6911c5399d02..359464f2910d 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -1039,6 +1039,16 @@  static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 		 */
 		if (test_and_clear_bit(IBS_STOPPED, pcpu->state))
 			return 1;
+		/*
+		 * Catch NMIs generated in an active IBS window: Incoming NMIs
+		 * from an active IBS window might have the VALID bit cleared
+		 * when it is supposed to be set due to a race. The reason for
+		 * the race is ENABLE and VALID bits for MSR_AMD64_IBSFETCHCTL
+		 * and MSR_AMD64_IBSOPCTL being in their same respective MSRs.
+		 * Ignore all such NMIs and treat them as handled.
+		 */
+		if (amd_vibs_ignore_nmi())
+			return 1;
 
 		return 0;
 	}
@@ -1542,3 +1552,16 @@  static __init int amd_ibs_init(void)
 
 /* Since we need the pci subsystem to init ibs we can't do this earlier: */
 device_initcall(amd_ibs_init);
+
+static inline bool get_ibs_state(struct perf_ibs *perf_ibs)
+{
+	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+	return test_bit(IBS_STARTED, pcpu->state);
+}
+
+bool is_amd_ibs_started(void)
+{
+	return get_ibs_state(&perf_ibs_fetch) || get_ibs_state(&perf_ibs_op);
+}
+EXPORT_SYMBOL_GPL(is_amd_ibs_started);
diff --git a/arch/x86/events/amd/vibs.c b/arch/x86/events/amd/vibs.c
new file mode 100644
index 000000000000..273a60f1cb7f
--- /dev/null
+++ b/arch/x86/events/amd/vibs.c
@@ -0,0 +1,101 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  Virtualized Performance events - AMD VIBS
+ *
+ *  Copyright (C) 2023 Advanced Micro Devices, Inc., Manali Shukla
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+
+DEFINE_PER_CPU(bool, vibs_window_active);
+
+static bool amd_disable_ibs_fetch(u64 *ibs_fetch_ctl)
+{
+	*ibs_fetch_ctl = __rdmsr(MSR_AMD64_IBSFETCHCTL);
+	if (!(*ibs_fetch_ctl & IBS_FETCH_ENABLE))
+		return false;
+
+	native_wrmsrl(MSR_AMD64_IBSFETCHCTL, *ibs_fetch_ctl & ~IBS_FETCH_ENABLE);
+
+	return true;
+}
+
+static bool amd_disable_ibs_op(u64 *ibs_op_ctl)
+{
+	*ibs_op_ctl = __rdmsr(MSR_AMD64_IBSOPCTL);
+	if (!(*ibs_op_ctl & IBS_OP_ENABLE))
+		return false;
+
+	native_wrmsrl(MSR_AMD64_IBSOPCTL, *ibs_op_ctl & ~IBS_OP_ENABLE);
+
+	return true;
+}
+
+static void amd_restore_ibs_fetch(u64 ibs_fetch_ctl)
+{
+	native_wrmsrl(MSR_AMD64_IBSFETCHCTL, ibs_fetch_ctl);
+}
+
+static void amd_restore_ibs_op(u64 ibs_op_ctl)
+{
+	native_wrmsrl(MSR_AMD64_IBSOPCTL, ibs_op_ctl);
+}
+
+bool amd_vibs_ignore_nmi(void)
+{
+	return __this_cpu_read(vibs_window_active);
+}
+EXPORT_SYMBOL_GPL(amd_vibs_ignore_nmi);
+
+bool amd_vibs_window(enum amd_vibs_window_state state, u64 *f_ctl,
+		     u64 *o_ctl)
+{
+	bool f_active, o_active;
+
+	switch (state) {
+	case WINDOW_START:
+		if (!f_ctl || !o_ctl)
+			return false;
+
+		if (!is_amd_ibs_started())
+			return false;
+
+		f_active = amd_disable_ibs_fetch(f_ctl);
+		o_active = amd_disable_ibs_op(o_ctl);
+		__this_cpu_write(vibs_window_active, (f_active || o_active));
+		break;
+
+	case WINDOW_STOPPING:
+		if (!f_ctl || !o_ctl)
+			return false;
+
+		if (__this_cpu_read(vibs_window_active))
+			return false;
+
+		if (*f_ctl & IBS_FETCH_ENABLE)
+			amd_restore_ibs_fetch(*f_ctl);
+		if (*o_ctl & IBS_OP_ENABLE)
+			amd_restore_ibs_op(*o_ctl);
+
+		break;
+
+	case WINDOW_STOPPED:
+		/*
+		 * This state is executed right after STGI (which is executed
+		 * after VMEXIT).  By this time, host IBS states are already
+		 * restored in WINDOW_STOPPING state, so f_ctl and o_ctl will
+		 * be passed as NULL for this state.
+		 */
+		if (__this_cpu_read(vibs_window_active))
+			__this_cpu_write(vibs_window_active, false);
+		break;
+
+	default:
+		return false;
+	}
+
+	return __this_cpu_read(vibs_window_active);
+}
+EXPORT_SYMBOL_GPL(amd_vibs_window);
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 85a9fd5a3ec3..b87c235e0e1e 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -486,6 +486,12 @@  struct pebs_xmm {
 #define IBS_OP_MAX_CNT_EXT_MASK	(0x7FULL<<20)	/* separate upper 7 bits */
 #define IBS_RIP_INVALID		(1ULL<<38)
 
+enum amd_vibs_window_state {
+	WINDOW_START = 0,
+	WINDOW_STOPPING,
+	WINDOW_STOPPED,
+};
+
 #ifdef CONFIG_X86_LOCAL_APIC
 extern u32 get_ibs_caps(void);
 extern int forward_event_to_ibs(struct perf_event *event);
@@ -584,6 +590,27 @@  static inline void intel_pt_handle_vmx(int on)
 }
 #endif
 
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_CPU_SUP_AMD)
+extern bool amd_vibs_window(enum amd_vibs_window_state state, u64 *vibs_fetch_ctl,
+			    u64 *vibs_op_ctl);
+extern bool is_amd_ibs_started(void);
+extern bool amd_vibs_ignore_nmi(void);
+#else
+static inline bool amd_vibs_window(enum amd_vibs_window_state state, u64 *vibs_fetch_ctl,
+				  u64 *vibs_op_ctl)
+{
+	return false;
+}
+static inline bool is_amd_ibs_started(void)
+{
+	return false;
+}
+static inline bool amd_vibs_ignore_nmi(void)
+{
+	return false;
+}
+#endif
+
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
  extern void amd_pmu_enable_virt(void);
  extern void amd_pmu_disable_virt(void);