[RFC,v1,13/15] x86/msr: Use the alternatives mechanism to read MSR

Message ID: 20250331082251.3171276-14-xin@zytor.com
State: RFC
Series: MSR refactor with new MSR instructions support

Commit Message

Xin Li (Intel) March 31, 2025, 8:22 a.m. UTC
Also add support for the immediate form of the MSR read instruction.
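
For a compile-time-constant MSR number, the ALTERNATIVE_2() in
__rdmsr_constant() selects among three code sequences at patch time.
An illustrative sketch (not verbatim build output; the immediate form
is emitted via .insn/.byte since assembler support for the mnemonic
cannot be assumed):

	/* Default: legacy RDMSR, result split across EDX:EAX */
	mov	$msr, %ecx
	rdmsr
	shl	$0x20, %rdx
	or	%rdx, %rax

	/* X86_FEATURE_MSR_IMM: immediate form, result directly in RAX */
	rdmsr	$msr, %rax

	/* X86_FEATURE_XENPV: paravirtualized read via a direct CALL */
	mov	$msr, %ecx
	call	asm_xen_read_msr

In each variant a faulting access is routed to the asm goto badmsr
label through the exception table, so the safe and unsafe read
flavors share one code path and differ only in the extable type
(EX_TYPE_RDMSR vs. EX_TYPE_RDMSR_SAFE).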

Suggested-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li (Intel) <xin@zytor.com>
---
 arch/x86/include/asm/msr.h            | 274 ++++++++++++++++++++------
 arch/x86/include/asm/paravirt.h       |  40 ----
 arch/x86/include/asm/paravirt_types.h |   9 -
 arch/x86/kernel/paravirt.c            |   2 -
 arch/x86/xen/enlighten_pv.c           |  45 ++---
 arch/x86/xen/xen-asm.S                |  34 ++++
 arch/x86/xen/xen-ops.h                |   7 +
 7 files changed, 276 insertions(+), 135 deletions(-)

Patch

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 066cde11254a..fc93c2601853 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -74,6 +74,8 @@  static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {}
 #endif
 
 #ifdef CONFIG_CC_IS_GCC
+#define ASM_RDMSR_IMM			\
+	" .insn VEX.128.F2.M7.W0 0xf6 /0, %[msr]%{:u32}, %[val]\n\t"
 #define ASM_WRMSRNS_IMM			\
 	" .insn VEX.128.F3.M7.W0 0xf6 /0, %[val], %[msr]%{:u32}\n\t"
 #endif
@@ -85,10 +87,17 @@  static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {}
  * The register operand is encoded as %rax because all uses of the immediate
  * form MSR access instructions reference %rax as the register operand.
  */
+#define ASM_RDMSR_IMM			\
+	" .byte 0xc4,0xe7,0x7b,0xf6,0xc0; .long %c[msr]"
 #define ASM_WRMSRNS_IMM			\
 	" .byte 0xc4,0xe7,0x7a,0xf6,0xc0; .long %c[msr]"
 #endif
 
+#define RDMSR_AND_SAVE_RESULT		\
+	"rdmsr\n\t"			\
+	"shl $0x20, %%rdx\n\t"		\
+	"or %%rdx, %[val]\n\t"
+
 #define PREPARE_RDX_FOR_WRMSR		\
 	"mov %%rax, %%rdx\n\t"		\
 	"shr $0x20, %%rdx\n\t"
@@ -142,6 +151,7 @@  static __always_inline enum pv_msr_action get_pv_msr_action(const u32 msr)
 	}
 }
 
+extern void asm_xen_read_msr(void);
 extern void asm_xen_write_msr(void);
 #else
 static __always_inline enum pv_msr_action get_pv_msr_action(const u32 msr)
@@ -150,35 +160,95 @@  static __always_inline enum pv_msr_action get_pv_msr_action(const u32 msr)
 }
 #endif
 
-static __always_inline unsigned long long __rdmsr(unsigned int msr)
+static __always_inline bool __native_rdmsr_variable(const u32 msr, u64 *val, const int type)
 {
-	DECLARE_ARGS(val, low, high);
+#ifdef CONFIG_X86_64
+	BUILD_BUG_ON(__builtin_constant_p(msr));
 
-	asm volatile("1: rdmsr\n"
-		     "2:\n"
-		     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR)
-		     : EAX_EDX_RET(val, low, high) : "c" (msr));
+	asm_inline volatile goto(
+		"1:\n"
+		RDMSR_AND_SAVE_RESULT
+		_ASM_EXTABLE_TYPE(1b, %l[badmsr], %c[type])	/* For RDMSR */
 
-	return EAX_EDX_VAL(val, low, high);
+		: [val] "=a" (*val)
+		: "c" (msr), [type] "i" (type)
+		: "memory", "rdx"
+		: badmsr);
+#else
+	asm_inline volatile goto(
+		"1: rdmsr\n\t"
+		_ASM_EXTABLE_TYPE(1b, %l[badmsr], %c[type])	/* For RDMSR */
+
+		: "=A" (*val)
+		: "c" (msr), [type] "i" (type)
+		: "memory"
+		: badmsr);
+#endif
+
+	return false;
+
+badmsr:
+	return true;
+}
+
+#ifdef CONFIG_X86_64
+static __always_inline bool __native_rdmsr_constant(const u32 msr, u64 *val, const int type)
+{
+	BUILD_BUG_ON(!__builtin_constant_p(msr));
+
+	asm_inline volatile goto(
+		"1:\n"
+		ALTERNATIVE("mov %[msr], %%ecx\n\t"
+			    "2:\n"
+			    RDMSR_AND_SAVE_RESULT,
+			    ASM_RDMSR_IMM,
+			    X86_FEATURE_MSR_IMM)
+		_ASM_EXTABLE_TYPE(1b, %l[badmsr], %c[type])	/* For RDMSR immediate */
+		_ASM_EXTABLE_TYPE(2b, %l[badmsr], %c[type])	/* For RDMSR */
+
+		: [val] "=a" (*val)
+		: [msr] "i" (msr), [type] "i" (type)
+		: "memory", "ecx", "rdx"
+		: badmsr);
+
+	return false;
+
+badmsr:
+	return true;
+}
+#endif
+
+static __always_inline bool __native_rdmsr(const u32 msr, u64 *val, const int type)
+{
+#ifdef CONFIG_X86_64
+	if (__builtin_constant_p(msr))
+		return __native_rdmsr_constant(msr, val, type);
+#endif
+
+	return __native_rdmsr_variable(msr, val, type);
 }
 
-#define native_rdmsr(msr, val1, val2)			\
+#define native_rdmsr(msr, low, high)			\
 do {							\
-	u64 __val = __rdmsr((msr));			\
-	(void)((val1) = (u32)__val);			\
-	(void)((val2) = (u32)(__val >> 32));		\
+	u64 __val = 0;					\
+	__native_rdmsr((msr), &__val, EX_TYPE_RDMSR);	\
+	(void)((low) = (u32)__val);			\
+	(void)((high) = (u32)(__val >> 32));		\
 } while (0)
 
 static __always_inline u64 native_rdmsrl(const u32 msr)
 {
-	return __rdmsr(msr);
+	u64 val = 0;
+
+	__native_rdmsr(msr, &val, EX_TYPE_RDMSR);
+	return val;
 }
 
-static inline unsigned long long native_read_msr(unsigned int msr)
+static inline u64 native_read_msr(const u32 msr)
 {
-	unsigned long long val;
+	u64 val = 0;
 
-	val = __rdmsr(msr);
+	__native_rdmsr(msr, &val, EX_TYPE_RDMSR);
 
 	if (tracepoint_enabled(read_msr))
 		do_trace_read_msr(msr, val, 0);
@@ -186,19 +256,139 @@  static inline unsigned long long native_read_msr(unsigned int msr)
 	return val;
 }
 
-static inline unsigned long long native_read_msr_safe(unsigned int msr,
-						      int *err)
+static inline u64 native_read_msr_safe(const u32 msr, int *err)
 {
-	DECLARE_ARGS(val, low, high);
+	u64 val = 0;
+
+	*err = __native_rdmsr(msr, &val, EX_TYPE_RDMSR_SAFE) ? -EIO : 0;
 
-	asm volatile("1: rdmsr ; xor %[err],%[err]\n"
-		     "2:\n\t"
-		     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %[err])
-		     : [err] "=r" (*err), EAX_EDX_RET(val, low, high)
-		     : "c" (msr));
 	if (tracepoint_enabled(read_msr))
-		do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), *err);
-	return EAX_EDX_VAL(val, low, high);
+		do_trace_read_msr(msr, val, *err);
+
+	return val;
+}
+
+static __always_inline bool __rdmsr_variable(const u32 msr, u64 *val, const int type)
+{
+#ifdef CONFIG_X86_64
+	BUILD_BUG_ON(__builtin_constant_p(msr));
+
+	asm_inline volatile goto(
+		"1:\n"
+		ALTERNATIVE(RDMSR_AND_SAVE_RESULT,
+			    "call asm_xen_read_msr\n\t",
+			    X86_FEATURE_XENPV)
+		_ASM_EXTABLE_TYPE(1b, %l[badmsr], %c[type])	/* For RDMSR and CALL */
+
+		: [val] "=a" (*val), ASM_CALL_CONSTRAINT
+		: "c" (msr), [type] "i" (type)
+		: "memory", "rdx"
+		: badmsr);
+
+	return false;
+
+badmsr:
+	return true;
+#else
+	return __native_rdmsr_variable(msr, val, type);
+#endif
+}
+
+static __always_inline bool __rdmsr_variable_all(const u32 msr, u64 *val, const int type)
+{
+	const enum pv_msr_action action = get_pv_msr_action(msr);
+
+	if (action == PV_MSR_PV) {
+		return __rdmsr_variable(msr, val, type);
+	} else if (action == PV_MSR_IGNORE) {
+		if (cpu_feature_enabled(X86_FEATURE_XENPV))
+			return false;
+	}
+
+	return __native_rdmsr_variable(msr, val, type);
+}
+
+#ifdef CONFIG_X86_64
+static __always_inline bool __rdmsr_constant(const u32 msr, u64 *val, const int type)
+{
+	BUILD_BUG_ON(!__builtin_constant_p(msr));
+
+	asm_inline volatile goto(
+		"1:\n"
+		ALTERNATIVE_2("mov %[msr], %%ecx\n\t"
+			      "2:\n"
+			      RDMSR_AND_SAVE_RESULT,
+			      ASM_RDMSR_IMM,
+			      X86_FEATURE_MSR_IMM,
+			      "mov %[msr], %%ecx\n\t"
+			      "call asm_xen_read_msr\n\t",
+			      X86_FEATURE_XENPV)
+		_ASM_EXTABLE_TYPE(1b, %l[badmsr], %c[type])	/* For RDMSR immediate */
+		_ASM_EXTABLE_TYPE(2b, %l[badmsr], %c[type])	/* For RDMSR and CALL */
+
+		: [val] "=a" (*val), ASM_CALL_CONSTRAINT
+		: [msr] "i" (msr), [type] "i" (type)
+		: "memory", "ecx", "rdx"
+		: badmsr);
+
+	return false;
+
+badmsr:
+	return true;
+}
+
+static __always_inline bool __rdmsr_constant_all(const u32 msr, u64 *val, const int type)
+{
+	const enum pv_msr_action action = get_pv_msr_action(msr);
+
+	if (action == PV_MSR_PV) {
+		return __rdmsr_constant(msr, val, type);
+	} else if (action == PV_MSR_IGNORE) {
+		if (cpu_feature_enabled(X86_FEATURE_XENPV))
+			return false;
+	}
+
+	return __native_rdmsr_constant(msr, val, type);
+}
+#endif
+
+static __always_inline bool __rdmsr(const u32 msr, u64 *val, const int type)
+{
+#ifdef CONFIG_X86_64
+	if (__builtin_constant_p(msr))
+		return __rdmsr_constant_all(msr, val, type);
+#endif
+
+	return __rdmsr_variable_all(msr, val, type);
+}
+
+#define rdmsr(msr, low, high)				\
+do {							\
+	u64 __val = 0;					\
+	__rdmsr((msr), &__val, EX_TYPE_RDMSR);		\
+	(void)((low) = (u32)__val);			\
+	(void)((high) = (u32)(__val >> 32));		\
+} while (0)
+
+#define rdmsrl(msr, val)				\
+do {							\
+	u64 __val = 0;					\
+	__rdmsr((msr), &__val, EX_TYPE_RDMSR);		\
+	(val) = __val;					\
+} while (0)
+
+#define rdmsr_safe(msr, low, high)						\
+({										\
+	u64 __val = 0;								\
+	int __err = __rdmsr((msr), &__val, EX_TYPE_RDMSR_SAFE) ? -EIO : 0;	\
+	(*low) = (u32)__val;							\
+	(*high) = (u32)(__val >> 32);						\
+	__err;									\
+})
+
+static __always_inline int rdmsrl_safe(const u32 msr, u64 *val)
+{
+	return __rdmsr(msr, val, EX_TYPE_RDMSR_SAFE) ? -EIO : 0;
 }
 
 /*
@@ -503,40 +693,6 @@  static inline unsigned long long native_read_pmc(int counter)
  * Note: the rd* operations modify the parameters directly (without using
  * pointer indirection), this allows gcc to optimize better
  */
-
-#define rdmsr(msr, low, high)					\
-do {								\
-	u64 __val = native_read_msr((msr));			\
-	(void)((low) = (u32)__val);				\
-	(void)((high) = (u32)(__val >> 32));			\
-} while (0)
-
-#define rdmsrl(msr, val)			\
-	((val) = native_read_msr((msr)))
-
-static inline void wrmsrl(u32 msr, u64 val)
-{
-	native_write_msr(msr, val);
-}
-
-/* rdmsr with exception handling */
-#define rdmsr_safe(msr, low, high)				\
-({								\
-	int __err;						\
-	u64 __val = native_read_msr_safe((msr), &__err);	\
-	(*low) = (u32)__val;					\
-	(*high) = (u32)(__val >> 32);				\
-	__err;							\
-})
-
-static inline int rdmsrl_safe(unsigned int msr, unsigned long long *p)
-{
-	int err;
-
-	*p = native_read_msr_safe(msr, &err);
-	return err;
-}
-
 #define rdpmc(counter, low, high)			\
 do {							\
 	u64 _l = native_read_pmc((counter));		\
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 351feb890ab0..7ddb71ed9d0c 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -175,46 +175,6 @@  static inline void __write_cr4(unsigned long x)
 	PVOP_VCALL1(cpu.write_cr4, x);
 }
 
-static inline u64 paravirt_read_msr(unsigned msr)
-{
-	return PVOP_CALL1(u64, cpu.read_msr, msr);
-}
-
-static inline u64 paravirt_read_msr_safe(unsigned msr, int *err)
-{
-	return PVOP_CALL2(u64, cpu.read_msr_safe, msr, err);
-}
-
-#define rdmsr(msr, val1, val2)			\
-do {						\
-	u64 _l = paravirt_read_msr(msr);	\
-	val1 = (u32)_l;				\
-	val2 = _l >> 32;			\
-} while (0)
-
-#define rdmsrl(msr, val)			\
-do {						\
-	val = paravirt_read_msr(msr);		\
-} while (0)
-
-/* rdmsr with exception handling */
-#define rdmsr_safe(msr, a, b)				\
-({							\
-	int _err;					\
-	u64 _l = paravirt_read_msr_safe(msr, &_err);	\
-	(*a) = (u32)_l;					\
-	(*b) = _l >> 32;				\
-	_err;						\
-})
-
-static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
-{
-	int err;
-
-	*p = paravirt_read_msr_safe(msr, &err);
-	return err;
-}
-
 static inline unsigned long long paravirt_read_pmc(int counter)
 {
 	return PVOP_CALL1(u64, cpu.read_pmc, counter);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 8a563576d70e..5c57e6e4115f 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -90,15 +90,6 @@  struct pv_cpu_ops {
 	void (*cpuid)(unsigned int *eax, unsigned int *ebx,
 		      unsigned int *ecx, unsigned int *edx);
 
-	/* Unsafe MSR operations.  These will warn or panic on failure. */
-	u64 (*read_msr)(unsigned int msr);
-
-	/*
-	 * Safe MSR operations.
-	 * read sets err to 0 or -EIO.  write returns 0 or -EIO.
-	 */
-	u64 (*read_msr_safe)(unsigned int msr, int *err);
-
 	u64 (*read_pmc)(int counter);
 
 	void (*start_context_switch)(struct task_struct *prev);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index ffb04445f97e..e3d4f9869779 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -128,8 +128,6 @@  struct paravirt_patch_template pv_ops = {
 	.cpu.read_cr0		= native_read_cr0,
 	.cpu.write_cr0		= native_write_cr0,
 	.cpu.write_cr4		= native_write_cr4,
-	.cpu.read_msr		= native_read_msr,
-	.cpu.read_msr_safe	= native_read_msr_safe,
 	.cpu.read_pmc		= native_read_pmc,
 	.cpu.load_tr_desc	= native_load_tr_desc,
 	.cpu.set_ldt		= native_set_ldt,
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index d02f55bfa869..e49f16278487 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1087,19 +1087,26 @@  static void xen_write_cr4(unsigned long cr4)
 	native_write_cr4(cr4);
 }
 
-static u64 xen_do_read_msr(unsigned int msr, int *err)
+/*
+ * Set 'done' in the returned xen_rdmsr_ret_type to true to indicate that
+ * the requested MSR read has been handled successfully.
+ */
+struct xen_rdmsr_ret_type xen_do_read_msr(u32 msr)
 {
-	u64 val = 0;	/* Avoid uninitialized value for safe variant. */
-	bool emulated;
+	struct xen_rdmsr_ret_type ret;
 
-	if (pmu_msr_chk_emulated(msr, &val, true, &emulated) && emulated)
-		return val;
+	ret.done = true;
 
-	if (err)
-		val = native_read_msr_safe(msr, err);
-	else
-		val = native_read_msr(msr);
+	if (pmu_msr_chk_emulated(msr, &ret.val, true, &ret.done) && ret.done)
+		return ret;
+
+	ret.val = 0;
+	ret.done = false;
+	return ret;
+}
 
+u64 xen_do_read_msr_fixup(u32 msr, u64 val)
+{
 	switch (msr) {
 	case MSR_IA32_APICBASE:
 		val &= ~X2APIC_ENABLE;
@@ -1108,7 +1115,11 @@  static u64 xen_do_read_msr(unsigned int msr, int *err)
 		else
 			val &= ~MSR_IA32_APICBASE_BSP;
 		break;
+
+	default:
+		break;
 	}
+
 	return val;
 }
 
@@ -1160,18 +1171,6 @@  bool xen_do_write_msr(u32 msr, u64 val)
 	}
 }
 
-static u64 xen_read_msr_safe(unsigned int msr, int *err)
-{
-	return xen_do_read_msr(msr, err);
-}
-
-static u64 xen_read_msr(unsigned int msr)
-{
-	int err;
-
-	return xen_do_read_msr(msr, xen_msr_safe ? &err : NULL);
-}
-
 /* This is called once we have the cpu_possible_mask */
 void __init xen_setup_vcpu_info_placement(void)
 {
@@ -1206,10 +1205,6 @@  static const typeof(pv_ops) xen_cpu_ops __initconst = {
 
 		.write_cr4 = xen_write_cr4,
 
-		.read_msr = xen_read_msr,
-
-		.read_msr_safe = xen_read_msr_safe,
-
 		.read_pmc = xen_read_pmc,
 
 		.load_tr_desc = paravirt_nop,
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index e672632b1cc0..6e7a9daa03d4 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -399,3 +399,37 @@  SYM_CODE_END(xen_entry_SYSCALL_compat)
 	RET
 SYM_FUNC_END(asm_xen_write_msr)
 EXPORT_SYMBOL_GPL(asm_xen_write_msr)
+
+/*
+ * The C prototype of the Xen helper called below:
+ * 	struct xen_rdmsr_ret_type { u64 val; bool done; } xen_do_read_msr(u32 msr)
+ */
+SYM_FUNC_START(asm_xen_read_msr)
+	ENDBR
+	FRAME_BEGIN
+	XEN_SAVE_CALLEE_REGS_FOR_MSR
+	mov %ecx, %edi		/* MSR number */
+	call xen_do_read_msr
+	test %dl, %dl		/* ret.done: nonzero (ZF=0) means already handled */
+	XEN_RESTORE_CALLEE_REGS_FOR_MSR
+	jnz 2f
+
+1:	rdmsr
+	/*
+	 * The top of the stack points directly at the return address;
+	 * back up by 5 bytes, the size of the CALL, to the call site.
+	 */
+	_ASM_EXTABLE_FUNC_REWIND(1b, -5, FRAME_OFFSET / (BITS_PER_LONG / 8))
+	shl $0x20, %rdx
+	or %rdx, %rax
+
+	XEN_SAVE_CALLEE_REGS_FOR_MSR
+	mov %ecx, %edi
+	mov %rax, %rsi
+	call xen_do_read_msr_fixup
+	XEN_RESTORE_CALLEE_REGS_FOR_MSR
+
+2:	FRAME_END
+	RET
+SYM_FUNC_END(asm_xen_read_msr)
+EXPORT_SYMBOL_GPL(asm_xen_read_msr)
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index fc3c55871037..46efeaa4bbd3 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -146,7 +146,14 @@  __visible unsigned long xen_read_cr2_direct(void);
 /* These are not functions, and cannot be called normally */
 __visible void xen_iret(void);
 
+struct xen_rdmsr_ret_type {
+	u64 val;
+	bool done;
+};
+
 extern bool xen_do_write_msr(u32 msr, u64 val);
+extern struct xen_rdmsr_ret_type xen_do_read_msr(u32 msr);
+extern u64 xen_do_read_msr_fixup(u32 msr, u64 val);
 
 extern int xen_panic_handler_init(void);