[v2,1/2] x86/boot: Clear XD_DISABLE from the early boot path

Message ID: 20230615153157.444-2-alejandro.vallejo@cloud.com
State: Superseded
Series: Introduce a REQUIRE_NX Kconfig option

Commit Message

Alejandro Vallejo June 15, 2023, 3:31 p.m. UTC
Intel CPUs have a bit in MSR_IA32_MISC_ENABLE that may prevent the NX bit
from being advertised. Clear it unconditionally if we can't find the NX
feature right away on boot.

The conditions for reading the MSR in early boot are (in this order):

* Long Mode is supported
* NX isn't advertised
* The vendor is Intel

The order of checks has been chosen carefully so that a virtualized Xen on a
hypervisor that doesn't emulate that MSR (but supports NX) doesn't triple
fault trying to access the non-existent MSR.
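
Roughly, the intended flow is the following (a C sketch for illustration
only; cpu_has() and vendor_is_intel() are hypothetical helpers, and the
real code is assembly in head.S):

    uint64_t val;

    if ( !cpu_has(X86_FEATURE_LM) )
        goto bad_cpu;                 /* no 64bit support: fatal anyway */
    if ( cpu_has(X86_FEATURE_NX) )
        goto got_nx;                  /* NX advertised: nothing to clear */
    if ( vendor_is_intel() )          /* only Intel implements the MSR */
    {
        rdmsrl(MSR_IA32_MISC_ENABLE, val);
        wrmsrl(MSR_IA32_MISC_ENABLE, val & ~MSR_IA32_MISC_ENABLE_XD_DISABLE);
        /* ... then re-read CPUID leaf 0x80000001 and check NX again */
    }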

While at it, make sure we use rdmsr_safe rather than rdmsrl in the
Intel-specific init path so we don't accidentally crash if the MSR isn't
emulated while Xen is virtualized.

Signed-off-by: Alejandro Vallejo <alejandro.vallejo@cloud.com>
---
 xen/arch/x86/boot/head.S             | 60 ++++++++++++++++++++++++----
 xen/arch/x86/cpu/intel.c             | 32 +++++++--------
 xen/arch/x86/include/asm/msr-index.h |  2 +-
 3 files changed, 69 insertions(+), 25 deletions(-)

Comments

Andrew Cooper June 16, 2023, 7:43 p.m. UTC | #1
On 15/06/2023 4:31 pm, Alejandro Vallejo wrote:
> diff --git a/xen/arch/x86/boot/head.S b/xen/arch/x86/boot/head.S
> index 09bebf8635..ce62eae6f3 100644
> --- a/xen/arch/x86/boot/head.S
> +++ b/xen/arch/x86/boot/head.S
> @@ -142,8 +142,8 @@ efi_platform:
>  
>          .section .init.text, "ax", @progbits
>  
> -bad_cpu:
> -        add     $sym_offs(.Lbad_cpu_msg),%esi   # Error message
> +.Lbad_cpu:
> +        add     $sym_offs(.Lbad_cpu_msg),%esi
>          jmp     .Lget_vtb
>  not_multiboot:
>          add     $sym_offs(.Lbad_ldr_msg),%esi   # Error message
> @@ -647,15 +647,59 @@ trampoline_setup:
>          cpuid
>  1:      mov     %edx, CPUINFO_FEATURE_OFFSET(X86_FEATURE_LM) + sym_esi(boot_cpu_data)
>  
> -        /* Check for NX. Adjust EFER setting if available. */
> +        /* Check for availability of long mode. */
> +        bt      $cpufeat_bit(X86_FEATURE_LM),%edx
> +        jnc     .Lbad_cpu
> +
> +        /* Check for NX */
>          bt      $cpufeat_bit(X86_FEATURE_NX), %edx
> +        jc     .Lhas_nx_bit

This part of the diff is far more legible when rebased over the 0.5/2
patch I've just emailed out.  I've got the rebase locally if you want it.

Although I'd suggest that this label name be .Lgot_nx.

> +
> +        /*
> +         * NX appears to be unsupported, but it might be hidden.
> +         *
> +         * Intel CPUs (may) implement MSR_IA32_MISC_ENABLE. Among other
> +         * things this MSR has a bit that artificially hides NX support in
> +         * CPUID. Xen _really_ wants that feature enabled if present, so we
> +         * have to determine if (a) the MSR exists and if so (b) clear the
> +         * bit.
> +         *
> +         * For native boots this is perfectly fine because the MSR was
> +         * introduced before Netburst, which was the first family to
> +         * provide 64bit support. So we're safe simply accessing it as long
> +         * as long mode support has already been checked.
> +         *
> +         * For the virtualized case the MSR might not be emulated though,
> +         * so we make sure to do an initial check for NX in order to bypass
> +         * this MSR read

I'd suggest reordering this a bit, and swapping some what for why.  How
about:

---
NX appears to be unsupported, but it might be hidden.

The feature is part of the AMD64 spec, but the very first Intel 64bit
CPUs lacked the feature, and thereafter there was a firmware knob to
disable the feature.  Undo the disable if possible.

All 64bit Intel CPUs support this MSR.  If virtualised, expect the
hypervisor to either emulate the MSR or give us NX.
---

> +         */
> +        xor     %eax,%eax
> +        cpuid
> +        cmpl    $X86_VENDOR_INTEL_EBX,%ebx

Where possible, use spaces after commas and drop the size suffixes (the l
on cmpl here), even though it's not strictly relevant in this instance.

> +        jnz     .Lno_nx_bit

The "_bit" is still redundant.  Simply .Lno_nx will do.

> +        cmpl    $X86_VENDOR_INTEL_EDX,%edx
> +        jnz     .Lno_nx_bit
> +        cmpl    $X86_VENDOR_INTEL_ECX,%ecx
> +        jnz     .Lno_nx_bit
> +
> +        /* Clear the XD_DISABLE bit */
> +        movl    $MSR_IA32_MISC_ENABLE, %ecx
> +        rdmsr
> +        btrl    $2, %edx

It's unfortunate that we don't have an asm-safe ilog2().  Oh well.
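
For the record, the 2 works out as: XD_DISABLE is bit 34 of the 64bit MSR
value, and rdmsr returns bits 63:32 in %edx, so the %edx bit is
34 - 32 = 2.  With a hypothetical asm-safe ilog2() this could read:

        btrl    $ilog2(MSR_IA32_MISC_ENABLE_XD_DISABLE >> 32), %edx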

>          jnc     1f
> -        orb     $EFER_NXE >> 8, 1 + sym_esi(trampoline_efer)
> -1:
> +        wrmsr
> +        orb     $MSR_IA32_MISC_ENABLE_XD_DISABLE >> 32, 4 + sym_esi(trampoline_misc_enable_off)
>  
> -        /* Check for availability of long mode. */
> -        bt      $cpufeat_bit(X86_FEATURE_LM),%edx
> -        jnc     bad_cpu
> +1:      /* Check again for NX */

Failing to clear XD_DISABLE wants to go to .Lno_nx, not to re-scan CPUID
having done nothing to the MSR.
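
In other words, roughly (an untested sketch of the suggested flow):

        btrl    $2, %edx
        jnc     .Lno_nx                 /* already clear: nothing to undo */
        wrmsr
        orb     $MSR_IA32_MISC_ENABLE_XD_DISABLE >> 32, 4 + sym_esi(trampoline_misc_enable_off)
        /* fall through: re-scan CPUID leaf 0x80000001 for NX */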

> +        mov     $0x80000001,%eax
> +        cpuid
> +        bt      $cpufeat_bit(X86_FEATURE_NX), %edx
> +        jnc     .Lno_nx_bit
> +
> +.Lhas_nx_bit:
> +        /* Adjust EFER is NX is present */

"Adjust EFER, given that NX is present".

> diff --git a/xen/arch/x86/cpu/intel.c b/xen/arch/x86/cpu/intel.c
> index 168cd58f36..46b0cd8dbb 100644
> --- a/xen/arch/x86/cpu/intel.c
> +++ b/xen/arch/x86/cpu/intel.c
> @@ -305,23 +305,23 @@ static void cf_check early_init_intel(struct cpuinfo_x86 *c)
>  		c->x86_cache_alignment = 128;
>  
>  	/* Unmask CPUID levels and NX if masked: */
> -	rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
> -
> -	disable = misc_enable & (MSR_IA32_MISC_ENABLE_LIMIT_CPUID |
> -				 MSR_IA32_MISC_ENABLE_XD_DISABLE);
> -	if (disable) {
> -		wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable & ~disable);
> -		bootsym(trampoline_misc_enable_off) |= disable;
> -		bootsym(trampoline_efer) |= EFER_NXE;
> -	}
> +	if (rdmsr_safe(MSR_IA32_MISC_ENABLE, misc_enable) == 0) {

There's no need to change rdmsrl() to rdmsr_safe(), and not doing so
will shrink this diff a lot.

The only thing you need to do is alter the re-enable NX printk(), which
probably wants to move ahead of the "if (disable)" and source itself
from bootsym(trampoline_misc_enable_off) instead.
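
That is, roughly (a sketch of the suggested shape, not a tested patch):

	rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);

	if (bootsym(trampoline_misc_enable_off) & MSR_IA32_MISC_ENABLE_XD_DISABLE)
		printk(KERN_INFO
		       "re-enabled NX (Execute Disable) protection\n");

	disable = misc_enable & (MSR_IA32_MISC_ENABLE_LIMIT_CPUID |
				 MSR_IA32_MISC_ENABLE_XD_DISABLE);
	if (disable) {
		/* ... unchanged ... */
	}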

~Andrew
Alejandro Vallejo June 21, 2023, 4:43 p.m. UTC | #2
Sure, agreed to everything before this.

> > diff --git a/xen/arch/x86/cpu/intel.c b/xen/arch/x86/cpu/intel.c
> > index 168cd58f36..46b0cd8dbb 100644
> > --- a/xen/arch/x86/cpu/intel.c
> > +++ b/xen/arch/x86/cpu/intel.c
> > @@ -305,23 +305,23 @@ static void cf_check early_init_intel(struct cpuinfo_x86 *c)
> >  		c->x86_cache_alignment = 128;
> >  
> >  	/* Unmask CPUID levels and NX if masked: */
> > -	rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
> > -
> > -	disable = misc_enable & (MSR_IA32_MISC_ENABLE_LIMIT_CPUID |
> > -				 MSR_IA32_MISC_ENABLE_XD_DISABLE);
> > -	if (disable) {
> > -		wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable & ~disable);
> > -		bootsym(trampoline_misc_enable_off) |= disable;
> > -		bootsym(trampoline_efer) |= EFER_NXE;
> > -	}
> > +	if (rdmsr_safe(MSR_IA32_MISC_ENABLE, misc_enable) == 0) {
> 
> There's no need to change rdmsrl() to rdmsr_safe(),
I thought we established before that some hypervisors might not implement
it. In that case this function would crash. More gracefully than a triple
fault, sure, but still not a very friendly thing to do.

> and not doing so will shrink this diff a lot.
This particular one, yes. That said, looking more carefully I'd say I
should remove the XD_DISABLE parts of those lines altogether. It is
unconditionally cleared on boot now and will never ever have a non-zero
value at this point.
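
Concretely, the hunk could then shrink to something like (sketch, with
all XD_DISABLE handling dropped):

	/* Unmask CPUID levels if masked: */
	rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
	disable = misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
	if (disable) {
		wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable & ~disable);
		bootsym(trampoline_misc_enable_off) |= disable;
		printk(KERN_INFO "revised cpuid level: %d\n", cpuid_eax(0));
	}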

> The only thing you need to do is alter the re-enable NX printk(), which
> probably wants to move ahead of the "if (disable)" and source itself
> from bootsym(trampoline_misc_enable_off) instead.
Alternatively I could just get rid of the printk. As it is, after this patch
even the BSP starts off with NX enabled. EFER is updated as early as the
trampoline, which is as early as it would've happened before, regardless of
XD_DISABLE.

Alejandro
Jan Beulich June 22, 2023, 7:54 a.m. UTC | #3
On 21.06.2023 18:43, Alejandro Vallejo wrote:
> Sure, to everything before this
> 
>>> diff --git a/xen/arch/x86/cpu/intel.c b/xen/arch/x86/cpu/intel.c
>>> index 168cd58f36..46b0cd8dbb 100644
>>> --- a/xen/arch/x86/cpu/intel.c
>>> +++ b/xen/arch/x86/cpu/intel.c
>>> @@ -305,23 +305,23 @@ static void cf_check early_init_intel(struct cpuinfo_x86 *c)
>>>  		c->x86_cache_alignment = 128;
>>>  
>>>  	/* Unmask CPUID levels and NX if masked: */
>>> -	rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
>>> -
>>> -	disable = misc_enable & (MSR_IA32_MISC_ENABLE_LIMIT_CPUID |
>>> -				 MSR_IA32_MISC_ENABLE_XD_DISABLE);
>>> -	if (disable) {
>>> -		wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable & ~disable);
>>> -		bootsym(trampoline_misc_enable_off) |= disable;
>>> -		bootsym(trampoline_efer) |= EFER_NXE;
>>> -	}
>>> +	if (rdmsr_safe(MSR_IA32_MISC_ENABLE, misc_enable) == 0) {
>>
>> There's no need to change rdmsrl() to rdmsr_safe(),
> I thought we established before that some hypervisors might not implement it. In
> that case this function would crash. More gracefully than a triple fault,
> sure, but still not a very friendly thing to do.

Yet then in early boot code you also don't (and can't) recover from getting
#GP(0), in case that might really happen.

Jan
Alejandro Vallejo June 22, 2023, 3:49 p.m. UTC | #4
On Thu, Jun 22, 2023 at 09:54:01AM +0200, Jan Beulich wrote:
> On 21.06.2023 18:43, Alejandro Vallejo wrote:
> > Sure, to everything before this
> > 
> >>> diff --git a/xen/arch/x86/cpu/intel.c b/xen/arch/x86/cpu/intel.c
> >>> index 168cd58f36..46b0cd8dbb 100644
> >>> --- a/xen/arch/x86/cpu/intel.c
> >>> +++ b/xen/arch/x86/cpu/intel.c
> >>> @@ -305,23 +305,23 @@ static void cf_check early_init_intel(struct cpuinfo_x86 *c)
> >>>  		c->x86_cache_alignment = 128;
> >>>  
> >>>  	/* Unmask CPUID levels and NX if masked: */
> >>> -	rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
> >>> -
> >>> -	disable = misc_enable & (MSR_IA32_MISC_ENABLE_LIMIT_CPUID |
> >>> -				 MSR_IA32_MISC_ENABLE_XD_DISABLE);
> >>> -	if (disable) {
> >>> -		wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable & ~disable);
> >>> -		bootsym(trampoline_misc_enable_off) |= disable;
> >>> -		bootsym(trampoline_efer) |= EFER_NXE;
> >>> -	}
> >>> +	if (rdmsr_safe(MSR_IA32_MISC_ENABLE, misc_enable) == 0) {
> >>
> >> There's no need to change rdmsrl() to rdmsr_safe(),
> > I thought we established before that some hypervisors might not implement it. In
> > that case this function would crash. More gracefully than a triple fault,
> > sure, but still not a very friendly thing to do.
> 
> Yet then in early boot code you also don't (and can't) recover from getting
> #GP(0), in case that might really happen.
> 
> Jan
That's true, strictly speaking, but that case is restricted to the
incredibly unlikely case of "no NX and no MSR". As is, if we ever boot
on an Intel machine without that MSR we'll hit #GP(0), regardless of NX.

Not that it matters a whole lot though. It's pretty unlikely we'll ever
trip there.

Alejandro
Jan Beulich June 23, 2023, 6:49 a.m. UTC | #5
On 22.06.2023 17:49, Alejandro Vallejo wrote:
> On Thu, Jun 22, 2023 at 09:54:01AM +0200, Jan Beulich wrote:
>> On 21.06.2023 18:43, Alejandro Vallejo wrote:
>>> Sure, to everything before this
>>>
>>>>> diff --git a/xen/arch/x86/cpu/intel.c b/xen/arch/x86/cpu/intel.c
>>>>> index 168cd58f36..46b0cd8dbb 100644
>>>>> --- a/xen/arch/x86/cpu/intel.c
>>>>> +++ b/xen/arch/x86/cpu/intel.c
>>>>> @@ -305,23 +305,23 @@ static void cf_check early_init_intel(struct cpuinfo_x86 *c)
>>>>>  		c->x86_cache_alignment = 128;
>>>>>  
>>>>>  	/* Unmask CPUID levels and NX if masked: */
>>>>> -	rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
>>>>> -
>>>>> -	disable = misc_enable & (MSR_IA32_MISC_ENABLE_LIMIT_CPUID |
>>>>> -				 MSR_IA32_MISC_ENABLE_XD_DISABLE);
>>>>> -	if (disable) {
>>>>> -		wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable & ~disable);
>>>>> -		bootsym(trampoline_misc_enable_off) |= disable;
>>>>> -		bootsym(trampoline_efer) |= EFER_NXE;
>>>>> -	}
>>>>> +	if (rdmsr_safe(MSR_IA32_MISC_ENABLE, misc_enable) == 0) {
>>>>
>>>> There's no need to change rdmsrl() to rdmsr_safe(),
> >>> I thought we established before that some hypervisors might not implement it. In
>>> that case this function would crash. More gracefully than a triple fault,
>>> sure, but still not a very friendly thing to do.
>>
>> Yet then in early boot code you also don't (and can't) recover from getting
>> #GP(0), in case that might really happen.
>>
> That's true, strictly speaking, but that case is restricted to the
> incredibly unlikely case of "no NX and no MSR". As is, if we ever boot
> on an Intel machine without that MSR we'll hit #GP(0), regardless of NX.

Well, my comment was meant to support Andrew's earlier one. Why
check the MSR access here when we don't earlier on? Plus isn't
it the case that the MSR can be implied to exist when NX is (in
principle) available, as that "opt-out" existed specifically
when Intel first introduced NX on their CPUs? If so,
- when NX is not available, we would have crashed much earlier,
- when NX is available (or was turned on successfully) the MSR
  exists either because we wrote to it earlier or because we're
  on a newer CPU.

Jan

Patch

diff --git a/xen/arch/x86/boot/head.S b/xen/arch/x86/boot/head.S
index 09bebf8635..ce62eae6f3 100644
--- a/xen/arch/x86/boot/head.S
+++ b/xen/arch/x86/boot/head.S
@@ -142,8 +142,8 @@  efi_platform:
 
         .section .init.text, "ax", @progbits
 
-bad_cpu:
-        add     $sym_offs(.Lbad_cpu_msg),%esi   # Error message
+.Lbad_cpu:
+        add     $sym_offs(.Lbad_cpu_msg),%esi
         jmp     .Lget_vtb
 not_multiboot:
         add     $sym_offs(.Lbad_ldr_msg),%esi   # Error message
@@ -647,15 +647,59 @@  trampoline_setup:
         cpuid
 1:      mov     %edx, CPUINFO_FEATURE_OFFSET(X86_FEATURE_LM) + sym_esi(boot_cpu_data)
 
-        /* Check for NX. Adjust EFER setting if available. */
+        /* Check for availability of long mode. */
+        bt      $cpufeat_bit(X86_FEATURE_LM),%edx
+        jnc     .Lbad_cpu
+
+        /* Check for NX */
         bt      $cpufeat_bit(X86_FEATURE_NX), %edx
+        jc     .Lhas_nx_bit
+
+        /*
+         * NX appears to be unsupported, but it might be hidden.
+         *
+         * Intel CPUs (may) implement MSR_IA32_MISC_ENABLE. Among other
+         * things this MSR has a bit that artificially hides NX support in
+         * CPUID. Xen _really_ wants that feature enabled if present, so we
+         * have to determine if (a) the MSR exists and if so (b) clear the
+         * bit.
+         *
+         * For native boots this is perfectly fine because the MSR was
+         * introduced before Netburst, which was the first family to
+         * provide 64bit support. So we're safe simply accessing it as long
+         * as long mode support has already been checked.
+         *
+         * For the virtualized case the MSR might not be emulated though,
+         * so we make sure to do an initial check for NX in order to bypass
+         * this MSR read
+         */
+        xor     %eax,%eax
+        cpuid
+        cmpl    $X86_VENDOR_INTEL_EBX,%ebx
+        jnz     .Lno_nx_bit
+        cmpl    $X86_VENDOR_INTEL_EDX,%edx
+        jnz     .Lno_nx_bit
+        cmpl    $X86_VENDOR_INTEL_ECX,%ecx
+        jnz     .Lno_nx_bit
+
+        /* Clear the XD_DISABLE bit */
+        movl    $MSR_IA32_MISC_ENABLE, %ecx
+        rdmsr
+        btrl    $2, %edx
         jnc     1f
-        orb     $EFER_NXE >> 8, 1 + sym_esi(trampoline_efer)
-1:
+        wrmsr
+        orb     $MSR_IA32_MISC_ENABLE_XD_DISABLE >> 32, 4 + sym_esi(trampoline_misc_enable_off)
 
-        /* Check for availability of long mode. */
-        bt      $cpufeat_bit(X86_FEATURE_LM),%edx
-        jnc     bad_cpu
+1:      /* Check again for NX */
+        mov     $0x80000001,%eax
+        cpuid
+        bt      $cpufeat_bit(X86_FEATURE_NX), %edx
+        jnc     .Lno_nx_bit
+
+.Lhas_nx_bit:
+        /* Adjust EFER is NX is present */
+        orb     $EFER_NXE >> 8, 1 + sym_esi(trampoline_efer)
+.Lno_nx_bit:
 
         /* Stash TSC to calculate a good approximation of time-since-boot */
         rdtsc
diff --git a/xen/arch/x86/cpu/intel.c b/xen/arch/x86/cpu/intel.c
index 168cd58f36..46b0cd8dbb 100644
--- a/xen/arch/x86/cpu/intel.c
+++ b/xen/arch/x86/cpu/intel.c
@@ -305,23 +305,23 @@  static void cf_check early_init_intel(struct cpuinfo_x86 *c)
 		c->x86_cache_alignment = 128;
 
 	/* Unmask CPUID levels and NX if masked: */
-	rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
-
-	disable = misc_enable & (MSR_IA32_MISC_ENABLE_LIMIT_CPUID |
-				 MSR_IA32_MISC_ENABLE_XD_DISABLE);
-	if (disable) {
-		wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable & ~disable);
-		bootsym(trampoline_misc_enable_off) |= disable;
-		bootsym(trampoline_efer) |= EFER_NXE;
-	}
+	if (rdmsr_safe(MSR_IA32_MISC_ENABLE, misc_enable) == 0) {
+		disable = misc_enable & (MSR_IA32_MISC_ENABLE_LIMIT_CPUID |
+					 MSR_IA32_MISC_ENABLE_XD_DISABLE);
+		if (disable) {
+			wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable & ~disable);
+			bootsym(trampoline_misc_enable_off) |= disable;
+			bootsym(trampoline_efer) |= EFER_NXE;
+		}
 
-	if (disable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID)
-		printk(KERN_INFO "revised cpuid level: %d\n",
-		       cpuid_eax(0));
-	if (disable & MSR_IA32_MISC_ENABLE_XD_DISABLE) {
-		write_efer(read_efer() | EFER_NXE);
-		printk(KERN_INFO
-		       "re-enabled NX (Execute Disable) protection\n");
+		if (disable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID)
+			printk(KERN_INFO "revised cpuid level: %d\n",
+			       cpuid_eax(0));
+		if (disable & MSR_IA32_MISC_ENABLE_XD_DISABLE) {
+			write_efer(read_efer() | EFER_NXE);
+			printk(KERN_INFO
+			       "re-enabled NX (Execute Disable) protection\n");
+		}
 	}
 
 	/* CPUID workaround for Intel 0F33/0F34 CPU */
diff --git a/xen/arch/x86/include/asm/msr-index.h b/xen/arch/x86/include/asm/msr-index.h
index 2749e433d2..4f861c0bb4 100644
--- a/xen/arch/x86/include/asm/msr-index.h
+++ b/xen/arch/x86/include/asm/msr-index.h
@@ -502,7 +502,7 @@ 
 #define MSR_IA32_MISC_ENABLE_MONITOR_ENABLE (1<<18)
 #define MSR_IA32_MISC_ENABLE_LIMIT_CPUID  (1<<22)
 #define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1<<23)
-#define MSR_IA32_MISC_ENABLE_XD_DISABLE	(1ULL << 34)
+#define MSR_IA32_MISC_ENABLE_XD_DISABLE   (_AC(1, ULL) << 34)
 
 #define MSR_IA32_TSC_DEADLINE		0x000006E0
 #define MSR_IA32_ENERGY_PERF_BIAS	0x000001b0
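
A note on the final msr-index.h hunk: the _AC() form matters because the
constant is now used from assembly (head.S), where a ULL suffix doesn't
parse.  Xen's const.h defines it along these lines (paraphrased):

    #ifdef __ASSEMBLY__
    #define _AC(X, Y)  X             /* assembler: drop the suffix */
    #else
    #define __AC(X, Y) (X ## Y)
    #define _AC(X, Y)  __AC(X, Y)    /* C: paste the suffix on */
    #endif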