diff mbox series

[v4] x86/dom0: delay setting SMAP after dom0 build is done

Message ID 20240802111244.99340-1-roger.pau@citrix.com (mailing list archive)
State New
Headers show
Series [v4] x86/dom0: delay setting SMAP after dom0 build is done | expand

Commit Message

Roger Pau Monné Aug. 2, 2024, 11:12 a.m. UTC
Delay setting X86_CR4_SMAP on the BSP until the domain building is done, so
that there's no need to disable SMAP.  Note however that SMAP is enabled for
the APs on bringup, as domain builder code strictly run on the BSP.  Delaying
the setting for the APs would mean having to do a callfunc IPI later in order
to set it on all the APs.

The fixes tag is to account for the wrong usage of cpu_has_smap in
create_dom0(), it should instead have used
boot_cpu_has(X86_FEATURE_XEN_SMAP).

While there also make cr4_pv32_mask __ro_after_init.

Fixes: 493ab190e5b1 ('xen/sm{e, a}p: allow disabling sm{e, a}p for Xen itself')
Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
Changes since v3:
 - Be more selective when setting cr4_pv32_mask.

Changes since v2:
 - Change approach.
 - Add fixes tag.
---
 xen/arch/x86/setup.c | 48 +++++++++++++++++++++++++-------------------
 1 file changed, 27 insertions(+), 21 deletions(-)

Comments

Jan Beulich Aug. 5, 2024, 10:54 a.m. UTC | #1
On 02.08.2024 13:12, Roger Pau Monne wrote:
> @@ -1907,16 +1890,26 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
>      if ( cpu_has_smep && opt_smep != SMEP_HVM_ONLY )
>          setup_force_cpu_cap(X86_FEATURE_XEN_SMEP);
>      if ( boot_cpu_has(X86_FEATURE_XEN_SMEP) )
> +    {
>          set_in_cr4(X86_CR4_SMEP);
> +        BUILD_BUG_ON(!(X86_CR4_SMEP & XEN_CR4_PV32_BITS));
> +        cr4_pv32_mask |= X86_CR4_SMEP;
> +    }
>  
>      if ( !opt_smap )
>          setup_clear_cpu_cap(X86_FEATURE_SMAP);
>      if ( cpu_has_smap && opt_smap != SMAP_HVM_ONLY )
>          setup_force_cpu_cap(X86_FEATURE_XEN_SMAP);
>      if ( boot_cpu_has(X86_FEATURE_XEN_SMAP) )
> -        set_in_cr4(X86_CR4_SMAP);
> -
> -    cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;
> +        /*
> +         * Set SMAP on the %cr4 mask so that it's set for APs on bringup, but
> +         * don't set for the BSP until domain building is done.
> +         *
> +         * Don't set it in cr4_pv32_mask either, until it's also set on the
> +         * BSP.  Otherwise the BUG in cr4_pv32_restore would trigger for events
> +         * received on the BSP.
> +         */
> +        mmu_cr4_features |= X86_CR4_SMAP;
>  
>      if ( boot_cpu_has(X86_FEATURE_FSGSBASE) )
>          set_in_cr4(X86_CR4_FSGSBASE);

A further few lines down from here we have

    trap_init();

which in turn calls cpu_init(), which in turn calls write_ptbase(). That,
however, loads CR4 from mmu_cr4_features (that's still the idle vCPU we're
on, but pv_make_cr4() wouldn't really be any better). Which I think
explains both of Marek's systems hitting

(XEN) [    3.321965] Xen SMAP violation
(XEN) [    3.325885] ----[ Xen-4.20-unstable  x86_64  debug=y  Not tainted ]----
(XEN) [    3.333755] CPU:    0
(XEN) [    3.336818] RIP:    e008:[<ffff82d04031fd8f>] init_hypercall_page+0x11/0x6b
(XEN) [    3.345032] RFLAGS: 0000000000010287   CONTEXT: hypervisor
(XEN) [    3.351622] rax: cccccccccccccccc   rbx: ffff83047da9d000   rcx: 00007cfb7a7af8a4
(XEN) [    3.360399] rdx: ffff83047da9d000   rsi: ffffffff81df5000   rdi: ffff83047da9d000
(XEN) [    3.369241] rbp: ffff82d04045f890   rsp: ffff82d04045f890   r8:  ffff83048a9519c8
(XEN) [    3.378023] r9:  ffff8304800a83e0   r10: 0000000000000031   r11: 000000000000000a
(XEN) [    3.386852] r12: 0000000000000000   r13: 0000000000000000   r14: ffffffff83630000
(XEN) [    3.395653] r15: ffff83000009ef90   cr0: 000000008005003b   cr4: 0000000000f526e0
(XEN) [    3.404424] cr3: 000000047b631000   cr2: ffffffff81df5000
(XEN) [    3.410964] fsb: 0000000000000000   gsb: 0000000000000000   gss: 0000000000000000
(XEN) [    3.419746] ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: 0000   cs: e008
(XEN) [    3.427978] Xen code around <ffff82d04031fd8f> (init_hypercall_page+0x11/0x6b):
(XEN) [    3.436599]  cc cc cc cc cc cc cc cc <48> 89 06 48 89 86 f8 0f 00 00 48 8d 7e 08 48 83
(XEN) [    3.445861] Xen stack trace from rsp=ffff82d04045f890:
(XEN) [    3.452094]    ffff82d04045fd38 ffff82d04043e39d ffff83000009ef90 ffffffff83630000
(XEN) [    3.461028]    0000000000000000 ffff83047b631000 0000000000003800 0000000000000ff0
(XEN) [    3.469974]    0000000000000000 0000000000000000 ffff83047da9d000 ffffffff83651000
(XEN) [    3.478873]    ffffffff81000000 ffffffff83630000 0000000000000000 0000000000000000
(XEN) [    3.487772]    000000000000001f 000000000047c000 ffffffff83630000 ffffffff83800000
(XEN) [    3.496657]    000000000000001f ffff83047b631ff8 0000000000478000 0000000000003800
(XEN) [    3.505569]    ffff83047b634ff0 ffffffff83631000 0000008000800000 ffffffff80000000
(XEN) [    3.514479]    0000000000004000 0000008000000000 0000000000024743 0000000000100000
(XEN) [    3.523394]    ffff83047b6330d8 0000000000000020 0000000024742cf0 ffff82d040494b60
(XEN) [    3.532293]    ffff83047da8b000 ffffffff003e856c 0000000000000000 ffff830489bbd768
(XEN) [    3.541198]    ffff830489bbd96c 0000000000000000 0000000000000000 0000000000000000
(XEN) [    3.550090]    0000000000000001 ffff82d040462b4e ffffffff83241160 0000000000000001
(XEN) [    3.559012]    ffff82d0403e7612 ffffffff81df5000 0000000000000001 ffff82d0403e75ec
(XEN) [    3.567915]    ffffffff80000000 0000000000000001 ffff82d040462ac4 0000000000000000
(XEN) [    3.576822]    0000000000000002 ffff82d040462ad1 ffff830489bbd7a4 0000000000000002
(XEN) [    3.585720]    ffff82d0403e75b0 ffff830489bbd778 0000000000000002 ffff82d040462add
(XEN) [    3.594618]    ffff830489bbd790 0000000000000002 ffff82d0403e75a9 ffff830489bbd880
(XEN) [    3.603499]    0000000000000002 ffff82d040462aeb ffff830489bbd86c 0000000000000002
(XEN) [    3.612422]    ffff82d0403e7621 ffff830489bbd81c 0000000000000000 0000000000000000
(XEN) [    3.621314]    0000000000000000 0000000000000001 ffff82d040462af4 ffff800000000000
(XEN) [    3.630212] Xen call trace:
(XEN) [    3.633848]    [<ffff82d04031fd8f>] R init_hypercall_page+0x11/0x6b
(XEN) [    3.641314]    [<ffff82d04043e39d>] F dom0_construct_pv+0x1414/0x1fc7
(XEN) [    3.648961]    [<ffff82d0404520d9>] F construct_dom0+0xa4/0xb7
(XEN) [    3.655955]    [<ffff82d04044a185>] F __start_xen+0x2247/0x246d
(XEN) [    3.663058]    [<ffff82d040203334>] F __high_start+0x94/0xa0

I'm a little puzzled though that this doesn't already hit during
elf_load_binary(). Nevertheless I'm afraid I see no reasonable alternative
to reverting, for the time being.

Jan
diff mbox series

Patch

diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index eee20bb1753c..a7a85e94e3c8 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -80,7 +80,7 @@  int8_t __initdata opt_probe_port_aliases = -1;
 boolean_param("probe-port-aliases", opt_probe_port_aliases);
 
 /* Only used in asm code and within this source file */
-unsigned long asmlinkage __read_mostly cr4_pv32_mask;
+unsigned long asmlinkage __ro_after_init cr4_pv32_mask;
 
 /* **** Linux config option: propagated to domain0. */
 /* "acpi=off":    Sisables both ACPI table parsing and interpreter. */
@@ -955,26 +955,9 @@  static struct domain *__init create_dom0(const module_t *image,
         }
     }
 
-    /*
-     * Temporarily clear SMAP in CR4 to allow user-accesses in construct_dom0().
-     * This saves a large number of corner cases interactions with
-     * copy_from_user().
-     */
-    if ( cpu_has_smap )
-    {
-        cr4_pv32_mask &= ~X86_CR4_SMAP;
-        write_cr4(read_cr4() & ~X86_CR4_SMAP);
-    }
-
     if ( construct_dom0(d, image, headroom, initrd, cmdline) != 0 )
         panic("Could not construct domain 0\n");
 
-    if ( cpu_has_smap )
-    {
-        write_cr4(read_cr4() | X86_CR4_SMAP);
-        cr4_pv32_mask |= X86_CR4_SMAP;
-    }
-
     return d;
 }
 
@@ -1907,16 +1890,26 @@  void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
     if ( cpu_has_smep && opt_smep != SMEP_HVM_ONLY )
         setup_force_cpu_cap(X86_FEATURE_XEN_SMEP);
     if ( boot_cpu_has(X86_FEATURE_XEN_SMEP) )
+    {
         set_in_cr4(X86_CR4_SMEP);
+        BUILD_BUG_ON(!(X86_CR4_SMEP & XEN_CR4_PV32_BITS));
+        cr4_pv32_mask |= X86_CR4_SMEP;
+    }
 
     if ( !opt_smap )
         setup_clear_cpu_cap(X86_FEATURE_SMAP);
     if ( cpu_has_smap && opt_smap != SMAP_HVM_ONLY )
         setup_force_cpu_cap(X86_FEATURE_XEN_SMAP);
     if ( boot_cpu_has(X86_FEATURE_XEN_SMAP) )
-        set_in_cr4(X86_CR4_SMAP);
-
-    cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;
+        /*
+         * Set SMAP on the %cr4 mask so that it's set for APs on bringup, but
+         * don't set for the BSP until domain building is done.
+         *
+         * Don't set it in cr4_pv32_mask either, until it's also set on the
+         * BSP.  Otherwise the BUG in cr4_pv32_restore would trigger for events
+         * received on the BSP.
+         */
+        mmu_cr4_features |= X86_CR4_SMAP;
 
     if ( boot_cpu_has(X86_FEATURE_FSGSBASE) )
         set_in_cr4(X86_CR4_FSGSBASE);
@@ -2048,6 +2041,19 @@  void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
     if ( !dom0 )
         panic("Could not set up DOM0 guest OS\n");
 
+    /*
+     * Enable SMAP only after being done with the domain building phase, as the
+     * PV builder switches to the domain page-tables and must be run with SMAP
+     * disabled.
+     */
+    if ( boot_cpu_has(X86_FEATURE_XEN_SMAP) )
+    {
+        ASSERT(mmu_cr4_features & X86_CR4_SMAP);
+        write_cr4(read_cr4() | X86_CR4_SMAP);
+        BUILD_BUG_ON(!(X86_CR4_SMAP & XEN_CR4_PV32_BITS));
+        cr4_pv32_mask |= X86_CR4_SMAP;
+    }
+
     heap_init_late();
 
     init_trace_bufs();