Message ID | 20240802111244.99340-1-roger.pau@citrix.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | [v4] x86/dom0: delay setting SMAP after dom0 build is done | expand |
On 02.08.2024 13:12, Roger Pau Monne wrote: > @@ -1907,16 +1890,26 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p) > if ( cpu_has_smep && opt_smep != SMEP_HVM_ONLY ) > setup_force_cpu_cap(X86_FEATURE_XEN_SMEP); > if ( boot_cpu_has(X86_FEATURE_XEN_SMEP) ) > + { > set_in_cr4(X86_CR4_SMEP); > + BUILD_BUG_ON(!(X86_CR4_SMEP & XEN_CR4_PV32_BITS)); > + cr4_pv32_mask |= X86_CR4_SMEP; > + } > > if ( !opt_smap ) > setup_clear_cpu_cap(X86_FEATURE_SMAP); > if ( cpu_has_smap && opt_smap != SMAP_HVM_ONLY ) > setup_force_cpu_cap(X86_FEATURE_XEN_SMAP); > if ( boot_cpu_has(X86_FEATURE_XEN_SMAP) ) > - set_in_cr4(X86_CR4_SMAP); > - > - cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS; > + /* > + * Set SMAP on the %cr4 mask so that it's set for APs on bringup, but > + * don't set for the BSP until domain building is done. > + * > + * Don't set it in cr4_pv32_mask either, until it's also set on the > + * BSP. Otherwise the BUG in cr4_pv32_restore would trigger for events > + * received on the BSP. > + */ > + mmu_cr4_features |= X86_CR4_SMAP; > > if ( boot_cpu_has(X86_FEATURE_FSGSBASE) ) > set_in_cr4(X86_CR4_FSGSBASE); A further few lines down from here we have trap_init(); which in turn calls cpu_init(), which in turn calls write_ptbase(). That, however, loads CR4 from mmu_cr4_features (that's still the idle vCPU we're on, but pv_make_cr4() wouldn't really be any better). Which I think explains both of Marek's systems hitting (XEN) [ 3.321965] Xen SMAP violation (XEN) [ 3.325885] ----[ Xen-4.20-unstable x86_64 debug=y Not tainted ]---- (XEN) [ 3.333755] CPU: 0 (XEN) [ 3.336818] RIP: e008:[<ffff82d04031fd8f>] init_hypercall_page+0x11/0x6b (XEN) [ 3.345032] RFLAGS: 0000000000010287 CONTEXT: hypervisor (XEN) [ 3.351622] rax: cccccccccccccccc rbx: ffff83047da9d000 rcx: 00007cfb7a7af8a4 (XEN) [ 3.360399] rdx: ffff83047da9d000 rsi: ffffffff81df5000 rdi: ffff83047da9d000 (XEN) [ 3.369241] rbp: ffff82d04045f890 rsp: ffff82d04045f890 r8: ffff83048a9519c8 (XEN) [ 3.378023] r9: ffff8304800a83e0 r10: 0000000000000031 r11: 000000000000000a (XEN) [ 3.386852] r12: 0000000000000000 r13: 0000000000000000 r14: ffffffff83630000 (XEN) [ 3.395653] r15: ffff83000009ef90 cr0: 000000008005003b cr4: 0000000000f526e0 (XEN) [ 3.404424] cr3: 000000047b631000 cr2: ffffffff81df5000 (XEN) [ 3.410964] fsb: 0000000000000000 gsb: 0000000000000000 gss: 0000000000000000 (XEN) [ 3.419746] ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: 0000 cs: e008 (XEN) [ 3.427978] Xen code around <ffff82d04031fd8f> (init_hypercall_page+0x11/0x6b): (XEN) [ 3.436599] cc cc cc cc cc cc cc cc <48> 89 06 48 89 86 f8 0f 00 00 48 8d 7e 08 48 83 (XEN) [ 3.445861] Xen stack trace from rsp=ffff82d04045f890: (XEN) [ 3.452094] ffff82d04045fd38 ffff82d04043e39d ffff83000009ef90 ffffffff83630000 (XEN) [ 3.461028] 0000000000000000 ffff83047b631000 0000000000003800 0000000000000ff0 (XEN) [ 3.469974] 0000000000000000 0000000000000000 ffff83047da9d000 ffffffff83651000 (XEN) [ 3.478873] ffffffff81000000 ffffffff83630000 0000000000000000 0000000000000000 (XEN) [ 3.487772] 000000000000001f 000000000047c000 ffffffff83630000 ffffffff83800000 (XEN) [ 3.496657] 000000000000001f ffff83047b631ff8 0000000000478000 0000000000003800 (XEN) [ 3.505569] ffff83047b634ff0 ffffffff83631000 0000008000800000 ffffffff80000000 (XEN) [ 3.514479] 0000000000004000 0000008000000000 0000000000024743 0000000000100000 (XEN) [ 3.523394] ffff83047b6330d8 0000000000000020 0000000024742cf0 ffff82d040494b60 (XEN) [ 3.532293] ffff83047da8b000 ffffffff003e856c 0000000000000000 ffff830489bbd768 (XEN) [ 3.541198] ffff830489bbd96c 0000000000000000 0000000000000000 0000000000000000 (XEN) [ 3.550090] 0000000000000001 ffff82d040462b4e ffffffff83241160 0000000000000001 (XEN) [ 3.559012] ffff82d0403e7612 ffffffff81df5000 0000000000000001 ffff82d0403e75ec (XEN) [ 3.567915] ffffffff80000000 0000000000000001 ffff82d040462ac4 0000000000000000 (XEN) [ 3.576822] 0000000000000002 ffff82d040462ad1 ffff830489bbd7a4 0000000000000002 (XEN) [ 3.585720] ffff82d0403e75b0 ffff830489bbd778 0000000000000002 ffff82d040462add (XEN) [ 3.594618] ffff830489bbd790 0000000000000002 ffff82d0403e75a9 ffff830489bbd880 (XEN) [ 3.603499] 0000000000000002 ffff82d040462aeb ffff830489bbd86c 0000000000000002 (XEN) [ 3.612422] ffff82d0403e7621 ffff830489bbd81c 0000000000000000 0000000000000000 (XEN) [ 3.621314] 0000000000000000 0000000000000001 ffff82d040462af4 ffff800000000000 (XEN) [ 3.630212] Xen call trace: (XEN) [ 3.633848] [<ffff82d04031fd8f>] R init_hypercall_page+0x11/0x6b (XEN) [ 3.641314] [<ffff82d04043e39d>] F dom0_construct_pv+0x1414/0x1fc7 (XEN) [ 3.648961] [<ffff82d0404520d9>] F construct_dom0+0xa4/0xb7 (XEN) [ 3.655955] [<ffff82d04044a185>] F __start_xen+0x2247/0x246d (XEN) [ 3.663058] [<ffff82d040203334>] F __high_start+0x94/0xa0 I'm a little puzzled though that this doesn't already hit during elf_load_binary(). Nevertheless I'm afraid I see no reasonable alternative to reverting, for the time being. Jan
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c index eee20bb1753c..a7a85e94e3c8 100644 --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -80,7 +80,7 @@ int8_t __initdata opt_probe_port_aliases = -1; boolean_param("probe-port-aliases", opt_probe_port_aliases); /* Only used in asm code and within this source file */ -unsigned long asmlinkage __read_mostly cr4_pv32_mask; +unsigned long asmlinkage __ro_after_init cr4_pv32_mask; /* **** Linux config option: propagated to domain0. */ /* "acpi=off": Sisables both ACPI table parsing and interpreter. */ @@ -955,26 +955,9 @@ static struct domain *__init create_dom0(const module_t *image, } } - /* - * Temporarily clear SMAP in CR4 to allow user-accesses in construct_dom0(). - * This saves a large number of corner cases interactions with - * copy_from_user(). - */ - if ( cpu_has_smap ) - { - cr4_pv32_mask &= ~X86_CR4_SMAP; - write_cr4(read_cr4() & ~X86_CR4_SMAP); - } - if ( construct_dom0(d, image, headroom, initrd, cmdline) != 0 ) panic("Could not construct domain 0\n"); - if ( cpu_has_smap ) - { - write_cr4(read_cr4() | X86_CR4_SMAP); - cr4_pv32_mask |= X86_CR4_SMAP; - } - return d; } @@ -1907,16 +1890,26 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p) if ( cpu_has_smep && opt_smep != SMEP_HVM_ONLY ) setup_force_cpu_cap(X86_FEATURE_XEN_SMEP); if ( boot_cpu_has(X86_FEATURE_XEN_SMEP) ) + { set_in_cr4(X86_CR4_SMEP); + BUILD_BUG_ON(!(X86_CR4_SMEP & XEN_CR4_PV32_BITS)); + cr4_pv32_mask |= X86_CR4_SMEP; + } if ( !opt_smap ) setup_clear_cpu_cap(X86_FEATURE_SMAP); if ( cpu_has_smap && opt_smap != SMAP_HVM_ONLY ) setup_force_cpu_cap(X86_FEATURE_XEN_SMAP); if ( boot_cpu_has(X86_FEATURE_XEN_SMAP) ) - set_in_cr4(X86_CR4_SMAP); - - cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS; + /* + * Set SMAP on the %cr4 mask so that it's set for APs on bringup, but + * don't set for the BSP until domain building is done. + * + * Don't set it in cr4_pv32_mask either, until it's also set on the + * BSP. Otherwise the BUG in cr4_pv32_restore would trigger for events + * received on the BSP. + */ + mmu_cr4_features |= X86_CR4_SMAP; if ( boot_cpu_has(X86_FEATURE_FSGSBASE) ) set_in_cr4(X86_CR4_FSGSBASE); @@ -2048,6 +2041,19 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p) if ( !dom0 ) panic("Could not set up DOM0 guest OS\n"); + /* + * Enable SMAP only after being done with the domain building phase, as the + * PV builder switches to the domain page-tables and must be run with SMAP + * disabled. + */ + if ( boot_cpu_has(X86_FEATURE_XEN_SMAP) ) + { + ASSERT(mmu_cr4_features & X86_CR4_SMAP); + write_cr4(read_cr4() | X86_CR4_SMAP); + BUILD_BUG_ON(!(X86_CR4_SMAP & XEN_CR4_PV32_BITS)); + cr4_pv32_mask |= X86_CR4_SMAP; + } + heap_init_late(); init_trace_bufs();