[v2,18/18] x86/mm: zero stack on context switch

Message ID 20250108142659.99490-19-roger.pau@citrix.com (mailing list archive)
State New
Series x86: adventures in Address Space Isolation

Commit Message

Roger Pau Monné Jan. 8, 2025, 2:26 p.m. UTC
With the stack mapped on a per-CPU basis, there's no risk of other CPUs being
able to read the stack contents, but vCPUs running on the current pCPU could
read stack rubble left behind by previously scheduled vCPUs.

The #DF stack is not zeroed because handling of #DF results in a panic.

The contents of the shadow stack are not cleared as part of this change.  It's
arguable that leaking internal Xen return addresses is not guest confidential
data.  At most those could be used by an attacker to figure out the paths
inside of Xen that previous execution flows have used.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Is it required to zero the stack when doing a non-lazy context switch from the
idle vCPU to the previously running vCPU?

d0v0 -> IDLE -> sync_execstate -> zero stack? -> d0v0

This is currently done in this proposal, because when running in the idle vCPU
context (iow: not lazy switched) stacks from remote pCPUs can be mapped or
tasklets can be executed.
---
Changes since v1:
 - Zero the stack forward to use ERMS (see the sketch after this list).
 - Only zero the IST stacks if they have been used.
 - Only zero the primary stack for full context switches.
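
A minimal sketch of the forward-zeroing idea, assuming ERMS (the "Enhanced REP
MOVSB/STOSB" CPUID feature, which makes an ascending `rep stosb` the fastest
way to clear a range).  The zero_stack_range() helper below is made up for
illustration only; the patch instead emits the `rep stosb` inline from
switch_stack_and_jump() (the ZERO_STACK fragment in the hunks below), so that
the zeroing only runs after %rsp has been moved off the area being cleared:

    /* Illustrative sketch, not part of the patch. */
    static inline void zero_stack_range(void *base, unsigned long len)
    {
        /* DF is clear per the ABI, so stores ascend from base. */
        asm volatile ( "rep stosb"
                       : "+D" (base), "+c" (len)
                       : "a" (0)
                       : "memory" );
    }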
---
 docs/misc/xen-command-line.pandoc    |  4 +-
 xen/arch/x86/cpu/mcheck/mce.c        |  4 ++
 xen/arch/x86/domain.c                | 13 ++++++-
 xen/arch/x86/include/asm/current.h   | 53 +++++++++++++++++++++++---
 xen/arch/x86/include/asm/domain.h    |  3 ++
 xen/arch/x86/include/asm/spec_ctrl.h |  1 +
 xen/arch/x86/spec_ctrl.c             | 57 ++++++++++++++++++++++++----
 xen/arch/x86/traps.c                 |  5 +++
 8 files changed, 124 insertions(+), 16 deletions(-)

Patch

diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
index e7828d092098..9cde9e84aff2 100644
--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -204,7 +204,7 @@  to appropriate auditing by Xen.  Argo is disabled by default.
 
 ### asi (x86)
 > `= List of [ <bool>, {pv,hvm}=<bool>,
-               {vcpu-pt,cpu-stack}=<bool>|{pv,hvm}=<bool> ]`
+               {vcpu-pt,cpu-stack,zero-stack}=<bool>|{pv,hvm}=<bool> ]`
 
 Offers control over whether the hypervisor will engage in Address Space
 Isolation, by not having potentially sensitive information permanently mapped
@@ -224,6 +224,8 @@  meant to be used for debugging purposes only.**
 * `cpu-stack` prevent CPUs from having permanent mappings of stacks different
   than their own.  Depends on the `vcpu-pt` option.
 
+* `zero-stack` zero CPU stacks when context switching vCPUs.
+
 ### asid (x86)
 > `= <boolean>`
 
diff --git a/xen/arch/x86/cpu/mcheck/mce.c b/xen/arch/x86/cpu/mcheck/mce.c
index 9028ccde5477..eaaaefe7f8ba 100644
--- a/xen/arch/x86/cpu/mcheck/mce.c
+++ b/xen/arch/x86/cpu/mcheck/mce.c
@@ -92,10 +92,14 @@  struct mce_callbacks __ro_after_init mce_callbacks = {
 static const typeof(mce_callbacks.handler) __initconst_cf_clobber __used
     default_handler = unexpected_machine_check;
 
+DEFINE_PER_CPU(unsigned int, slice_mce_count);
+
 /* Call the installed machine check handler for this CPU setup. */
 
 void do_machine_check(const struct cpu_user_regs *regs)
 {
+    this_cpu(slice_mce_count)++;
+
     mce_enter();
     alternative_vcall(mce_callbacks.handler, regs);
     mce_exit();
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index ac6332266e95..1ff9200eb081 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -2106,6 +2106,7 @@  void context_switch(struct vcpu *prev, struct vcpu *next)
     struct cpu_info *info = get_cpu_info();
     const struct domain *prevd = prev->domain, *nextd = next->domain;
     unsigned int dirty_cpu = read_atomic(&next->dirty_cpu);
+    bool lazy = false;
 
     ASSERT(prev != next);
     ASSERT(local_irq_is_enabled());
@@ -2138,6 +2139,7 @@  void context_switch(struct vcpu *prev, struct vcpu *next)
          */
         set_current(next);
         local_irq_enable();
+        lazy = true;
     }
     else
     {
@@ -2212,12 +2214,19 @@  void context_switch(struct vcpu *prev, struct vcpu *next)
     /* Ensure that the vcpu has an up-to-date time base. */
     update_vcpu_system_time(next);
 
-    reset_stack_and_call_ind(nextd->arch.ctxt_switch->tail);
+    /*
+     * Context switches to the idle vCPU (either lazy or full) will never
+     * trigger zeroing of the stack, because the idle domain doesn't have ASI
+     * enabled.  Switching back to the previously running vCPU after a lazy
+     * switch shouldn't zero the stack either.
+     */
+    reset_stack_and_call_ind(nextd->arch.ctxt_switch->tail,
+                             !lazy && nextd->arch.zero_stack);
 }
 
 void continue_running(struct vcpu *same)
 {
-    reset_stack_and_call_ind(same->domain->arch.ctxt_switch->tail);
+    reset_stack_and_call_ind(same->domain->arch.ctxt_switch->tail, false);
 }
 
 int __sync_local_execstate(void)
diff --git a/xen/arch/x86/include/asm/current.h b/xen/arch/x86/include/asm/current.h
index 4a9776f87a7a..9abb4e55aeea 100644
--- a/xen/arch/x86/include/asm/current.h
+++ b/xen/arch/x86/include/asm/current.h
@@ -170,6 +170,12 @@  unsigned long get_stack_dump_bottom (unsigned long sp);
 # define SHADOW_STACK_WORK ""
 #endif
 
+#define ZERO_STACK                                              \
+    "test %[stk_size], %[stk_size];"                            \
+    "jz .L_skip_zeroing.%=;"                                    \
+    "rep stosb;"                                                \
+    ".L_skip_zeroing.%=:"
+
 #if __GNUC__ >= 9
 # define ssaj_has_attr_noreturn(fn) __builtin_has_attribute(fn, __noreturn__)
 #else
@@ -177,13 +183,43 @@  unsigned long get_stack_dump_bottom (unsigned long sp);
 # define ssaj_has_attr_noreturn(fn) true
 #endif
 
-#define switch_stack_and_jump(fn, instr, constr)                        \
+DECLARE_PER_CPU(unsigned int, slice_mce_count);
+DECLARE_PER_CPU(unsigned int, slice_nmi_count);
+DECLARE_PER_CPU(unsigned int, slice_db_count);
+
+#define switch_stack_and_jump(fn, instr, constr, zero_stk)              \
     ({                                                                  \
         unsigned int tmp;                                               \
+                                                                        \
         BUILD_BUG_ON(!ssaj_has_attr_noreturn(fn));                      \
+        ASSERT(IS_ALIGNED((unsigned long)guest_cpu_user_regs() -        \
+                          PRIMARY_STACK_SIZE +                          \
+                          sizeof(struct cpu_info), PAGE_SIZE));         \
+        if ( zero_stk )                                                 \
+        {                                                               \
+            unsigned long stack_top = get_stack_bottom() &              \
+                                      ~(STACK_SIZE - 1);                \
+                                                                        \
+            if ( this_cpu(slice_mce_count) )                            \
+            {                                                           \
+                this_cpu(slice_mce_count) = 0;                          \
+                clear_page((void *)stack_top + IST_MCE * PAGE_SIZE);    \
+            }                                                           \
+            if ( this_cpu(slice_nmi_count) )                            \
+            {                                                           \
+                this_cpu(slice_nmi_count) = 0;                          \
+                clear_page((void *)stack_top + IST_NMI * PAGE_SIZE);    \
+            }                                                           \
+            if ( this_cpu(slice_db_count) )                             \
+            {                                                           \
+                this_cpu(slice_db_count) = 0;                           \
+                clear_page((void *)stack_top + IST_DB  * PAGE_SIZE);    \
+            }                                                           \
+        }                                                               \
         __asm__ __volatile__ (                                          \
             SHADOW_STACK_WORK                                           \
             "mov %[stk], %%rsp;"                                        \
+            ZERO_STACK                                                  \
             CHECK_FOR_LIVEPATCH_WORK                                    \
             instr "[fun]"                                               \
             : [val] "=&r" (tmp),                                        \
@@ -194,19 +230,26 @@  unsigned long get_stack_dump_bottom (unsigned long sp);
               ((PRIMARY_SHSTK_SLOT + 1) * PAGE_SIZE - 8),               \
               [stack_mask] "i" (STACK_SIZE - 1),                        \
               _ASM_BUGFRAME_INFO(BUGFRAME_bug, __LINE__,                \
-                                 __FILE__, NULL)                        \
+                                 __FILE__, NULL),                       \
+              /* For stack zeroing. */                                  \
+              "D" ((void *)guest_cpu_user_regs() -                      \
+                   PRIMARY_STACK_SIZE + sizeof(struct cpu_info)),       \
+              [stk_size] "c"                                            \
+              ((zero_stk) ? PRIMARY_STACK_SIZE - sizeof(struct cpu_info)\
+                          : 0),                                         \
+              "a" (0)                                                   \
             : "memory" );                                               \
         unreachable();                                                  \
     })
 
 #define reset_stack_and_jump(fn)                                        \
-    switch_stack_and_jump(fn, "jmp %c", "i")
+    switch_stack_and_jump(fn, "jmp %c", "i", false)
 
 /* The constraint may only specify non-call-clobbered registers. */
-#define reset_stack_and_call_ind(fn)                                    \
+#define reset_stack_and_call_ind(fn, zero_stk)                          \
     ({                                                                  \
         (void)((fn) == (void (*)(void))NULL);                           \
-        switch_stack_and_jump(fn, "INDIRECT_CALL %", "b");              \
+        switch_stack_and_jump(fn, "INDIRECT_CALL %", "b", zero_stk);    \
     })
 
 /*
diff --git a/xen/arch/x86/include/asm/domain.h b/xen/arch/x86/include/asm/domain.h
index f83d2860c0b4..c2cbd73a42b4 100644
--- a/xen/arch/x86/include/asm/domain.h
+++ b/xen/arch/x86/include/asm/domain.h
@@ -468,6 +468,9 @@  struct arch_domain
     /* Use per-CPU mapped stacks. */
     bool cpu_stack;
 
+    /* Zero CPU stack on non-lazy context switch. */
+    bool zero_stack;
+
     /* Emulated devices enabled bitmap. */
     uint32_t emulation_flags;
 } __cacheline_aligned;
diff --git a/xen/arch/x86/include/asm/spec_ctrl.h b/xen/arch/x86/include/asm/spec_ctrl.h
index c8943e81befa..c335c5eca35d 100644
--- a/xen/arch/x86/include/asm/spec_ctrl.h
+++ b/xen/arch/x86/include/asm/spec_ctrl.h
@@ -90,6 +90,7 @@  extern int8_t opt_xpti_hwdom, opt_xpti_domu;
 
 extern int8_t opt_vcpu_pt_pv, opt_vcpu_pt_hwdom, opt_vcpu_pt_hvm;
 extern int8_t opt_cpu_stack_pv, opt_cpu_stack_hwdom, opt_cpu_stack_hvm;
+extern int8_t opt_zero_stack_pv, opt_zero_stack_hwdom, opt_zero_stack_hvm;
 
 extern bool cpu_has_bug_l1tf;
 extern int8_t opt_pv_l1tf_hwdom, opt_pv_l1tf_domu;
diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
index 4f1e912f8057..edae4b802e67 100644
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -93,6 +93,10 @@  int8_t __ro_after_init opt_vcpu_pt_pv = -1;
 int8_t __ro_after_init opt_cpu_stack_hvm = -1;
 int8_t __ro_after_init opt_cpu_stack_hwdom = -1;
 int8_t __ro_after_init opt_cpu_stack_pv = -1;
+/* Zero CPU stacks. */
+int8_t __ro_after_init opt_zero_stack_hvm = -1;
+int8_t __ro_after_init opt_zero_stack_hwdom = -1;
+int8_t __ro_after_init opt_zero_stack_pv = -1;
 
 static int __init cf_check parse_spec_ctrl(const char *s)
 {
@@ -515,6 +519,7 @@  static int __init cf_check parse_asi(const char *s)
     {
         opt_vcpu_pt_pv = opt_vcpu_pt_hwdom = opt_vcpu_pt_hvm = 1;
         opt_cpu_stack_pv = opt_cpu_stack_hwdom = opt_cpu_stack_hvm = 1;
+        opt_zero_stack_pv = opt_zero_stack_hvm = opt_zero_stack_hwdom = 1;
     }
 
     do {
@@ -529,13 +534,14 @@  static int __init cf_check parse_asi(const char *s)
         case 1:
             opt_vcpu_pt_pv = opt_vcpu_pt_hwdom = opt_vcpu_pt_hvm = val;
             opt_cpu_stack_pv = opt_cpu_stack_hvm = opt_cpu_stack_hwdom = val;
+            opt_zero_stack_pv = opt_zero_stack_hvm = opt_zero_stack_hwdom = val;
             break;
 
         default:
             if ( (val = parse_boolean("pv", s, ss)) >= 0 )
-                opt_cpu_stack_pv = opt_vcpu_pt_pv = val;
+                opt_zero_stack_pv = opt_cpu_stack_pv = opt_vcpu_pt_pv = val;
             else if ( (val = parse_boolean("hvm", s, ss)) >= 0 )
-                opt_cpu_stack_hvm = opt_vcpu_pt_hvm = val;
+                opt_zero_stack_hvm = opt_cpu_stack_hvm = opt_vcpu_pt_hvm = val;
             else if ( (val = parse_boolean("vcpu-pt", s, ss)) != -1 )
             {
                 switch ( val )
@@ -579,6 +585,28 @@  static int __init cf_check parse_asi(const char *s)
                     break;
                 }
             }
+            else if ( (val = parse_boolean("zero-stack", s, ss)) != -1 )
+            {
+                switch ( val )
+                {
+                case 1:
+                case 0:
+                    opt_zero_stack_pv = opt_zero_stack_hvm =
+                        opt_zero_stack_hwdom = val;
+                    break;
+
+                case -2:
+                    s += strlen("zero-stack=");
+                    if ( (val = parse_boolean("pv", s, ss)) >= 0 )
+                        opt_zero_stack_pv = val;
+                    else if ( (val = parse_boolean("hvm", s, ss)) >= 0 )
+                        opt_zero_stack_hvm = val;
+                    else
+                default:
+                        rc = -EINVAL;
+                    break;
+                }
+            }
             else if ( *s )
                 rc = -EINVAL;
             break;
@@ -791,17 +819,21 @@  static void __init print_details(enum ind_thunk thunk)
 #endif
 
 #ifdef CONFIG_HVM
-    printk("  ASI features for HVM VMs:%s%s%s\n",
-           opt_vcpu_pt_hvm || opt_cpu_stack_hvm      ? ""               : " None",
+    printk("  ASI features for HVM VMs:%s%s%s%s\n",
+           opt_vcpu_pt_hvm || opt_cpu_stack_hvm ||
+           opt_zero_stack_hvm                        ? ""               : " None",
            opt_vcpu_pt_hvm                           ? " vCPU-PT"       : "",
-           opt_cpu_stack_hvm                         ? " CPU-STACK"     : "");
+           opt_cpu_stack_hvm                         ? " CPU-STACK"     : "",
+           opt_zero_stack_hvm                        ? " ZERO-STACK"    : "");
 
 #endif
 #ifdef CONFIG_PV
-    printk("  ASI features for PV VMs:%s%s%s\n",
-           opt_vcpu_pt_pv || opt_cpu_stack_pv        ? ""               : " None",
+    printk("  ASI features for PV VMs:%s%s%s%s\n",
+           opt_vcpu_pt_pv || opt_cpu_stack_pv ||
+           opt_zero_stack_pv                         ? ""               : " None",
            opt_vcpu_pt_pv                            ? " vCPU-PT"       : "",
-           opt_cpu_stack_pv                          ? " CPU-STACK"     : "");
+           opt_cpu_stack_pv                          ? " CPU-STACK"     : "",
+           opt_zero_stack_pv                         ? " ZERO-STACK"    : "");
 #endif
 }
 
@@ -1912,6 +1944,9 @@  void spec_ctrl_init_domain(struct domain *d)
     d->arch.cpu_stack = is_hardware_domain(d) ? opt_cpu_stack_hwdom
                                               : pv ? opt_cpu_stack_pv
                                                    : opt_cpu_stack_hvm;
+    d->arch.zero_stack = is_hardware_domain(d) ? opt_zero_stack_hwdom
+                                               : pv ? opt_zero_stack_pv
+                                                    : opt_zero_stack_hvm;
 }
 
 void __init init_speculation_mitigations(void)
@@ -2221,6 +2256,12 @@  void __init init_speculation_mitigations(void)
         opt_cpu_stack_hwdom = 0;
     if ( opt_cpu_stack_hvm == -1 )
         opt_cpu_stack_hvm = 0;
+    if ( opt_zero_stack_pv == -1 )
+        opt_zero_stack_pv = 0;
+    if ( opt_zero_stack_hwdom == -1 )
+        opt_zero_stack_hwdom = 0;
+    if ( opt_zero_stack_hvm == -1 )
+        opt_zero_stack_hvm = 0;
 
     if ( opt_vcpu_pt_pv || opt_vcpu_pt_hvm )
         warning_add(
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index c80ef2268e94..2aa53550e8e6 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -1792,6 +1792,7 @@  static void unknown_nmi_error(const struct cpu_user_regs *regs,
 static nmi_callback_t *__read_mostly nmi_callback;
 
 DEFINE_PER_CPU(unsigned int, nmi_count);
+DEFINE_PER_CPU(unsigned int, slice_nmi_count);
 
 void do_nmi(const struct cpu_user_regs *regs)
 {
@@ -1801,6 +1802,7 @@  void do_nmi(const struct cpu_user_regs *regs)
     bool handle_unknown = false;
 
     this_cpu(nmi_count)++;
+    this_cpu(slice_nmi_count)++;
     nmi_enter();
 
     /*
@@ -1919,6 +1921,8 @@  void asmlinkage do_device_not_available(struct cpu_user_regs *regs)
 
 void nocall sysenter_eflags_saved(void);
 
+DEFINE_PER_CPU(unsigned int, slice_db_count);
+
 void asmlinkage do_debug(struct cpu_user_regs *regs)
 {
     unsigned long dr6;
@@ -1927,6 +1931,7 @@  void asmlinkage do_debug(struct cpu_user_regs *regs)
     /* Stash dr6 as early as possible. */
     dr6 = read_debugreg(6);
 
+    this_cpu(slice_db_count)++;
     /*
      * At the time of writing (March 2018), on the subject of %dr6:
      *