@@ -204,7 +204,7 @@ to appropriate auditing by Xen. Argo is disabled by default.
### asi (x86)
> `= List of [ <bool>, {pv,hvm}=<bool>,
- {vcpu-pt,cpu-stack}=<bool>|{pv,hvm}=<bool> ]`
+ {vcpu-pt,cpu-stack,zero-stack}=<bool>|{pv,hvm}=<bool> ]`
Offers control over whether the hypervisor will engage in Address Space
Isolation, by not having potentially sensitive information permanently mapped
@@ -224,6 +224,8 @@ meant to be used for debugging purposes only.**
* `cpu-stack` prevent CPUs from having permanent mappings of stacks different
than their own. Depends on the `vcpu-pt` option.
+* `zero-stack` zero the CPU stack on full (non-lazy) context switches of vCPUs.
+
### asid (x86)
> `= <boolean>`
@@ -92,10 +92,14 @@ struct mce_callbacks __ro_after_init mce_callbacks = {
static const typeof(mce_callbacks.handler) __initconst_cf_clobber __used
default_handler = unexpected_machine_check;
+DEFINE_PER_CPU(unsigned int, slice_mce_count);
+
/* Call the installed machine check handler for this CPU setup. */
void do_machine_check(const struct cpu_user_regs *regs)
{
+ this_cpu(slice_mce_count)++;
+
mce_enter();
alternative_vcall(mce_callbacks.handler, regs);
mce_exit();
@@ -2106,6 +2106,7 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
struct cpu_info *info = get_cpu_info();
const struct domain *prevd = prev->domain, *nextd = next->domain;
unsigned int dirty_cpu = read_atomic(&next->dirty_cpu);
+ bool lazy = false;
ASSERT(prev != next);
ASSERT(local_irq_is_enabled());
@@ -2138,6 +2139,7 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
*/
set_current(next);
local_irq_enable();
+ lazy = true;
}
else
{
@@ -2212,12 +2214,19 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
/* Ensure that the vcpu has an up-to-date time base. */
update_vcpu_system_time(next);
- reset_stack_and_call_ind(nextd->arch.ctxt_switch->tail);
+ /*
+ * Context switches to the idle vCPU (either lazy or full) will never
+ * trigger zeroing of the stack, because the idle domain doesn't have ASI
+ * enabled. Switching back to the previously running vCPU after a lazy
+ * switch shouldn't zero the stack either.
+ */
+ reset_stack_and_call_ind(nextd->arch.ctxt_switch->tail,
+ !lazy && nextd->arch.zero_stack);
}
void continue_running(struct vcpu *same)
{
- reset_stack_and_call_ind(same->domain->arch.ctxt_switch->tail);
+ reset_stack_and_call_ind(same->domain->arch.ctxt_switch->tail, false);
}
int __sync_local_execstate(void)
@@ -170,6 +170,12 @@ unsigned long get_stack_dump_bottom (unsigned long sp);
# define SHADOW_STACK_WORK ""
#endif
+#define ZERO_STACK \
+ "test %[stk_size], %[stk_size];" \
+ "jz .L_skip_zeroing.%=;" \
+ "rep stosb;" \
+ ".L_skip_zeroing.%=:"
+
#if __GNUC__ >= 9
# define ssaj_has_attr_noreturn(fn) __builtin_has_attribute(fn, __noreturn__)
#else
@@ -177,13 +183,43 @@ unsigned long get_stack_dump_bottom (unsigned long sp);
# define ssaj_has_attr_noreturn(fn) true
#endif
-#define switch_stack_and_jump(fn, instr, constr) \
+DECLARE_PER_CPU(unsigned int, slice_mce_count);
+DECLARE_PER_CPU(unsigned int, slice_nmi_count);
+DECLARE_PER_CPU(unsigned int, slice_db_count);
+
+#define switch_stack_and_jump(fn, instr, constr, zero_stk) \
({ \
unsigned int tmp; \
+ \
BUILD_BUG_ON(!ssaj_has_attr_noreturn(fn)); \
+ ASSERT(IS_ALIGNED((unsigned long)guest_cpu_user_regs() - \
+ PRIMARY_STACK_SIZE + \
+ sizeof(struct cpu_info), PAGE_SIZE)); \
+ if ( zero_stk ) \
+ { \
+ unsigned long stack_top = get_stack_bottom() & \
+ ~(STACK_SIZE - 1); \
+ \
+ if ( this_cpu(slice_mce_count) ) \
+ { \
+ this_cpu(slice_mce_count) = 0; \
+ clear_page((void *)stack_top + IST_MCE * PAGE_SIZE); \
+ } \
+ if ( this_cpu(slice_nmi_count) ) \
+ { \
+ this_cpu(slice_nmi_count) = 0; \
+ clear_page((void *)stack_top + IST_NMI * PAGE_SIZE); \
+ } \
+ if ( this_cpu(slice_db_count) ) \
+ { \
+ this_cpu(slice_db_count) = 0; \
+ clear_page((void *)stack_top + IST_DB * PAGE_SIZE); \
+ } \
+ } \
__asm__ __volatile__ ( \
SHADOW_STACK_WORK \
"mov %[stk], %%rsp;" \
+ ZERO_STACK \
CHECK_FOR_LIVEPATCH_WORK \
instr "[fun]" \
: [val] "=&r" (tmp), \
@@ -194,19 +230,26 @@ unsigned long get_stack_dump_bottom (unsigned long sp);
((PRIMARY_SHSTK_SLOT + 1) * PAGE_SIZE - 8), \
[stack_mask] "i" (STACK_SIZE - 1), \
_ASM_BUGFRAME_INFO(BUGFRAME_bug, __LINE__, \
- __FILE__, NULL) \
+ __FILE__, NULL), \
+ /* For stack zeroing. */ \
+ "D" ((void *)guest_cpu_user_regs() - \
+ PRIMARY_STACK_SIZE + sizeof(struct cpu_info)), \
+ [stk_size] "c" \
+ ((zero_stk) ? PRIMARY_STACK_SIZE - sizeof(struct cpu_info)\
+ : 0), \
+ "a" (0) \
: "memory" ); \
unreachable(); \
})
#define reset_stack_and_jump(fn) \
- switch_stack_and_jump(fn, "jmp %c", "i")
+ switch_stack_and_jump(fn, "jmp %c", "i", false)
/* The constraint may only specify non-call-clobbered registers. */
-#define reset_stack_and_call_ind(fn) \
+#define reset_stack_and_call_ind(fn, zero_stk) \
({ \
(void)((fn) == (void (*)(void))NULL); \
- switch_stack_and_jump(fn, "INDIRECT_CALL %", "b"); \
+ switch_stack_and_jump(fn, "INDIRECT_CALL %", "b", zero_stk); \
})
/*
@@ -468,6 +468,9 @@ struct arch_domain
/* Use per-CPU mapped stacks. */
bool cpu_stack;
+    /* Zero CPU stack on non-lazy context switch. */
+    bool zero_stack;
+
/* Emulated devices enabled bitmap. */
uint32_t emulation_flags;
} __cacheline_aligned;
@@ -90,6 +90,7 @@ extern int8_t opt_xpti_hwdom, opt_xpti_domu;
extern int8_t opt_vcpu_pt_pv, opt_vcpu_pt_hwdom, opt_vcpu_pt_hvm;
extern int8_t opt_cpu_stack_pv, opt_cpu_stack_hwdom, opt_cpu_stack_hvm;
+extern int8_t opt_zero_stack_pv, opt_zero_stack_hwdom, opt_zero_stack_hvm;
extern bool cpu_has_bug_l1tf;
extern int8_t opt_pv_l1tf_hwdom, opt_pv_l1tf_domu;
@@ -93,6 +93,10 @@ int8_t __ro_after_init opt_vcpu_pt_pv = -1;
int8_t __ro_after_init opt_cpu_stack_hvm = -1;
int8_t __ro_after_init opt_cpu_stack_hwdom = -1;
int8_t __ro_after_init opt_cpu_stack_pv = -1;
+/* Zero CPU stacks. */
+int8_t __ro_after_init opt_zero_stack_hvm = -1;
+int8_t __ro_after_init opt_zero_stack_hwdom = -1;
+int8_t __ro_after_init opt_zero_stack_pv = -1;
static int __init cf_check parse_spec_ctrl(const char *s)
{
@@ -515,6 +519,7 @@ static int __init cf_check parse_asi(const char *s)
{
opt_vcpu_pt_pv = opt_vcpu_pt_hwdom = opt_vcpu_pt_hvm = 1;
opt_cpu_stack_pv = opt_cpu_stack_hwdom = opt_cpu_stack_hvm = 1;
+ opt_zero_stack_pv = opt_zero_stack_hvm = opt_zero_stack_hwdom = 1;
}
do {
@@ -529,13 +534,14 @@ static int __init cf_check parse_asi(const char *s)
case 1:
opt_vcpu_pt_pv = opt_vcpu_pt_hwdom = opt_vcpu_pt_hvm = val;
opt_cpu_stack_pv = opt_cpu_stack_hvm = opt_cpu_stack_hwdom = val;
+ opt_zero_stack_pv = opt_zero_stack_hvm = opt_zero_stack_hwdom = val;
break;
default:
if ( (val = parse_boolean("pv", s, ss)) >= 0 )
- opt_cpu_stack_pv = opt_vcpu_pt_pv = val;
+ opt_zero_stack_pv = opt_cpu_stack_pv = opt_vcpu_pt_pv = val;
else if ( (val = parse_boolean("hvm", s, ss)) >= 0 )
- opt_cpu_stack_hvm = opt_vcpu_pt_hvm = val;
+ opt_zero_stack_hvm = opt_cpu_stack_hvm = opt_vcpu_pt_hvm = val;
else if ( (val = parse_boolean("vcpu-pt", s, ss)) != -1 )
{
switch ( val )
@@ -579,6 +585,28 @@ static int __init cf_check parse_asi(const char *s)
break;
}
}
+ else if ( (val = parse_boolean("zero-stack", s, ss)) != -1 )
+ {
+ switch ( val )
+ {
+ case 1:
+ case 0:
+ opt_zero_stack_pv = opt_zero_stack_hvm =
+ opt_zero_stack_hwdom = val;
+ break;
+
+ case -2:
+ s += strlen("zero-stack=");
+ if ( (val = parse_boolean("pv", s, ss)) >= 0 )
+ opt_zero_stack_pv = val;
+ else if ( (val = parse_boolean("hvm", s, ss)) >= 0 )
+ opt_zero_stack_hvm = val;
+ else
+ default:
+ rc = -EINVAL;
+ break;
+ }
+ }
else if ( *s )
rc = -EINVAL;
break;
@@ -791,17 +819,21 @@ static void __init print_details(enum ind_thunk thunk)
#endif
#ifdef CONFIG_HVM
- printk(" ASI features for HVM VMs:%s%s%s\n",
- opt_vcpu_pt_hvm || opt_cpu_stack_hvm ? "" : " None",
+ printk(" ASI features for HVM VMs:%s%s%s%s\n",
+ opt_vcpu_pt_hvm || opt_cpu_stack_hvm ||
+ opt_zero_stack_hvm ? "" : " None",
opt_vcpu_pt_hvm ? " vCPU-PT" : "",
- opt_cpu_stack_hvm ? " CPU-STACK" : "");
+ opt_cpu_stack_hvm ? " CPU-STACK" : "",
+ opt_zero_stack_hvm ? " ZERO-STACK" : "");
#endif
#ifdef CONFIG_PV
- printk(" ASI features for PV VMs:%s%s%s\n",
- opt_vcpu_pt_pv || opt_cpu_stack_pv ? "" : " None",
+ printk(" ASI features for PV VMs:%s%s%s%s\n",
+ opt_vcpu_pt_pv || opt_cpu_stack_pv ||
+ opt_zero_stack_pv ? "" : " None",
opt_vcpu_pt_pv ? " vCPU-PT" : "",
- opt_cpu_stack_pv ? " CPU-STACK" : "");
+ opt_cpu_stack_pv ? " CPU-STACK" : "",
+ opt_zero_stack_pv ? " ZERO-STACK" : "");
#endif
}
@@ -1912,6 +1944,9 @@ void spec_ctrl_init_domain(struct domain *d)
d->arch.cpu_stack = is_hardware_domain(d) ? opt_cpu_stack_hwdom
: pv ? opt_cpu_stack_pv
: opt_cpu_stack_hvm;
+ d->arch.zero_stack = is_hardware_domain(d) ? opt_zero_stack_hwdom
+ : pv ? opt_zero_stack_pv
+ : opt_zero_stack_hvm;
}
void __init init_speculation_mitigations(void)
@@ -2221,6 +2256,12 @@ void __init init_speculation_mitigations(void)
opt_cpu_stack_hwdom = 0;
if ( opt_cpu_stack_hvm == -1 )
opt_cpu_stack_hvm = 0;
+ if ( opt_zero_stack_pv == -1 )
+ opt_zero_stack_pv = 0;
+ if ( opt_zero_stack_hwdom == -1 )
+ opt_zero_stack_hwdom = 0;
+ if ( opt_zero_stack_hvm == -1 )
+ opt_zero_stack_hvm = 0;
if ( opt_vcpu_pt_pv || opt_vcpu_pt_hvm )
warning_add(
@@ -1792,6 +1792,7 @@ static void unknown_nmi_error(const struct cpu_user_regs *regs,
static nmi_callback_t *__read_mostly nmi_callback;
DEFINE_PER_CPU(unsigned int, nmi_count);
+DEFINE_PER_CPU(unsigned int, slice_nmi_count);
void do_nmi(const struct cpu_user_regs *regs)
{
@@ -1801,6 +1802,7 @@ void do_nmi(const struct cpu_user_regs *regs)
bool handle_unknown = false;
this_cpu(nmi_count)++;
+ this_cpu(slice_nmi_count)++;
nmi_enter();
/*
@@ -1919,6 +1921,8 @@ void asmlinkage do_device_not_available(struct cpu_user_regs *regs)
void nocall sysenter_eflags_saved(void);
+DEFINE_PER_CPU(unsigned int, slice_db_count);
+
void asmlinkage do_debug(struct cpu_user_regs *regs)
{
unsigned long dr6;
@@ -1927,6 +1931,7 @@ void asmlinkage do_debug(struct cpu_user_regs *regs)
/* Stash dr6 as early as possible. */
dr6 = read_debugreg(6);
+ this_cpu(slice_db_count)++;
/*
* At the time of writing (March 2018), on the subject of %dr6:
*
With the stack mapped on a per-CPU basis there's no risk of other CPUs
being able to read the stack contents, but vCPUs running on the current
pCPU could read stack rubble from operations of previous vCPUs.

The #DF stack is not zeroed because handling of #DF results in a panic.

The contents of the shadow stack are not cleared as part of this change.
It's arguable that leaking internal Xen return addresses is not guest
confidential data.  At most those could be used by an attacker to figure
out the paths inside of Xen previous execution flows have used.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Is it required to zero the stack when doing a non-lazy context switch
from the idle vCPU to the previously running vCPU?

d0v0 -> IDLE -> sync_execstate -> zero stack? -> d0v0

This is currently done in this proposal, as when running in the idle
vCPU context (i.e. not lazy switched) stacks from remote pCPUs can be
mapped or tasklets executed.
---
Changes since v1:
 - Zero the stack forward to use ERMS.
 - Only zero the IST stacks if they have been used.
 - Only zero the primary stack for full context switches.
---
 docs/misc/xen-command-line.pandoc    |  4 +-
 xen/arch/x86/cpu/mcheck/mce.c        |  4 ++
 xen/arch/x86/domain.c                | 13 ++++++-
 xen/arch/x86/include/asm/current.h   | 53 +++++++++++++++++++++++---
 xen/arch/x86/include/asm/domain.h    |  3 ++
 xen/arch/x86/include/asm/spec_ctrl.h |  1 +
 xen/arch/x86/spec_ctrl.c             | 57 ++++++++++++++++++++++++----
 xen/arch/x86/traps.c                 |  5 +++
 8 files changed, 124 insertions(+), 16 deletions(-)