[v2,06/14] x86/shstk: Create shadow stacks

Message ID 20200527191847.17207-7-andrew.cooper3@citrix.com (mailing list archive)
State New, archived
Series x86: Support for CET Supervisor Shadow Stacks

Commit Message

Andrew Cooper May 27, 2020, 7:18 p.m. UTC
Introduce HYPERVISOR_SHSTK pagetable constants, which are Read-Only + Dirty.
Use these in place of _PAGE_NONE for memguard_guard_stack().
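For reference, the new attributes (from the page.h hunks below) encode the CET
rule that a shadow-stack page is one which is read-only yet dirty in the
pagetables:

    #define __PAGE_HYPERVISOR_SHSTK   (__PAGE_HYPERVISOR_RO | _PAGE_DIRTY)
    #define PAGE_HYPERVISOR_SHSTK     (__PAGE_HYPERVISOR_SHSTK | _PAGE_GLOBAL)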

Supervisor shadow stacks need a token written at the top, which is most easily
done before making the frame read only.
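Concretely, the token is the linear address of the 8-byte slot it occupies,
with bit 0 (the busy bit) clear; the helper added for this (write_sss_token()
in the mm.c hunk below) is just:

    static void write_sss_token(unsigned long *ptr)
    {
        /* The token's value is its own linear address, busy bit (0) clear. */
        *ptr = (unsigned long)ptr;
    }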

Allocate the shadow IST stack block in struct tss_page.  It doesn't strictly
need to live here, but it is a convenient location (and XPTI-safe, for testing
purposes), and placing it ahead of the TSS doesn't risk colliding with a bad
IO Bitmap offset and turning into some IO port permissions.
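For reference, the resulting layout (from the processor.h hunk below) is:

    struct tss_page {
        uint64_t __aligned(PAGE_SIZE) ist_ssp[8];  /* Shadow Stack IST pointer table */
        struct tss64 tss;                          /* TSS immediately after it */
    };

Because the IO Bitmap offset is an unsigned offset from the start of the TSS,
a bogus value can only point past the TSS, never backwards into ist_ssp[].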

Have load_system_tables() set up the shadow IST stack table when setting up
the regular IST in the TSS.
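The key piece is pointing MSR_INTERRUPT_SSP_TABLE at the new per-CPU table;
from the common.c hunk below:

    wrmsrl(MSR_INTERRUPT_SSP_TABLE, (unsigned long)ist_ssp);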

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Wei Liu <wl@xen.org>
CC: Roger Pau Monné <roger.pau@citrix.com>

v2:
 * Introduce IST_SHSTK_SIZE (Name subject to improvement).
 * Skip writing the shadow stack token for !XEN_SHSTK builds.
 * Tweak clobbering to be correct and safe.
---
 xen/arch/x86/cpu/common.c         | 24 ++++++++++++++++++++++++
 xen/arch/x86/mm.c                 | 25 +++++++++++++++++++++++--
 xen/include/asm-x86/config.h      |  2 ++
 xen/include/asm-x86/page.h        |  1 +
 xen/include/asm-x86/processor.h   |  3 ++-
 xen/include/asm-x86/x86_64/page.h |  1 +
 6 files changed, 53 insertions(+), 3 deletions(-)

Comments

Jan Beulich May 28, 2020, 12:50 p.m. UTC | #1
On 27.05.2020 21:18, Andrew Cooper wrote:
> --- a/xen/arch/x86/cpu/common.c
> +++ b/xen/arch/x86/cpu/common.c
> @@ -769,6 +769,30 @@ void load_system_tables(void)
>  	tss->rsp1 = 0x8600111111111111ul;
>  	tss->rsp2 = 0x8600111111111111ul;
>  
> +	/* Set up the shadow stack IST. */
> +	if (cpu_has_xen_shstk) {
> +		volatile uint64_t *ist_ssp = this_cpu(tss_page).ist_ssp;
> +
> +		/*
> +		 * Used entries must point at the supervisor stack token.
> +		 * Unused entries are poisoned.
> +		 *
> +		 * This IST Table may be live, and the NMI/#MC entries must
> +		 * remain valid on every instruction boundary, hence the
> +		 * volatile qualifier.
> +		 */

Move this comment ahead of what it comments on, as we usually have it?

> +		ist_ssp[0] = 0x8600111111111111ul;
> +		ist_ssp[IST_MCE] = stack_top + (IST_MCE * IST_SHSTK_SIZE) - 8;
> +		ist_ssp[IST_NMI] = stack_top + (IST_NMI * IST_SHSTK_SIZE) - 8;
> +		ist_ssp[IST_DB]	 = stack_top + (IST_DB	* IST_SHSTK_SIZE) - 8;
> +		ist_ssp[IST_DF]	 = stack_top + (IST_DF	* IST_SHSTK_SIZE) - 8;

Strictly speaking you want to introduce

#define IST_SHSTK_SLOT 0

next to PRIMARY_SHSTK_SLOT and use

		ist_ssp[IST_MCE] = stack_top + (IST_SHSTK_SLOT * PAGE_SIZE) +
                                               (IST_MCE * IST_SHSTK_SIZE) - 8;

etc here. It's getting longish, so I'm not going to insist. But if you
go this route, then please also below / elsewhere.

> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -5994,12 +5994,33 @@ void memguard_unguard_range(void *p, unsigned long l)
>  
>  #endif
>  
> +static void write_sss_token(unsigned long *ptr)
> +{
> +    /*
> +     * A supervisor shadow stack token is its own linear address, with the
> +     * busy bit (0) clear.
> +     */
> +    *ptr = (unsigned long)ptr;
> +}
> +
>  void memguard_guard_stack(void *p)
>  {
> -    map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, _PAGE_NONE);
> +    /* IST Shadow stacks.  4x 1k in stack page 0. */
> +    if ( IS_ENABLED(CONFIG_XEN_SHSTK) )
> +    {
> +        write_sss_token(p + (IST_MCE * IST_SHSTK_SIZE) - 8);
> +        write_sss_token(p + (IST_NMI * IST_SHSTK_SIZE) - 8);
> +        write_sss_token(p + (IST_DB  * IST_SHSTK_SIZE) - 8);
> +        write_sss_token(p + (IST_DF  * IST_SHSTK_SIZE) - 8);

Up to now, two successive memguard_guard_stack() calls were working fine.
This will no longer be the case, just as an observation.

> +    }
> +    map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, PAGE_HYPERVISOR_SHSTK);

As already hinted at in reply to the previous patch, I think this wants
to remain _PAGE_NONE when we don't use CET-SS.

> +    /* Primary Shadow Stack.  1x 4k in stack page 5. */
>      p += PRIMARY_SHSTK_SLOT * PAGE_SIZE;
> -    map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, _PAGE_NONE);
> +    if ( IS_ENABLED(CONFIG_XEN_SHSTK) )
> +        write_sss_token(p + PAGE_SIZE - 8);
> +
> +    map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, PAGE_HYPERVISOR_SHSTK);
>  }
>  
>  void memguard_unguard_stack(void *p)

Would this function perhaps better zap the tokens?

Jan
Andrew Cooper May 29, 2020, 7:35 p.m. UTC | #2
On 28/05/2020 13:50, Jan Beulich wrote:
> On 27.05.2020 21:18, Andrew Cooper wrote:
>> --- a/xen/arch/x86/cpu/common.c
>> +++ b/xen/arch/x86/cpu/common.c
>> @@ -769,6 +769,30 @@ void load_system_tables(void)
>>  	tss->rsp1 = 0x8600111111111111ul;
>>  	tss->rsp2 = 0x8600111111111111ul;
>>  
>> +	/* Set up the shadow stack IST. */
>> +	if (cpu_has_xen_shstk) {
>> +		volatile uint64_t *ist_ssp = this_cpu(tss_page).ist_ssp;
>> +
>> +		/*
>> +		 * Used entries must point at the supervisor stack token.
>> +		 * Unused entries are poisoned.
>> +		 *
>> +		 * This IST Table may be live, and the NMI/#MC entries must
>> +		 * remain valid on every instruction boundary, hence the
>> +		 * volatile qualifier.
>> +		 */
> Move this comment ahead of what it comments on, as we usually have it?
>
>> +		ist_ssp[0] = 0x8600111111111111ul;
>> +		ist_ssp[IST_MCE] = stack_top + (IST_MCE * IST_SHSTK_SIZE) - 8;
>> +		ist_ssp[IST_NMI] = stack_top + (IST_NMI * IST_SHSTK_SIZE) - 8;
>> +		ist_ssp[IST_DB]	 = stack_top + (IST_DB	* IST_SHSTK_SIZE) - 8;
>> +		ist_ssp[IST_DF]	 = stack_top + (IST_DF	* IST_SHSTK_SIZE) - 8;
> Strictly speaking you want to introduce
>
> #define IST_SHSTK_SLOT 0
>
> next to PRIMARY_SHSTK_SLOT and use
>
> 		ist_ssp[IST_MCE] = stack_top + (IST_SHSTK_SLOT * PAGE_SIZE) +
>                                                (IST_MCE * IST_SHSTK_SIZE) - 8;
>
> etc here. It's getting longish, so I'm not going to insist. But if you
> go this route, then please also below / elsewhere.

Actually no.  I've got a much better idea, based on how Linux does the
same, but it's definitely 4.15 material at this point.

>
>> --- a/xen/arch/x86/mm.c
>> +++ b/xen/arch/x86/mm.c
>> @@ -5994,12 +5994,33 @@ void memguard_unguard_range(void *p, unsigned long l)
>>  
>>  #endif
>>  
>> +static void write_sss_token(unsigned long *ptr)
>> +{
>> +    /*
>> +     * A supervisor shadow stack token is its own linear address, with the
>> +     * busy bit (0) clear.
>> +     */
>> +    *ptr = (unsigned long)ptr;
>> +}
>> +
>>  void memguard_guard_stack(void *p)
>>  {
>> -    map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, _PAGE_NONE);
>> +    /* IST Shadow stacks.  4x 1k in stack page 0. */
>> +    if ( IS_ENABLED(CONFIG_XEN_SHSTK) )
>> +    {
>> +        write_sss_token(p + (IST_MCE * IST_SHSTK_SIZE) - 8);
>> +        write_sss_token(p + (IST_NMI * IST_SHSTK_SIZE) - 8);
>> +        write_sss_token(p + (IST_DB  * IST_SHSTK_SIZE) - 8);
>> +        write_sss_token(p + (IST_DF  * IST_SHSTK_SIZE) - 8);
> Up to now, two successive memguard_guard_stack() calls were working fine.
> This will no longer be the case, just as an observation.

I don't think that matters.

>
>> +    }
>> +    map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, PAGE_HYPERVISOR_SHSTK);
> As already hinted at in reply to the previous patch, I think this wants
> to remain _PAGE_NONE when we don't use CET-SS.

The commit message discussed why that is not an option (currently), and
why I don't consider it a good idea to make possible.

>> +    /* Primary Shadow Stack.  1x 4k in stack page 5. */
>>      p += PRIMARY_SHSTK_SLOT * PAGE_SIZE;
>> -    map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, _PAGE_NONE);
>> +    if ( IS_ENABLED(CONFIG_XEN_SHSTK) )
>> +        write_sss_token(p + PAGE_SIZE - 8);
>> +
>> +    map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, PAGE_HYPERVISOR_SHSTK);
>>  }
>>  
>>  void memguard_unguard_stack(void *p)
> Would this function perhaps better zap the tokens?

Why?  We don't zap any other stack contents, and let the regular page
scrubbing clean it.

~Andrew
Andrew Cooper May 29, 2020, 9:45 p.m. UTC | #3
On 29/05/2020 20:35, Andrew Cooper wrote:
>>> +    }
>>> +    map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, PAGE_HYPERVISOR_SHSTK);
>> As already hinted at in reply to the previous patch, I think this wants
>> to remain _PAGE_NONE when we don't use CET-SS.
> The commit message discussed why that is not an option (currently), and
> why I don't consider it a good idea to make possible.

Apologies.  I thought I'd written it in the commit message, but it was
half in the previous patch, and not terribly clear.  I've reworked both.

The current practical reason is to do with clone_mappings() in the XPTI
case.

A wild off-stack read is far, far less likely than hitting the guard page
with a push first, which means that an R/O guard page is about as useful
to us, but results in a much simpler stack setup, as it doesn't vary
attributes with enabled features.

~Andrew
Jan Beulich June 2, 2020, 12:32 p.m. UTC | #4
On 29.05.2020 23:45, Andrew Cooper wrote:
> On 29/05/2020 20:35, Andrew Cooper wrote:
>>>> +    }
>>>> +    map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, PAGE_HYPERVISOR_SHSTK);
>>> As already hinted at in reply to the previous patch, I think this wants
>>> to remain _PAGE_NONE when we don't use CET-SS.
>> The commit message discussed why that is not an option (currently), and
>> why I don't consider it a good idea to make possible.
> 
> Apologies.  I thought I'd written it in the commit message, but it was
> half in the previous patch, and not terribly clear.  I've reworked both.

Thanks, I've looked at them, but it's still not really clear to me:

> The current practical reason is to do with clone_mappings() in the XPTI
> case.

What exactly is the problem here? clone_mapping(), afaict, deals
fine with non-present PTEs. The original memguard_is_stack_guard_page()
check was more of a safeguard, to avoid establishing a mapping of
a guard page as much as possible.

> A wild off-stack read is far, far less likely than hitting the guard page
> with a push first, which means that an R/O guard page is about as useful
> to us, but results in a much simpler stack setup, as it doesn't vary
> attributes with enabled features.

While OoB stack reads may indeed be less likely, such aren't necessarily
"wild" (assuming my understanding of the term is what you mean): A
function epilogue can certainly pop (by the respective insn or by
incrementing %rsp by too much) too many slots, which would be detected
earlier with non-present mappings than with r/o ones. So I'd prefer to
stick to non-present guard pages when CET-SS is not in use, unless
there's indeed a technical reason not to do so. The two uses of
PAGE_HYPERVISOR_SHSTK can't be that bad a "variation" to alternatively
make _PAGE_NONE. In fact PAGE_HYPERVISOR_SHSTK could itself resolve
to _PAGE_NONE when !cpu_has_xen_shstk ...
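Something along these lines, purely as a sketch (cpu_has_xen_shstk is a
runtime check, so the constant would have to become an expression; naming
and placement are illustrative only):

    /* Hypothetical: collapse to a non-present guard mapping when CET-SS is unused. */
    #define PAGE_HYPERVISOR_SHSTK                                     \
        (cpu_has_xen_shstk ? (__PAGE_HYPERVISOR_SHSTK | _PAGE_GLOBAL) \
                           : _PAGE_NONE)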

Jan
Jan Beulich June 2, 2020, 12:35 p.m. UTC | #5
On 29.05.2020 21:35, Andrew Cooper wrote:
> On 28/05/2020 13:50, Jan Beulich wrote:
>> On 27.05.2020 21:18, Andrew Cooper wrote:
>>> +    /* Primary Shadow Stack.  1x 4k in stack page 5. */
>>>      p += PRIMARY_SHSTK_SLOT * PAGE_SIZE;
>>> -    map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, _PAGE_NONE);
>>> +    if ( IS_ENABLED(CONFIG_XEN_SHSTK) )
>>> +        write_sss_token(p + PAGE_SIZE - 8);
>>> +
>>> +    map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, PAGE_HYPERVISOR_SHSTK);
>>>  }
>>>  
>>>  void memguard_unguard_stack(void *p)
>> Would this function perhaps better zap the tokens?
> 
> Why?  We don't zap any other stack contents, and let the regular page
> scrubbing clean it.

Except that Xen-used pages, if re-used by Xen itself, may not go
through a round of scrubbing. As long as we use 1:1 mappings,
re-using the same page for a shadow stack will end up having the
necessary token already in place. Looks like a defense-in-depth
measure to zap them off as soon as a page goes out of (shadow
stack) use.
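I.e. something like this, only as a sketch (the existing body of
memguard_unguard_stack() isn't shown in this patch, so the remapping calls
are an assumption about what it already does; the pages need to be writeable
again before the tokens can be cleared):

    void memguard_unguard_stack(void *p)
    {
        map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1,
                         PAGE_HYPERVISOR_RW);
        map_pages_to_xen((unsigned long)p + PRIMARY_SHSTK_SLOT * PAGE_SIZE,
                         virt_to_mfn(p + PRIMARY_SHSTK_SLOT * PAGE_SIZE), 1,
                         PAGE_HYPERVISOR_RW);

        if ( IS_ENABLED(CONFIG_XEN_SHSTK) )
        {
            /* Zap the IST shadow stack tokens in stack page 0 ... */
            *(unsigned long *)(p + (IST_MCE * IST_SHSTK_SIZE) - 8) = 0;
            *(unsigned long *)(p + (IST_NMI * IST_SHSTK_SIZE) - 8) = 0;
            *(unsigned long *)(p + (IST_DB  * IST_SHSTK_SIZE) - 8) = 0;
            *(unsigned long *)(p + (IST_DF  * IST_SHSTK_SIZE) - 8) = 0;

            /* ... and the primary shadow stack token in stack page 5. */
            *(unsigned long *)(p + (PRIMARY_SHSTK_SLOT + 1) * PAGE_SIZE - 8) = 0;
        }
    }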

Jan

Patch

diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c
index 690fd8baa8..dcc9ee08de 100644
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -769,6 +769,30 @@  void load_system_tables(void)
 	tss->rsp1 = 0x8600111111111111ul;
 	tss->rsp2 = 0x8600111111111111ul;
 
+	/* Set up the shadow stack IST. */
+	if (cpu_has_xen_shstk) {
+		volatile uint64_t *ist_ssp = this_cpu(tss_page).ist_ssp;
+
+		/*
+		 * Used entries must point at the supervisor stack token.
+		 * Unused entries are poisoned.
+		 *
+		 * This IST Table may be live, and the NMI/#MC entries must
+		 * remain valid on every instruction boundary, hence the
+		 * volatile qualifier.
+		 */
+		ist_ssp[0] = 0x8600111111111111ul;
+		ist_ssp[IST_MCE] = stack_top + (IST_MCE * IST_SHSTK_SIZE) - 8;
+		ist_ssp[IST_NMI] = stack_top + (IST_NMI * IST_SHSTK_SIZE) - 8;
+		ist_ssp[IST_DB]	 = stack_top + (IST_DB	* IST_SHSTK_SIZE) - 8;
+		ist_ssp[IST_DF]	 = stack_top + (IST_DF	* IST_SHSTK_SIZE) - 8;
+		for ( i = IST_DF + 1;
+		      i < ARRAY_SIZE(this_cpu(tss_page).ist_ssp); ++i )
+			ist_ssp[i] = 0x8600111111111111ul;
+
+		wrmsrl(MSR_INTERRUPT_SSP_TABLE, (unsigned long)ist_ssp);
+	}
+
 	BUILD_BUG_ON(sizeof(*tss) <= 0x67); /* Mandated by the architecture. */
 
 	_set_tssldt_desc(gdt + TSS_ENTRY, (unsigned long)tss,
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 2f1e716b6d..4d6d22cc41 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -5994,12 +5994,33 @@  void memguard_unguard_range(void *p, unsigned long l)
 
 #endif
 
+static void write_sss_token(unsigned long *ptr)
+{
+    /*
+     * A supervisor shadow stack token is its own linear address, with the
+     * busy bit (0) clear.
+     */
+    *ptr = (unsigned long)ptr;
+}
+
 void memguard_guard_stack(void *p)
 {
-    map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, _PAGE_NONE);
+    /* IST Shadow stacks.  4x 1k in stack page 0. */
+    if ( IS_ENABLED(CONFIG_XEN_SHSTK) )
+    {
+        write_sss_token(p + (IST_MCE * IST_SHSTK_SIZE) - 8);
+        write_sss_token(p + (IST_NMI * IST_SHSTK_SIZE) - 8);
+        write_sss_token(p + (IST_DB  * IST_SHSTK_SIZE) - 8);
+        write_sss_token(p + (IST_DF  * IST_SHSTK_SIZE) - 8);
+    }
+    map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, PAGE_HYPERVISOR_SHSTK);
 
+    /* Primary Shadow Stack.  1x 4k in stack page 5. */
     p += PRIMARY_SHSTK_SLOT * PAGE_SIZE;
-    map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, _PAGE_NONE);
+    if ( IS_ENABLED(CONFIG_XEN_SHSTK) )
+        write_sss_token(p + PAGE_SIZE - 8);
+
+    map_pages_to_xen((unsigned long)p, virt_to_mfn(p), 1, PAGE_HYPERVISOR_SHSTK);
 }
 
 void memguard_unguard_stack(void *p)
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index f3cf5df462..2ba234383d 100644
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -66,6 +66,8 @@ 
 #define STACK_ORDER 3
 #define STACK_SIZE  (PAGE_SIZE << STACK_ORDER)
 
+#define IST_SHSTK_SIZE 1024
+
 #define TRAMPOLINE_STACK_SPACE  PAGE_SIZE
 #define TRAMPOLINE_SPACE        (KB(64) - TRAMPOLINE_STACK_SPACE)
 #define WAKEUP_STACK_MIN        3072
diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index 5acf3d3d5a..f632affaef 100644
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -364,6 +364,7 @@  void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t);
                                    _PAGE_DIRTY | _PAGE_RW)
 #define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_PCD)
 #define __PAGE_HYPERVISOR_UC      (__PAGE_HYPERVISOR | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_HYPERVISOR_SHSTK   (__PAGE_HYPERVISOR_RO | _PAGE_DIRTY)
 
 #define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */
 
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index 8ab09cf7ed..859bd9e2ec 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -435,7 +435,8 @@  struct __packed tss64 {
     uint16_t :16, bitmap;
 };
 struct tss_page {
-    struct tss64 __aligned(PAGE_SIZE) tss;
+    uint64_t __aligned(PAGE_SIZE) ist_ssp[8];
+    struct tss64 tss;
 };
 DECLARE_PER_CPU(struct tss_page, tss_page);
 
diff --git a/xen/include/asm-x86/x86_64/page.h b/xen/include/asm-x86/x86_64/page.h
index 9876634881..26621f9519 100644
--- a/xen/include/asm-x86/x86_64/page.h
+++ b/xen/include/asm-x86/x86_64/page.h
@@ -171,6 +171,7 @@  static inline intpte_t put_pte_flags(unsigned int x)
 #define PAGE_HYPERVISOR_RW      (__PAGE_HYPERVISOR_RW      | _PAGE_GLOBAL)
 #define PAGE_HYPERVISOR_RX      (__PAGE_HYPERVISOR_RX      | _PAGE_GLOBAL)
 #define PAGE_HYPERVISOR_RWX     (__PAGE_HYPERVISOR         | _PAGE_GLOBAL)
+#define PAGE_HYPERVISOR_SHSTK   (__PAGE_HYPERVISOR_SHSTK   | _PAGE_GLOBAL)
 
 #define PAGE_HYPERVISOR         PAGE_HYPERVISOR_RW
 #define PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR_UCMINUS | \