
[59/65] x86/traps: Rework write_stub_trampoline() to not hardcode the jmp

Message ID: 20211126123446.32324-60-andrew.cooper3@citrix.com
State: New, archived
Series: x86: Support for CET Indirect Branch Tracking

Commit Message

Andrew Cooper Nov. 26, 2021, 12:34 p.m. UTC
For CET-IBT, we will need to optionally insert an endbr64 instruction at the
start of the stub.  Don't hardcode the jmp displacement assuming that it
starts at byte 24 of the stub.

Also add extra comments describing what is going on.  The mix of %rax and %rsp
is far from trivial to follow.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Wei Liu <wl@xen.org>
---
 xen/arch/x86/x86_64/traps.c | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)
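
For context, a minimal sketch of what the cursor-based rewrite enables.  The
cpu_has_xen_ibt predicate is an assumption here; the actual hook for emitting
ENDBR64 arrives in a later patch of this series:

    /* Sketch: optionally mark the stub as a legal indirect branch target. */
    if ( cpu_has_xen_ibt )
    {
        /* endbr64 encodes as F3 0F 1E FA. */
        *p++ = 0xf3;
        *p++ = 0x0f;
        *p++ = 0x1e;
        *p++ = 0xfa;
    }

Because the jmp displacement is now computed as
target_va - (stub_va + (p - stub) + 4), the rest of the stub stays correct
regardless of this 4-byte shift.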

Comments

Jan Beulich Dec. 3, 2021, 1:17 p.m. UTC | #1
On 26.11.2021 13:34, Andrew Cooper wrote:
> For CET-IBT, we will need to optionally insert an endbr64 instruction at the
> start of the stub.  Don't hardcode the jmp displacement assuming that it
> starts at byte 24 of the stub.
> 
> Also add extra comments describing what is going on.  The mix of %rax and %rsp
> is far from trivial to follow.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> ---
> CC: Jan Beulich <JBeulich@suse.com>
> CC: Roger Pau Monné <roger.pau@citrix.com>
> CC: Wei Liu <wl@xen.org>
> ---
>  xen/arch/x86/x86_64/traps.c | 36 ++++++++++++++++++++++--------------
>  1 file changed, 22 insertions(+), 14 deletions(-)
> 
> diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c
> index d661d7ffcaaf..6f3c65bedc7a 100644
> --- a/xen/arch/x86/x86_64/traps.c
> +++ b/xen/arch/x86/x86_64/traps.c
> @@ -293,30 +293,38 @@ static unsigned int write_stub_trampoline(
>      unsigned char *stub, unsigned long stub_va,
>      unsigned long stack_bottom, unsigned long target_va)
>  {
> +    unsigned char *p = stub;
> +
> +    /* Store guest %rax into %ss slot */
>      /* movabsq %rax, stack_bottom - 8 */
> -    stub[0] = 0x48;
> -    stub[1] = 0xa3;
> -    *(uint64_t *)&stub[2] = stack_bottom - 8;
> +    *p++ = 0x48;
> +    *p++ = 0xa3;
> +    *(uint64_t *)p = stack_bottom - 8;
> +    p += 8;
>  
> +    /* Store guest %rsp in %rax */
>      /* movq %rsp, %rax */
> -    stub[10] = 0x48;
> -    stub[11] = 0x89;
> -    stub[12] = 0xe0;
> +    *p++ = 0x48;
> +    *p++ = 0x89;
> +    *p++ = 0xe0;
>  
> +    /* Switch to Xen stack */
>      /* movabsq $stack_bottom - 8, %rsp */
> -    stub[13] = 0x48;
> -    stub[14] = 0xbc;
> -    *(uint64_t *)&stub[15] = stack_bottom - 8;
> +    *p++ = 0x48;
> +    *p++ = 0xbc;
> +    *(uint64_t *)p = stack_bottom - 8;
> +    p += 8;
>  
> +    /* Store guest %rsp into %rsp slot */
>      /* pushq %rax */
> -    stub[23] = 0x50;
> +    *p++ = 0x50;
>  
>      /* jmp target_va */
> -    stub[24] = 0xe9;
> -    *(int32_t *)&stub[25] = target_va - (stub_va + 29);
> +    *p++ = 0xe9;
> +    *(int32_t *)p = target_va - (stub_va + (p - stub) + 4);
> +    p += 4;
>  
> -    /* Round up to a multiple of 16 bytes. */
> -    return 32;
> +    return p - stub;
>  }

I'm concerned about you silently discarding the alignment to 16 bytes here.
Imo this really needs justifying, or perhaps even delaying until a
later change. I'd like to point out, though, that we may not have a space
issue here at all, as I think we can replace one of the MOVABSQs (using
absolute numbers to hopefully make this easier to follow):

    movabsq %rax, stack_bottom - 8
    movq %rsp, %rax
    movq -18(%rip), %rsp
    pushq %rax
    jmp target_va

This totals 26 bytes, leaving enough room for ENDBR64 without crossing
the 32-byte boundary. But I fear you may eat me for using insn bytes as
data ...
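
For reference, the byte encodings behind that count, as a sketch (offsets
relative to the stub start):

    /* off  bytes                  insn                           */
    /*   0  48 a3 <imm64>          movabsq %rax, stack_bottom - 8 */
    /*  10  48 89 e0               movq    %rsp, %rax             */
    /*  13  48 8b 25 ee ff ff ff   movq    -18(%rip), %rsp        */
    /*  20  50                     pushq   %rax                   */
    /*  21  e9 <rel32>             jmp     target_va              */

The RIP-relative movq ends at offset 20, so -18(%rip) resolves to offset 2,
i.e. the imm64 of the first movabsq, reusing the stack_bottom - 8 constant
as data.  10 + 3 + 7 + 1 + 5 = 26 bytes.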

Jan
Andrew Cooper Dec. 3, 2021, 1:59 p.m. UTC | #2
On 03/12/2021 13:17, Jan Beulich wrote:
> On 26.11.2021 13:34, Andrew Cooper wrote:
>> [...]
>> -    /* Round up to a multiple of 16 bytes. */
>> -    return 32;
>> +    return p - stub;
>>  }
> I'm concerned about you silently discarding the alignment to 16 bytes here.
> Imo this really needs justifying, or perhaps even delaying until a
> later change.

Oh.  That was an oversight, and I'm honestly a little impressed that the
result worked fine.

return ROUNDUP(p - stub, 16);

ought to do?
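
That is, the tail of write_stub_trampoline() would become (a sketch, keeping
the displacement computation from the patch):

    /* jmp target_va */
    *p++ = 0xe9;
    *(int32_t *)p = target_va - (stub_va + (p - stub) + 4);
    p += 4;

    /* Round up to a multiple of 16 bytes. */
    return ROUNDUP(p - stub, 16);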

> I'd like to point out, though, that we may not have a space
> issue here at all, as I think we can replace one of the MOVABSQs (using
> absolute numbers to hopefully make this easier to follow):
>
>     movabsq %rax, stack_bottom - 8
>     movq %rsp, %rax
>     movq -18(%rip), %rsp
>     pushq %rax
>     jmp target_va
>
> This totals 26 bytes, leaving enough room for ENDBR64 without crossing
> the 32-byte boundary. But I fear you may eat me for using insn bytes as
> data ...

Well, that's entertaining, and really quite a shame that RIP-relative
addressing only works with 32-bit displacements.

While it is shorter, it's only 3 bytes shorter, and the stack layout is
custom anyway so it really doesn't matter if the push lives here or not.

Furthermore (and probably scraping the excuses barrel here), it forces a
data-side TLB and cacheline fill where we didn't have one previously.
Modern CPUs ought to be fine here, but older ones (that don't have a
shared L2TLB) are liable to stall.

Perhaps let's leave this as an emergency option, if we need to find more
space again in the future?

~Andrew
Jan Beulich Dec. 3, 2021, 2:03 p.m. UTC | #3
On 03.12.2021 14:59, Andrew Cooper wrote:
> On 03/12/2021 13:17, Jan Beulich wrote:
>> On 26.11.2021 13:34, Andrew Cooper wrote:
>>> [...]
>>> -    /* Round up to a multiple of 16 bytes. */
>>> -    return 32;
>>> +    return p - stub;
>>>  }
>> I'm concerned about you silently discarding the alignment to 16 bytes here.
>> Imo this really needs justifying, or perhaps even delaying until a
>> later change.
> 
> Oh.  That was an oversight, and I'm honestly a little impressed that the
> result worked fine.
> 
> return ROUNDUP(p - stub, 16);
> 
> ought to do?

Yes, sure. Then
Reviewed-by: Jan Beulich <jbeulich@suse.com>

>> I'd like to point out, though, that we may not have a space
>> issue here at all, as I think we can replace one of the MOVABSQs:
>> [...]
>> This totals 26 bytes, leaving enough room for ENDBR64 without crossing
>> the 32-byte boundary. But I fear you may eat me for using insn bytes as
>> data ...
> 
> Well, that's entertaining, and really quite a shame that RIP-relative
> addressing only works with 32-bit displacements.
> 
> While it is shorter, it's only 3 bytes shorter, and the stack layout is
> custom anyway so it really doesn't matter if the push lives here or not.
> 
> Furthermore (and probably scraping the excuses barrel here), it forces a
> data-side TLB and cacheline fill where we didn't have one previously.
> Modern CPUs ought to be fine here, but older ones (that don't have a
> shared L2TLB) are liable to stall.

Well, that was why I thought you might eat me for the suggestion.

> Perhaps let's leave this as an emergency option, if we need to find more
> space again in the future?

Yeah - as said elsewhere, due to the v1.1 revisions I looked at the patches
in the wrong order, and hence wasn't yet aware that you had found a
different way to squeeze in the ENDBR.

Jan

Patch

diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c
index d661d7ffcaaf..6f3c65bedc7a 100644
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -293,30 +293,38 @@ static unsigned int write_stub_trampoline(
     unsigned char *stub, unsigned long stub_va,
     unsigned long stack_bottom, unsigned long target_va)
 {
+    unsigned char *p = stub;
+
+    /* Store guest %rax into %ss slot */
     /* movabsq %rax, stack_bottom - 8 */
-    stub[0] = 0x48;
-    stub[1] = 0xa3;
-    *(uint64_t *)&stub[2] = stack_bottom - 8;
+    *p++ = 0x48;
+    *p++ = 0xa3;
+    *(uint64_t *)p = stack_bottom - 8;
+    p += 8;
 
+    /* Store guest %rsp in %rax */
     /* movq %rsp, %rax */
-    stub[10] = 0x48;
-    stub[11] = 0x89;
-    stub[12] = 0xe0;
+    *p++ = 0x48;
+    *p++ = 0x89;
+    *p++ = 0xe0;
 
+    /* Switch to Xen stack */
     /* movabsq $stack_bottom - 8, %rsp */
-    stub[13] = 0x48;
-    stub[14] = 0xbc;
-    *(uint64_t *)&stub[15] = stack_bottom - 8;
+    *p++ = 0x48;
+    *p++ = 0xbc;
+    *(uint64_t *)p = stack_bottom - 8;
+    p += 8;
 
+    /* Store guest %rsp into %rsp slot */
     /* pushq %rax */
-    stub[23] = 0x50;
+    *p++ = 0x50;
 
     /* jmp target_va */
-    stub[24] = 0xe9;
-    *(int32_t *)&stub[25] = target_va - (stub_va + 29);
+    *p++ = 0xe9;
+    *(int32_t *)p = target_va - (stub_va + (p - stub) + 4);
+    p += 4;
 
-    /* Round up to a multiple of 16 bytes. */
-    return 32;
+    return p - stub;
 }
 
 DEFINE_PER_CPU(struct stubs, stubs);