Message ID | 20211126123446.32324-60-andrew.cooper3@citrix.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | x86: Support for CET Indirect Branch Tracking | expand |
On 26.11.2021 13:34, Andrew Cooper wrote: > For CET-IBT, we will need to optionally insert an endbr64 instruction at the > start of the stub. Don't hardcode the jmp displacement assuming that it > starts at byte 24 of the stub. > > Also add extra comments describing what is going on. The mix of %rax and %rsp > is far from trivial to follow. > > Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> > --- > CC: Jan Beulich <JBeulich@suse.com> > CC: Roger Pau Monné <roger.pau@citrix.com> > CC: Wei Liu <wl@xen.org> > --- > xen/arch/x86/x86_64/traps.c | 36 ++++++++++++++++++++++-------------- > 1 file changed, 22 insertions(+), 14 deletions(-) > > diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c > index d661d7ffcaaf..6f3c65bedc7a 100644 > --- a/xen/arch/x86/x86_64/traps.c > +++ b/xen/arch/x86/x86_64/traps.c > @@ -293,30 +293,38 @@ static unsigned int write_stub_trampoline( > unsigned char *stub, unsigned long stub_va, > unsigned long stack_bottom, unsigned long target_va) > { > + unsigned char *p = stub; > + > + /* Store guest %rax into %ss slot */ > /* movabsq %rax, stack_bottom - 8 */ > - stub[0] = 0x48; > - stub[1] = 0xa3; > - *(uint64_t *)&stub[2] = stack_bottom - 8; > + *p++ = 0x48; > + *p++ = 0xa3; > + *(uint64_t *)p = stack_bottom - 8; > + p += 8; > > + /* Store guest %rsp in %rax */ > /* movq %rsp, %rax */ > - stub[10] = 0x48; > - stub[11] = 0x89; > - stub[12] = 0xe0; > + *p++ = 0x48; > + *p++ = 0x89; > + *p++ = 0xe0; > > + /* Switch to Xen stack */ > /* movabsq $stack_bottom - 8, %rsp */ > - stub[13] = 0x48; > - stub[14] = 0xbc; > - *(uint64_t *)&stub[15] = stack_bottom - 8; > + *p++ = 0x48; > + *p++ = 0xbc; > + *(uint64_t *)p = stack_bottom - 8; > + p += 8; > > + /* Store guest %rsp into %rsp slot */ > /* pushq %rax */ > - stub[23] = 0x50; > + *p++ = 0x50; > > /* jmp target_va */ > - stub[24] = 0xe9; > - *(int32_t *)&stub[25] = target_va - (stub_va + 29); > + *p++ = 0xe9; > + *(int32_t *)p = target_va - (stub_va + (p - stub) + 4); > + p += 4; > > - /* Round up to a multiple of 16 bytes. */ > - return 32; > + return p - stub; > } I'm concerned of you silently discarding the aligning to 16 bytes here. Imo this really needs justifying, or perhaps even delaying until a later change. I'd like to point out though that we may not have a space issue here at all, as I think we can replace one of the MOVABSQ (using absolute numbers to hopefully make this easier to follow): movabsq %rax, stack_bottom - 8 movq %rsp, %rax movq -18(%rip), %rsp pushq %rax jmp target_va This totals to 26 bytes, leaving enough room for ENDBR64 without crossing the 32-byte boundary. But I fear you may eat me for using insn bytes as data ... Jan
On 03/12/2021 13:17, Jan Beulich wrote: > On 26.11.2021 13:34, Andrew Cooper wrote: >> For CET-IBT, we will need to optionally insert an endbr64 instruction at the >> start of the stub. Don't hardcode the jmp displacement assuming that it >> starts at byte 24 of the stub. >> >> Also add extra comments describing what is going on. The mix of %rax and %rsp >> is far from trivial to follow. >> >> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> >> --- >> CC: Jan Beulich <JBeulich@suse.com> >> CC: Roger Pau Monné <roger.pau@citrix.com> >> CC: Wei Liu <wl@xen.org> >> --- >> xen/arch/x86/x86_64/traps.c | 36 ++++++++++++++++++++++-------------- >> 1 file changed, 22 insertions(+), 14 deletions(-) >> >> diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c >> index d661d7ffcaaf..6f3c65bedc7a 100644 >> --- a/xen/arch/x86/x86_64/traps.c >> +++ b/xen/arch/x86/x86_64/traps.c >> @@ -293,30 +293,38 @@ static unsigned int write_stub_trampoline( >> unsigned char *stub, unsigned long stub_va, >> unsigned long stack_bottom, unsigned long target_va) >> { >> + unsigned char *p = stub; >> + >> + /* Store guest %rax into %ss slot */ >> /* movabsq %rax, stack_bottom - 8 */ >> - stub[0] = 0x48; >> - stub[1] = 0xa3; >> - *(uint64_t *)&stub[2] = stack_bottom - 8; >> + *p++ = 0x48; >> + *p++ = 0xa3; >> + *(uint64_t *)p = stack_bottom - 8; >> + p += 8; >> >> + /* Store guest %rsp in %rax */ >> /* movq %rsp, %rax */ >> - stub[10] = 0x48; >> - stub[11] = 0x89; >> - stub[12] = 0xe0; >> + *p++ = 0x48; >> + *p++ = 0x89; >> + *p++ = 0xe0; >> >> + /* Switch to Xen stack */ >> /* movabsq $stack_bottom - 8, %rsp */ >> - stub[13] = 0x48; >> - stub[14] = 0xbc; >> - *(uint64_t *)&stub[15] = stack_bottom - 8; >> + *p++ = 0x48; >> + *p++ = 0xbc; >> + *(uint64_t *)p = stack_bottom - 8; >> + p += 8; >> >> + /* Store guest %rsp into %rsp slot */ >> /* pushq %rax */ >> - stub[23] = 0x50; >> + *p++ = 0x50; >> >> /* jmp target_va */ >> - stub[24] = 0xe9; >> - *(int32_t *)&stub[25] = target_va - (stub_va + 29); >> + *p++ = 0xe9; >> + *(int32_t *)p = target_va - (stub_va + (p - stub) + 4); >> + p += 4; >> >> - /* Round up to a multiple of 16 bytes. */ >> - return 32; >> + return p - stub; >> } > I'm concerned of you silently discarding the aligning to 16 bytes here. > Imo this really needs justifying, or perhaps even delaying until a > later change. Oh. That was an oversight, and I'm honestly a little impressed that the result worked fine. return ROUNDUP(p - stub, 16); ought to do? > I'd like to point out though that we may not have a space > issue here at all, as I think we can replace one of the MOVABSQ (using > absolute numbers to hopefully make this easier to follow): > > movabsq %rax, stack_bottom - 8 > movq %rsp, %rax > movq -18(%rip), %rsp > pushq %rax > jmp target_va > > This totals to 26 bytes, leaving enough room for ENDBR64 without crossing > the 32-byte boundary. But I fear you may eat me for using insn bytes as > data ... Well that's entertaining, and really quite a shame that RIP-relative addresses only work with 32bit displacements. While it is shorter, it's only 3 bytes shorter, and the stack layout is custom anyway so it really doesn't matter if the push lives here or not. Furthermore (and probably scraping the excuses barrel here), it forces a data side TLB and cacheline fill where we didn't have one previously. Modern CPUs ought to be fine here, but older ones (that don't have a shared L2TLB) are liable to stall. Perhaps lets leave this as an emergency option, if we need to find more space again in the future? ~Andrew
On 03.12.2021 14:59, Andrew Cooper wrote: > On 03/12/2021 13:17, Jan Beulich wrote: >> On 26.11.2021 13:34, Andrew Cooper wrote: >>> For CET-IBT, we will need to optionally insert an endbr64 instruction at the >>> start of the stub. Don't hardcode the jmp displacement assuming that it >>> starts at byte 24 of the stub. >>> >>> Also add extra comments describing what is going on. The mix of %rax and %rsp >>> is far from trivial to follow. >>> >>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> >>> --- >>> CC: Jan Beulich <JBeulich@suse.com> >>> CC: Roger Pau Monné <roger.pau@citrix.com> >>> CC: Wei Liu <wl@xen.org> >>> --- >>> xen/arch/x86/x86_64/traps.c | 36 ++++++++++++++++++++++-------------- >>> 1 file changed, 22 insertions(+), 14 deletions(-) >>> >>> diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c >>> index d661d7ffcaaf..6f3c65bedc7a 100644 >>> --- a/xen/arch/x86/x86_64/traps.c >>> +++ b/xen/arch/x86/x86_64/traps.c >>> @@ -293,30 +293,38 @@ static unsigned int write_stub_trampoline( >>> unsigned char *stub, unsigned long stub_va, >>> unsigned long stack_bottom, unsigned long target_va) >>> { >>> + unsigned char *p = stub; >>> + >>> + /* Store guest %rax into %ss slot */ >>> /* movabsq %rax, stack_bottom - 8 */ >>> - stub[0] = 0x48; >>> - stub[1] = 0xa3; >>> - *(uint64_t *)&stub[2] = stack_bottom - 8; >>> + *p++ = 0x48; >>> + *p++ = 0xa3; >>> + *(uint64_t *)p = stack_bottom - 8; >>> + p += 8; >>> >>> + /* Store guest %rsp in %rax */ >>> /* movq %rsp, %rax */ >>> - stub[10] = 0x48; >>> - stub[11] = 0x89; >>> - stub[12] = 0xe0; >>> + *p++ = 0x48; >>> + *p++ = 0x89; >>> + *p++ = 0xe0; >>> >>> + /* Switch to Xen stack */ >>> /* movabsq $stack_bottom - 8, %rsp */ >>> - stub[13] = 0x48; >>> - stub[14] = 0xbc; >>> - *(uint64_t *)&stub[15] = stack_bottom - 8; >>> + *p++ = 0x48; >>> + *p++ = 0xbc; >>> + *(uint64_t *)p = stack_bottom - 8; >>> + p += 8; >>> >>> + /* Store guest %rsp into %rsp slot */ >>> /* pushq %rax */ >>> - stub[23] = 0x50; >>> + *p++ = 0x50; >>> >>> /* jmp target_va */ >>> - stub[24] = 0xe9; >>> - *(int32_t *)&stub[25] = target_va - (stub_va + 29); >>> + *p++ = 0xe9; >>> + *(int32_t *)p = target_va - (stub_va + (p - stub) + 4); >>> + p += 4; >>> >>> - /* Round up to a multiple of 16 bytes. */ >>> - return 32; >>> + return p - stub; >>> } >> I'm concerned of you silently discarding the aligning to 16 bytes here. >> Imo this really needs justifying, or perhaps even delaying until a >> later change. > > Oh. That was an oversight, and I'm honestly a little impressed that the > result worked fine. > > return ROUNDUP(p - stub, 16); > > ought to do? Yes, sure. Then Reviewed-by: Jan Beulich <jbeulich@suse.com> >> I'd like to point out though that we may not have a space >> issue here at all, as I think we can replace one of the MOVABSQ (using >> absolute numbers to hopefully make this easier to follow): >> >> movabsq %rax, stack_bottom - 8 >> movq %rsp, %rax >> movq -18(%rip), %rsp >> pushq %rax >> jmp target_va >> >> This totals to 26 bytes, leaving enough room for ENDBR64 without crossing >> the 32-byte boundary. But I fear you may eat me for using insn bytes as >> data ... > > Well that's entertaining, and really quite a shame that RIP-relative > addresses only work with 32bit displacements. > > While it is shorter, it's only 3 bytes shorter, and the stack layout is > custom anyway so it really doesn't matter if the push lives here or not. > > Furthermore (and probably scraping the excuses barrel here), it forces a > data side TLB and cacheline fill where we didn't have one previously. > Modern CPUs ought to be fine here, but older ones (that don't have a > shared L2TLB) are liable to stall. Well, that was why I though you might eat me for the suggestion. > Perhaps lets leave this as an emergency option, if we need to find more > space again in the future? Yeah - as said elsewhere, due to the v1.1-s I did look at patches in the wrong order, and hence wasn't aware yet that you have found a different way to squeeze in the ENDBR. Jan
diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c index d661d7ffcaaf..6f3c65bedc7a 100644 --- a/xen/arch/x86/x86_64/traps.c +++ b/xen/arch/x86/x86_64/traps.c @@ -293,30 +293,38 @@ static unsigned int write_stub_trampoline( unsigned char *stub, unsigned long stub_va, unsigned long stack_bottom, unsigned long target_va) { + unsigned char *p = stub; + + /* Store guest %rax into %ss slot */ /* movabsq %rax, stack_bottom - 8 */ - stub[0] = 0x48; - stub[1] = 0xa3; - *(uint64_t *)&stub[2] = stack_bottom - 8; + *p++ = 0x48; + *p++ = 0xa3; + *(uint64_t *)p = stack_bottom - 8; + p += 8; + /* Store guest %rsp in %rax */ /* movq %rsp, %rax */ - stub[10] = 0x48; - stub[11] = 0x89; - stub[12] = 0xe0; + *p++ = 0x48; + *p++ = 0x89; + *p++ = 0xe0; + /* Switch to Xen stack */ /* movabsq $stack_bottom - 8, %rsp */ - stub[13] = 0x48; - stub[14] = 0xbc; - *(uint64_t *)&stub[15] = stack_bottom - 8; + *p++ = 0x48; + *p++ = 0xbc; + *(uint64_t *)p = stack_bottom - 8; + p += 8; + /* Store guest %rsp into %rsp slot */ /* pushq %rax */ - stub[23] = 0x50; + *p++ = 0x50; /* jmp target_va */ - stub[24] = 0xe9; - *(int32_t *)&stub[25] = target_va - (stub_va + 29); + *p++ = 0xe9; + *(int32_t *)p = target_va - (stub_va + (p - stub) + 4); + p += 4; - /* Round up to a multiple of 16 bytes. */ - return 32; + return p - stub; } DEFINE_PER_CPU(struct stubs, stubs);
For CET-IBT, we will need to optionally insert an endbr64 instruction at the start of the stub. Don't hardcode the jmp displacement assuming that it starts at byte 24 of the stub. Also add extra comments describing what is going on. The mix of %rax and %rsp is far from trivial to follow. Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> --- CC: Jan Beulich <JBeulich@suse.com> CC: Roger Pau Monné <roger.pau@citrix.com> CC: Wei Liu <wl@xen.org> --- xen/arch/x86/x86_64/traps.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-)