[for_v23,16/16] x86/vdso: sgx: Rework __vdso_sgx_enter_enclave() to prefer "no callback"
diff mbox series

Message ID 20191008044613.12350-17-sean.j.christopherson@intel.com
State New
Headers show
Series
  • x86/vdso: sgx: Major vDSO cleanup
Related show

Commit Message

Sean Christopherson Oct. 8, 2019, 4:46 a.m. UTC
Rework __vdso_sgx_enter_enclave() to prioritize the flow where userspace
is not providing a callback, which is the preferred method of operation.
Using a callback requires a retpoline, and the only known motivation for
employing a callback is to allow the enclave to muck with the stack of
the untrusted runtime.

Opportunistically replace the majority of the local labels with local
symbol names to improve the readability of the code.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
---
 arch/x86/entry/vdso/vsgx_enter_enclave.S | 120 ++++++++++++++---------
 1 file changed, 71 insertions(+), 49 deletions(-)

Comments

Xing, Cedric Oct. 9, 2019, 6 p.m. UTC | #1
On 10/7/2019 9:46 PM, Sean Christopherson wrote:
> Rework __vdso_sgx_enter_enclave() to prioritize the flow where userspace
> is not providing a callback, which is the preferred method of operation.

Processors have branch predictors, so your "prioritizing" may not get
what you want.

But if you still insist, a simple ht/hnt prefixing the original branch 
instruction would have sufficed. Rearrangement of code blocks is indeed 
unnecessary.

A caveat though, for any given process, whether it supplies a callback 
or not is usually hard-coded. So either it always takes the callback 
path, or it always takes the other. And the branch predictor will do 
well in both cases. It's usually unwise to apply ht/hnt prefixes.

> Using a callback requires a retpoline, and the only known motivation for
> employing a callback is to allow the enclave to muck with the stack of
> the untrusted runtime.
> 
> Opportunistically replace the majority of the local labels with local
> symbol names to improve the readability of the code.
> 
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> ---
>   arch/x86/entry/vdso/vsgx_enter_enclave.S | 120 ++++++++++++++---------
>   1 file changed, 71 insertions(+), 49 deletions(-)
> 
> diff --git a/arch/x86/entry/vdso/vsgx_enter_enclave.S b/arch/x86/entry/vdso/vsgx_enter_enclave.S
> index de54e47c83f4..fc5622dcd2fa 100644
> --- a/arch/x86/entry/vdso/vsgx_enter_enclave.S
> +++ b/arch/x86/entry/vdso/vsgx_enter_enclave.S
> @@ -85,75 +85,97 @@ ENTRY(__vdso_sgx_enter_enclave)
>   	mov	%rsp, %rbp
>   	.cfi_def_cfa_register	%rbp
>   
> -1:	/* EENTER <= leaf <= ERESUME */
> +.Lenter_enclave:
> +	/* EENTER <= leaf <= ERESUME */
>   	cmp	$0x2, %eax
> -	jb	6f
> +	jb	.Linvalid_leaf
>   	cmp	$0x3, %eax
> -	ja	6f
> +	ja	.Linvalid_leaf
>   
>   	/* Load TCS and AEP */
>   	mov	0x10(%rbp), %rbx
> -	lea	2f(%rip), %rcx
> +	lea	.Lasync_exit_pointer(%rip), %rcx
>   
>   	/* Single ENCLU serving as both EENTER and AEP (ERESUME) */
> -2:	enclu
> +.Lasync_exit_pointer:
> +.Lenclu_eenter_eresume:
> +	enclu
>   
> -	/* EEXIT path */
> +	/* EEXIT jumps here unless the enclave is doing something fancy. */
>   	xor	%eax, %eax
> -3:	mov	%eax, %ecx
> -
> -	/* Call the exit handler if supplied */
> -	mov	0x20(%rbp), %rax
> -	test	%rax, %rax
> -	jz	7f

> -	/* Align stack per x86_64 ABI. The original %rsp is saved in %rbx to be
> -	 * restored after the exit handler returns. */
> +
> +	/* Invoke userspace's exit handler if one was provided. */
> +.Lhandle_exit:
> +	cmp	$0, 0x20(%rbp)
> +	jne	.Linvoke_userspace_handler
> +
> +.Lout:
> +	leave
> +	.cfi_def_cfa		%rsp, 8
> +	ret
> +
> +.Linvalid_leaf:

Please set frame pointer back to %rbp here, or stack unwinding will fail.

> +	mov	$(-EINVAL), %eax
> +	jmp	.Lout
> +
> +.Lhandle_exception:
> +	mov	0x18(%rbp), %rcx
> +	test    %rcx, %rcx
> +	je	.Lskip_exception_info

A single "jrcxz .Lskip_exception_info" is equivalent to the above 2 
instructions combined.

> +
> +	/* Fill optional exception info. */
> +	mov	%eax, EX_LEAF(%rcx)
> +	mov	%di,  EX_TRAPNR(%rcx)
> +	mov	%si,  EX_ERROR_CODE(%rcx)
> +	mov	%rdx, EX_ADDRESS(%rcx)
> +.Lskip_exception_info:
> +	mov	$(-EFAULT), %eax
> +	jmp	.Lhandle_exit
> +
> +.Linvoke_userspace_handler:
> +	/*
> +	 * Align stack per x86_64 ABI. Save the original %rsp in %rbx to be
> +	 * restored after the callback returns.
> +	 */
>   	mov	%rsp, %rbx
>   	and	$-0x10, %rsp
> -	/* Clear RFLAGS.DF per x86_64 ABI */
> -	cld
> -	/* Parameters for the exit handler */
> +
> +	/* Push @e, u_rsp and @tcs as parameters to the callback. */
>   	push	0x18(%rbp)
>   	push	%rbx
>   	push	0x10(%rbp)
> -	/* Call *%rax via retpoline */
> -	call	40f
> -	/* Restore %rsp to its original value left off by the enclave from last
> -	 * exit */
> +
> +	/* Pass the "return" value to the callback via %rcx. */
> +	mov	%eax, %ecx

@e (ex_info) is almost always needed by every callback as it also serves 
as the "context pointer". The return value on the other hand is 
insignificant because it could be deduced from @e->EX_LEAF anyway. So 
I'd retain %rcx and push %rax to the stack instead, given the purpose of
this patch is to squeeze out a bit of performance.

> +
> +	/* Clear RFLAGS.DF per x86_64 ABI */
> +	cld
> +
> +	/* Load the callback pointer to %rax and invoke it via retpoline. */
> +	mov	0x20(%rbp), %rax

Per X86_64 ABI, %rsp shall be 16 bytes aligned before "call". But %rsp 
here doesn't look aligned properly.

> +	call	.Lretpoline
> +
> +	/* Restore %rsp to its post-exit value. */
>   	mov	%rbx, %rsp
> -	/* Positive return value from the exit handler will be interpreted as
> -	 * an ENCLU leaf, while a non-positive value will be interpreted as the
> -	 * return value to be passed back to the caller. */
> -	jmp	1b
> -40:	/* retpoline */
> -	call	42f
> -41:	pause
> -	lfence
> -	jmp	41b
> -42:	mov	%rax, (%rsp)
> -	ret
>   
> -5:	/* Exception path */
> -	mov	0x18(%rbp), %rcx
> -	jrcxz	52f
> -	mov	%eax, EX_LEAF(%rcx)
> -	mov	%di,  EX_TRAPNR(%rcx)
> -	mov	%si,  EX_ERROR_CODE(%rcx)
> -	mov	%rdx, EX_ADDRESS(%rcx)
> -52:	mov	$-EFAULT, %eax
> -	jmp	3b
> -
> -6:	/* Unsupported ENCLU leaf */
> +	/*
> +	 * If the return from callback is zero or negative, return immediately,
> +	 * else re-execute ENCLU with the postive return value interpreted as
> +	 * the requested ENCLU leaf.
> +	 */
>   	cmp	$0, %eax
> -	jle	7f
> -	mov	$-EINVAL, %eax
> +	jle	.Lout
> +	jmp	.Lenter_enclave
>   
> -7:	/* Epilog */
> -	leave
> -	.cfi_def_cfa		%rsp, 8
> +.Lretpoline:
> +	call	2f
> +1:	pause
> +	lfence
> +	jmp	1b
> +2:	mov	%rax, (%rsp)
>   	ret
>   	.cfi_endproc
>   
> -_ASM_VDSO_EXTABLE_HANDLE(2b, 5b)
> +_ASM_VDSO_EXTABLE_HANDLE(.Lenclu_eenter_eresume, .Lhandle_exception)
>   
>   ENDPROC(__vdso_sgx_enter_enclave)
>
Sean Christopherson Oct. 9, 2019, 7:10 p.m. UTC | #2
On Wed, Oct 09, 2019 at 11:00:55AM -0700, Xing, Cedric wrote:
> On 10/7/2019 9:46 PM, Sean Christopherson wrote:
> >Rework __vdso_sgx_enter_enclave() to prioritize the flow where userspace
> >is not providing a callback, which is the preferred method of operation.
> 
> Processors have branch predictors so your "prioritizing" may not get what
> your want.
> 
> But if you still insist, a simple ht/hnt prefixing the original branch
> instruction would have sufficed. Rearrangement of code blocks is indeed
> unnecessary.

Tying into the other thread, it's not a matter of absolute necessity, it's
a matter of us providing the best code possible.

> A caveat though, for any given process, whether it supplies a callback or
> not is usually hard-coded. So either it always takes the callback path, or
> it always takes the other. And the branch predictor will do well in both
> cases. It's usually unwise to apply ht/hnt prefixes.

Readability is the primary concern, performance is secondary concern.  For
a random joe user, this:

   0x0000000000000a20 <+0>:	push   %rbp
   0x0000000000000a21 <+1>:	mov    %rsp,%rbp
   0x0000000000000a24 <+4>:	cmp    $0x2,%eax
   0x0000000000000a27 <+7>:	jb     0xa46 <__vdso_sgx_enter_enclave+38>
   0x0000000000000a29 <+9>:	cmp    $0x3,%eax
   0x0000000000000a2c <+12>:	ja     0xa46 <__vdso_sgx_enter_enclave+38>
   0x0000000000000a2e <+14>:	mov    0x10(%rbp),%rbx
   0x0000000000000a32 <+18>:	lea    0x0(%rip),%rcx        # 0xa39 <__vdso_sgx_enter_enclave+25>
   0x0000000000000a39 <+25>:	enclu  
   0x0000000000000a3c <+28>:	xor    %eax,%eax
   0x0000000000000a3e <+30>:	cmpl   $0x0,0x20(%rbp)
   0x0000000000000a42 <+34>:	jne    0xa6b <__vdso_sgx_enter_enclave+75>
   0x0000000000000a44 <+36>:	leaveq 
   0x0000000000000a45 <+37>:	retq   

is easier to follow for the happy path than this:

   0x0000000000000a20 <+0>:	push   %rbp
   0x0000000000000a21 <+1>:	mov    %rsp,%rbp
   0x0000000000000a24 <+4>:	cmp    $0x2,%eax
   0x0000000000000a27 <+7>:	jb     0xa8e <__vdso_sgx_enter_enclave+110>
   0x0000000000000a29 <+9>:	cmp    $0x3,%eax
   0x0000000000000a2c <+12>:	ja     0xa8e <__vdso_sgx_enter_enclave+110>
   0x0000000000000a2e <+14>:	mov    0x10(%rbp),%rbx
   0x0000000000000a32 <+18>:	lea    0x0(%rip),%rcx        # 0xa39 <__vdso_sgx_enter_enclave+25>
   0x0000000000000a39 <+25>:	enclu  
   0x0000000000000a3c <+28>:	xor    %eax,%eax
   0x0000000000000a3e <+30>:	mov    %eax,%ecx
   0x0000000000000a40 <+32>:	mov    0x20(%rbp),%rax
   0x0000000000000a44 <+36>:	test   %rax,%rax
   0x0000000000000a47 <+39>:	je     0xa98 <__vdso_sgx_enter_enclave+120>
   0x0000000000000a49 <+41>:	mov    %rsp,%rbx
   0x0000000000000a4c <+44>:	and    $0xfffffffffffffff0,%rsp
   0x0000000000000a50 <+48>:	cld    
   0x0000000000000a51 <+49>:	pushq  0x18(%rbp)
   0x0000000000000a54 <+52>:	push   %rbx
   0x0000000000000a55 <+53>:	pushq  0x10(%rbp)
   0x0000000000000a58 <+56>:	callq  0xa62 <__vdso_sgx_enter_enclave+66>
   0x0000000000000a5d <+61>:	mov    %rbx,%rsp
   0x0000000000000a60 <+64>:	jmp    0xa24 <__vdso_sgx_enter_enclave+4>
   0x0000000000000a62 <+66>:	callq  0xa6e <__vdso_sgx_enter_enclave+78>
   0x0000000000000a67 <+71>:	pause  
   0x0000000000000a69 <+73>:	lfence 
   0x0000000000000a6c <+76>:	jmp    0xa67 <__vdso_sgx_enter_enclave+71>
   0x0000000000000a6e <+78>:	mov    %rax,(%rsp)
   0x0000000000000a72 <+82>:	retq   
   0x0000000000000a73 <+83>:	mov    0x18(%rbp),%rcx
   0x0000000000000a77 <+87>:	jrcxz  0xa87 <__vdso_sgx_enter_enclave+103>
   0x0000000000000a79 <+89>:	mov    %eax,(%rcx)
   0x0000000000000a7b <+91>:	mov    %di,0x4(%rcx)
   0x0000000000000a7f <+95>:	mov    %si,0x6(%rcx)
   0x0000000000000a83 <+99>:	mov    %rdx,0x8(%rcx)
   0x0000000000000a87 <+103>:	mov    $0xfffffff2,%eax
   0x0000000000000a8c <+108>:	jmp    0xa3e <__vdso_sgx_enter_enclave+30>
   0x0000000000000a8e <+110>:	cmp    $0x0,%eax
   0x0000000000000a91 <+113>:	jle    0xa98 <__vdso_sgx_enter_enclave+120>
   0x0000000000000a93 <+115>:	mov    $0xffffffea,%eax
   0x0000000000000a98 <+120>:	leaveq 
   0x0000000000000a99 <+121>:	retq  

and much easier to follow than the version where the exception struct is
filled in on a synchronous EEXIT:

   0x0000000000000a20 <+0>:	push   %rbp
   0x0000000000000a21 <+1>:	mov    %rsp,%rbp
   0x0000000000000a24 <+4>:	cmp    $0x2,%eax
   0x0000000000000a27 <+7>:	jb     0xa90 <__vdso_sgx_enter_enclave+112>
   0x0000000000000a29 <+9>:	cmp    $0x3,%eax
   0x0000000000000a2c <+12>:	ja     0xa90 <__vdso_sgx_enter_enclave+112>
   0x0000000000000a2e <+14>:	mov    0x10(%rbp),%rbx
   0x0000000000000a32 <+18>:	lea    0x0(%rip),%rcx        # 0xa39 <__vdso_sgx_enter_enclave+25>
   0x0000000000000a39 <+25>:	enclu  
   0x0000000000000a3c <+28>:	xor    %ebx,%ebx
   0x0000000000000a3e <+30>:	mov    0x18(%rbp),%rcx
   0x0000000000000a42 <+34>:	jrcxz  0xa54 <__vdso_sgx_enter_enclave+52>
   0x0000000000000a44 <+36>:	mov    %eax,(%rcx)
   0x0000000000000a46 <+38>:	jae    0xa54 <__vdso_sgx_enter_enclave+52>
   0x0000000000000a48 <+40>:	mov    %di,0x4(%rcx)
   0x0000000000000a4c <+44>:	mov    %si,0x6(%rcx)
   0x0000000000000a50 <+48>:	mov    %rdx,0x8(%rcx)
   0x0000000000000a54 <+52>:	mov    0x20(%rbp),%rax
   0x0000000000000a58 <+56>:	test   %rax,%rax
   0x0000000000000a5b <+59>:	cmove  %rbx,%rax
   0x0000000000000a5f <+63>:	je     0xa9a <__vdso_sgx_enter_enclave+122>

   ...

   0x0000000000000a9a <+122>:	leaveq 
   0x0000000000000a9b <+123>:	retq   


> >Using a callback requires a retpoline, and the only known motivation for
> >employing a callback is to allow the enclave to muck with the stack of
> >the untrusted runtime.
> >
> >Opportunistically replace the majority of the local labels with local
> >symbol names to improve the readability of the code.
> >
> >Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> >---
> >  arch/x86/entry/vdso/vsgx_enter_enclave.S | 120 ++++++++++++++---------
> >  1 file changed, 71 insertions(+), 49 deletions(-)
> >
> >diff --git a/arch/x86/entry/vdso/vsgx_enter_enclave.S b/arch/x86/entry/vdso/vsgx_enter_enclave.S
> >index de54e47c83f4..fc5622dcd2fa 100644
> >--- a/arch/x86/entry/vdso/vsgx_enter_enclave.S
> >+++ b/arch/x86/entry/vdso/vsgx_enter_enclave.S
> >@@ -85,75 +85,97 @@ ENTRY(__vdso_sgx_enter_enclave)
> >  	mov	%rsp, %rbp
> >  	.cfi_def_cfa_register	%rbp
> >-1:	/* EENTER <= leaf <= ERESUME */
> >+.Lenter_enclave:
> >+	/* EENTER <= leaf <= ERESUME */
> >  	cmp	$0x2, %eax
> >-	jb	6f
> >+	jb	.Linvalid_leaf
> >  	cmp	$0x3, %eax
> >-	ja	6f
> >+	ja	.Linvalid_leaf
> >  	/* Load TCS and AEP */
> >  	mov	0x10(%rbp), %rbx
> >-	lea	2f(%rip), %rcx
> >+	lea	.Lasync_exit_pointer(%rip), %rcx
> >  	/* Single ENCLU serving as both EENTER and AEP (ERESUME) */
> >-2:	enclu
> >+.Lasync_exit_pointer:
> >+.Lenclu_eenter_eresume:
> >+	enclu
> >-	/* EEXIT path */
> >+	/* EEXIT jumps here unless the enclave is doing something fancy. */
> >  	xor	%eax, %eax
> >-3:	mov	%eax, %ecx
> >-
> >-	/* Call the exit handler if supplied */
> >-	mov	0x20(%rbp), %rax
> >-	test	%rax, %rax
> >-	jz	7f
> 
> >-	/* Align stack per x86_64 ABI. The original %rsp is saved in %rbx to be
> >-	 * restored after the exit handler returns. */
> >+
> >+	/* Invoke userspace's exit handler if one was provided. */
> >+.Lhandle_exit:
> >+	cmp	$0, 0x20(%rbp)
> >+	jne	.Linvoke_userspace_handler
> >+
> >+.Lout:
> >+	leave
> >+	.cfi_def_cfa		%rsp, 8
> >+	ret
> >+
> >+.Linvalid_leaf:
> 
> Please set frame pointer back to %rbp here, or stack unwinding will fail.

Sorry, coffee isn't doing its job — what's getting crushed, and where?

> >+	mov	$(-EINVAL), %eax
> >+	jmp	.Lout
> >+
> >+.Lhandle_exception:
> >+	mov	0x18(%rbp), %rcx
> >+	test    %rcx, %rcx
> >+	je	.Lskip_exception_info
> 
> A single "jrcxz .Lskip_exception_info" is equivalent to the above 2
> instructions combined.

Both implementations take a single uop on CPUs that support SGX.  IMO,
using the simpler and more common instructions is more universally
readable.

> >+
> >+	/* Fill optional exception info. */
> >+	mov	%eax, EX_LEAF(%rcx)
> >+	mov	%di,  EX_TRAPNR(%rcx)
> >+	mov	%si,  EX_ERROR_CODE(%rcx)
> >+	mov	%rdx, EX_ADDRESS(%rcx)
> >+.Lskip_exception_info:
> >+	mov	$(-EFAULT), %eax
> >+	jmp	.Lhandle_exit
> >+
> >+.Linvoke_userspace_handler:
> >+	/*
> >+	 * Align stack per x86_64 ABI. Save the original %rsp in %rbx to be
> >+	 * restored after the callback returns.
> >+	 */
> >  	mov	%rsp, %rbx
> >  	and	$-0x10, %rsp
> >-	/* Clear RFLAGS.DF per x86_64 ABI */
> >-	cld
> >-	/* Parameters for the exit handler */
> >+
> >+	/* Push @e, u_rsp and @tcs as parameters to the callback. */
> >  	push	0x18(%rbp)
> >  	push	%rbx
> >  	push	0x10(%rbp)
> >-	/* Call *%rax via retpoline */
> >-	call	40f
> >-	/* Restore %rsp to its original value left off by the enclave from last
> >-	 * exit */
> >+
> >+	/* Pass the "return" value to the callback via %rcx. */
> >+	mov	%eax, %ecx
> 
> @e (ex_info) is almost always needed by every callback as it also serves as
> the "context pointer". The return value on the other hand is insignificant
> because it could be deduced from @e->EX_LEAF anyway. So I'd retain %rcx and
> push %rax to the stack instead, given the purpose of this patch is to
> squeeze out a bit performance.

Please take this up in patch 02/16, which actually introduced this change.
> 
> >+
> >+	/* Clear RFLAGS.DF per x86_64 ABI */
> >+	cld
> >+
> >+	/* Load the callback pointer to %rax and invoke it via retpoline. */
> >+	mov	0x20(%rbp), %rax
> 
> Per X86_64 ABI, %rsp shall be 16 bytes aligned before "call". But %rsp here
> doesn't look aligned properly.

Argh, I probably botched it back in patch 02/16 too.  I'll see if I can
add a check to verify %rsp alignment in the selftest, verifying via code
inspection is bound to be error prone.

> >+	call	.Lretpoline
Sean Christopherson Oct. 10, 2019, 12:21 a.m. UTC | #3
On Wed, Oct 09, 2019 at 12:10:03PM -0700, Sean Christopherson wrote:
> > >+
> > >+	/* Clear RFLAGS.DF per x86_64 ABI */
> > >+	cld
> > >+
> > >+	/* Load the callback pointer to %rax and invoke it via retpoline. */
> > >+	mov	0x20(%rbp), %rax
> > 
> > Per X86_64 ABI, %rsp shall be 16 bytes aligned before "call". But %rsp here
> > doesn't look aligned properly.
> 
> Argh, I probably botched it back in patch 02/16 too.  I'll see if I can
> add a check to verify %rsp alignment in the selftest, verifying via code
> inspection is bound to be error prone.

Added a selftest, stack is indeed not properly aligned.
Xing, Cedric Oct. 10, 2019, 5:49 p.m. UTC | #4
On 10/9/2019 12:10 PM, Sean Christopherson wrote:
> On Wed, Oct 09, 2019 at 11:00:55AM -0700, Xing, Cedric wrote:
>> On 10/7/2019 9:46 PM, Sean Christopherson wrote:
>>> -	/* Align stack per x86_64 ABI. The original %rsp is saved in %rbx to be
>>> -	 * restored after the exit handler returns. */
>>> +
>>> +	/* Invoke userspace's exit handler if one was provided. */
>>> +.Lhandle_exit:
>>> +	cmp	$0, 0x20(%rbp)
>>> +	jne	.Linvoke_userspace_handler
>>> +
>>> +.Lout:
>>> +	leave
>>> +	.cfi_def_cfa		%rsp, 8
>>> +	ret
>>> +
>>> +.Linvalid_leaf:
>>
>> Please set frame pointer back to %rbp here, or stack unwinding will fail.
> 
> Sorry, coffee isn't doing it's job, what's getting crushed, and where?

The frame pointer was %rbp but you changed it to %rsp 3 lines ago. 
That's correct after "leave" and execution won't pass "ret". But the 
unwinder doesn't know. So you have to restore frame pointer after "ret", by
	.cfi_def_cfa		%rbp, 16

As you mentioned in the stack alignment case, we just can't rely on code 
review to catch such bugs. We need a test case to make sure all CFI 
directives are correct, which was also a request from Andy.

>>> +.Lhandle_exception:
>>> +	mov	0x18(%rbp), %rcx
>>> +	test    %rcx, %rcx
>>> +	je	.Lskip_exception_info
>>
>> A single "jrcxz .Lskip_exception_info" is equivalent to the above 2
>> instructions combined.
> 
> Both implementations take a single uop on CPUs that support SGX.  IMO,
> using the simpler and more common instructions is more universally
> readable.

I'm not sure the processor could combine 2 instructions ("test"+"je") 
into just 1 uop. And "jrcxz" is also a broadly used instruction.

>>> +	/* Push @e, u_rsp and @tcs as parameters to the callback. */
>>>   	push	0x18(%rbp)
>>>   	push	%rbx
>>>   	push	0x10(%rbp)
>>> -	/* Call *%rax via retpoline */
>>> -	call	40f
>>> -	/* Restore %rsp to its original value left off by the enclave from last
>>> -	 * exit */
>>> +
>>> +	/* Pass the "return" value to the callback via %rcx. */
>>> +	mov	%eax, %ecx
>>
>> @e (ex_info) is almost always needed by every callback as it also serves as
>> the "context pointer". The return value on the other hand is insignificant
>> because it could be deduced from @e->EX_LEAF anyway. So I'd retain %rcx and
>> push %rax to the stack instead, given the purpose of this patch is to
>> squeeze out a bit performance.
> 
> Please take this up in patch 02/16, which actually introduced this change.

My apologies, but I'd like to pull all related discussions into a single thread.

If you adhere to the convention of "%rcx containing @e", then the code 
here could be
	push	%rax		// for stack alignment
	push	%rax		// return value
	push	%rbx		// u_rsp
	push	0x10(%rsp)	// tcs
				// %rcx left unchanged pointing to @e
>>> +	/* Clear RFLAGS.DF per x86_64 ABI */
>>> +	cld
>>> +
>>> +	/* Load the callback pointer to %rax and invoke it via retpoline. */
>>> +	mov	0x20(%rbp), %rax
>>
>> Per X86_64 ABI, %rsp shall be 16 bytes aligned before "call". But %rsp here
>> doesn't look aligned properly.
> 
> Argh, I probably botched it back in patch 02/16 too.  I'll see if I can
> add a check to verify %rsp alignment in the selftest, verifying via code
> inspection is bound to be error prone.
> 
>>> +	call	.Lretpoline
Sean Christopherson Oct. 10, 2019, 11:59 p.m. UTC | #5
On Thu, Oct 10, 2019 at 10:49:59AM -0700, Xing, Cedric wrote:
> On 10/9/2019 12:10 PM, Sean Christopherson wrote:
> >On Wed, Oct 09, 2019 at 11:00:55AM -0700, Xing, Cedric wrote:
> >>On 10/7/2019 9:46 PM, Sean Christopherson wrote:
> >>>-	/* Align stack per x86_64 ABI. The original %rsp is saved in %rbx to be
> >>>-	 * restored after the exit handler returns. */
> >>>+
> >>>+	/* Invoke userspace's exit handler if one was provided. */
> >>>+.Lhandle_exit:
> >>>+	cmp	$0, 0x20(%rbp)
> >>>+	jne	.Linvoke_userspace_handler
> >>>+
> >>>+.Lout:
> >>>+	leave
> >>>+	.cfi_def_cfa		%rsp, 8
> >>>+	ret
> >>>+
> >>>+.Linvalid_leaf:
> >>
> >>Please set frame pointer back to %rbp here, or stack unwinding will fail.
> >
> >Sorry, coffee isn't doing it's job, what's getting crushed, and where?
> 
> The frame pointer was %rbp but you changed it to %rsp 3 lines ago. That's
> correct after "leave" and execution won't pass "ret". But the unwinder
> doesn't know. So you have to restore frame pointer after "ret", by
> 	.cfi_def_cfa		%rbp, 16

Isn't the proper fix to move ".cfi_endproc" here?  Which I incorrectly
left after the RET for the retpoline.
 
> As you mentioned in the stack alignment case, we just can't rely on code
> review to catch such bugs. We need a test case to make sure all CFI
> directives are correct, which was also a request from Andy.

On the todo list...

> >>>+.Lhandle_exception:
> >>>+	mov	0x18(%rbp), %rcx
> >>>+	test    %rcx, %rcx
> >>>+	je	.Lskip_exception_info
> >>
> >>A single "jrcxz .Lskip_exception_info" is equivalent to the above 2
> >>instructions combined.
> >
> >Both implementations take a single uop on CPUs that support SGX.  IMO,
> >using the simpler and more common instructions is more universally
> >readable.
> 
> I'm not sure the processor could combine 2 instructions ("test"+"je") into
> just 1 uop. And "jrcxz" is also a broadly used instruction.

TEST+Jcc macrofusion has been supported since Merom (Core 2)[*].  CMP+Jcc
have also been fused since Merom, though not for all Jcc flavors (uarch
specific), whereas TEST can fuse with everything.  Sandy Bridge added
fusing of ADD, SUB, INC, DEC, AND and OR, with AND/OR following TEST
in terms of fusing capabilities, the rest following CMP behavior.

[*] https://en.wikichip.org/wiki/macro-operation_fusion

> >>>+	/* Push @e, u_rsp and @tcs as parameters to the callback. */
> >>>  	push	0x18(%rbp)
> >>>  	push	%rbx
> >>>  	push	0x10(%rbp)
> >>>-	/* Call *%rax via retpoline */
> >>>-	call	40f
> >>>-	/* Restore %rsp to its original value left off by the enclave from last
> >>>-	 * exit */
> >>>+
> >>>+	/* Pass the "return" value to the callback via %rcx. */
> >>>+	mov	%eax, %ecx
> >>
> >>@e (ex_info) is almost always needed by every callback as it also serves as
> >>the "context pointer". The return value on the other hand is insignificant
> >>because it could be deduced from @e->EX_LEAF anyway. So I'd retain %rcx and
> >>push %rax to the stack instead, given the purpose of this patch is to
> >>squeeze out a bit performance.
> >
> >Please take this up in patch 02/16, which actually introduced this change.
> 
> My apology but willing to pull all related discussions into a single thread.
> 
> If you adhere to the convention of "%rcx containing @e", then the code here
> could be
> 	push	%rax		// for stack alignment
> 	push	%rax		// return value
> 	push	%rbx		// u_rsp
> 	push	0x10(%rsp)	// tcs
> 				// %rcx left unchanged pointing to @e

Hmm, I still think it makes sense to have @e as the last parameters since
it's the one thing that's optional.  What if the callback prototype were
instead:

typedef int (*sgx_enclave_exit_handler_t)(long rdi, long rsi, long rdx,
					  long ursp, long r8, long r9,
					  void *tcs, int ret,
					  struct sgx_enclave_exception *e);

I.e. put @ret and @e next to each other since they go hand-in-hand.  For
me, that's visually easier to parse than burying 'int ret' or 'struct ... *e'
in the middle of the prototype.

And the relevant asm:
	/* Push @e, "return" value and @tcs as parameters to the callback. */
	push	0x18(%rbp)
	push	%eax
	push	0x10(%rbp)

	/* Pass the untrusted RSP (at exit) to the callback via %rcx. */
	mov	%ebx, %ecx

> >>>+	/* Clear RFLAGS.DF per x86_64 ABI */
> >>>+	cld
> >>>+
> >>>+	/* Load the callback pointer to %rax and invoke it via retpoline. */
> >>>+	mov	0x20(%rbp), %rax
> >>
> >>Per X86_64 ABI, %rsp shall be 16 bytes aligned before "call". But %rsp here
> >>doesn't look aligned properly.
> >
> >Argh, I probably botched it back in patch 02/16 too.  I'll see if I can
> >add a check to verify %rsp alignment in the selftest, verifying via code
> >inspection is bound to be error prone.
> >
> >>>+	call	.Lretpoline
Xing, Cedric Oct. 16, 2019, 10:18 p.m. UTC | #6
On 10/10/2019 4:59 PM, Sean Christopherson wrote:
> On Thu, Oct 10, 2019 at 10:49:59AM -0700, Xing, Cedric wrote:
>> On 10/9/2019 12:10 PM, Sean Christopherson wrote:
>>> On Wed, Oct 09, 2019 at 11:00:55AM -0700, Xing, Cedric wrote:
>>>> On 10/7/2019 9:46 PM, Sean Christopherson wrote:
>>>>> -	/* Align stack per x86_64 ABI. The original %rsp is saved in %rbx to be
>>>>> -	 * restored after the exit handler returns. */
>>>>> +
>>>>> +	/* Invoke userspace's exit handler if one was provided. */
>>>>> +.Lhandle_exit:
>>>>> +	cmp	$0, 0x20(%rbp)
>>>>> +	jne	.Linvoke_userspace_handler
>>>>> +
>>>>> +.Lout:
>>>>> +	leave
>>>>> +	.cfi_def_cfa		%rsp, 8
>>>>> +	ret
>>>>> +
>>>>> +.Linvalid_leaf:
>>>>
>>>> Please set frame pointer back to %rbp here, or stack unwinding will fail.
>>>
>>> Sorry, coffee isn't doing it's job, what's getting crushed, and where?
>>
>> The frame pointer was %rbp but you changed it to %rsp 3 lines ago. That's
>> correct after "leave" and execution won't pass "ret". But the unwinder
>> doesn't know. So you have to restore frame pointer after "ret", by
>> 	.cfi_def_cfa		%rbp, 16
> 
> Isn't the proper fix to move ".cfi_endproc" here?  Which I incorrectly
> left after the RET for the retpoline.

No. .cfi_endproc is used by the unwinder to determine if an address 
falls within a function. Its location has nothing to do with where RET 
is but shall always be at the end of the whole function.

.cfi_def_cfa tells the unwinder where the call frame starts. At here, 
the call frame starts at %rbp+16 but not %rsp+8, so ".cfi_def_cfa %rbp, 
16" is a must.

>>>>> +.Lhandle_exception:
>>>>> +	mov	0x18(%rbp), %rcx
>>>>> +	test    %rcx, %rcx
>>>>> +	je	.Lskip_exception_info
>>>>
>>>> A single "jrcxz .Lskip_exception_info" is equivalent to the above 2
>>>> instructions combined.
>>>
>>> Both implementations take a single uop on CPUs that support SGX.  IMO,
>>> using the simpler and more common instructions is more universally
>>> readable.
>>
>> I'm not sure the processor could combine 2 instructions ("test"+"je") into
>> just 1 uop. And "jrcxz" is also a broadly used instruction.
> 
> TEST+Jcc macrofusion has been supported since Merom (Core 2)[*].  CMP+Jcc
> have also been fused since Merom, though not for all Jcc flavors (uarch
> specific), whereas TEST can fuse with everything.  Sandy Bridge added
> fusing of ADD, SUB, INC, DEC, AND and OR, with AND/OR following TEST
> in terms of fusing capabilities, the rest following CMP behavior.
> 
> [*] https://en.wikichip.org/wiki/macro-operation_fusion

Good to know. Thanks for the info!

>>>>> +	/* Push @e, u_rsp and @tcs as parameters to the callback. */
>>>>>   	push	0x18(%rbp)
>>>>>   	push	%rbx
>>>>>   	push	0x10(%rbp)
>>>>> -	/* Call *%rax via retpoline */
>>>>> -	call	40f
>>>>> -	/* Restore %rsp to its original value left off by the enclave from last
>>>>> -	 * exit */
>>>>> +
>>>>> +	/* Pass the "return" value to the callback via %rcx. */
>>>>> +	mov	%eax, %ecx
>>>>
>>>> @e (ex_info) is almost always needed by every callback as it also serves as
>>>> the "context pointer". The return value on the other hand is insignificant
>>>> because it could be deduced from @e->EX_LEAF anyway. So I'd retain %rcx and
>>>> push %rax to the stack instead, given the purpose of this patch is to
>>>> squeeze out a bit performance.
>>>
>>> Please take this up in patch 02/16, which actually introduced this change.
>>
>> My apology but willing to pull all related discussions into a single thread.
>>
>> If you adhere to the convention of "%rcx containing @e", then the code here
>> could be
>> 	push	%rax		// for stack alignment
>> 	push	%rax		// return value
>> 	push	%rbx		// u_rsp
>> 	push	0x10(%rsp)	// tcs
>> 				// %rcx left unchanged pointing to @e
> 
> Hmm, I still think it makes sense to have @e as the last parameters since
> it's the one thing that's optional.  What if the callback prototype were
> instead:
> 
> typedef int (*sgx_enclave_exit_handler_t)(long rdi, long rsi, long rdx,
> 					  long ursp, long r8, long r9,
> 					  void *tcs, int ret,
> 					  struct sgx_enclave_exception *e);
> 
> I.e. put @ret and @e next to each other since they go hand-in-hand.  For
> me, that's visually easies to parse than burying 'int ret' or 'struct ... *e'
> in the middle of the prototype.
> 
> And the relevant asm:
> 	/* Push @e, "return" value and @tcs as parameters to the callback. */
> 	push	0x18(%rbp)
> 	push	%eax
> 	push	0x10(%rbp)
> 
> 	/* Pass the untrusted RSP (at exit) to the callback via %rcx. */
> 	mov	%ebx, %ecx
> 

Looks good to me. Don't forget to align the stack though, and ursp shall 
be 64-bit. That is,

	push	%rax		// align stack
	push	%rcx		// @e
	push	%rax		// @ret
	push	0x10(%rsp)	// @tcs
	mov	%rbx, %rcx	// @ursp
Sean Christopherson Oct. 16, 2019, 10:53 p.m. UTC | #7
On Wed, Oct 16, 2019 at 03:18:05PM -0700, Xing, Cedric wrote:
> On 10/10/2019 4:59 PM, Sean Christopherson wrote:
> >On Thu, Oct 10, 2019 at 10:49:59AM -0700, Xing, Cedric wrote:
> >>On 10/9/2019 12:10 PM, Sean Christopherson wrote:
> >>>On Wed, Oct 09, 2019 at 11:00:55AM -0700, Xing, Cedric wrote:
> >>>>On 10/7/2019 9:46 PM, Sean Christopherson wrote:
> >>>>>-	/* Align stack per x86_64 ABI. The original %rsp is saved in %rbx to be
> >>>>>-	 * restored after the exit handler returns. */
> >>>>>+
> >>>>>+	/* Invoke userspace's exit handler if one was provided. */
> >>>>>+.Lhandle_exit:
> >>>>>+	cmp	$0, 0x20(%rbp)
> >>>>>+	jne	.Linvoke_userspace_handler
> >>>>>+
> >>>>>+.Lout:
> >>>>>+	leave
> >>>>>+	.cfi_def_cfa		%rsp, 8
> >>>>>+	ret
> >>>>>+
> >>>>>+.Linvalid_leaf:
> >>>>
> >>>>Please set frame pointer back to %rbp here, or stack unwinding will fail.
> >>>
> >>>Sorry, coffee isn't doing it's job, what's getting crushed, and where?
> >>
> >>The frame pointer was %rbp but you changed it to %rsp 3 lines ago. That's
> >>correct after "leave" and execution won't pass "ret". But the unwinder
> >>doesn't know. So you have to restore frame pointer after "ret", by
> >>	.cfi_def_cfa		%rbp, 16
> >
> >Isn't the proper fix to move ".cfi_endproc" here?  Which I incorrectly
> >left after the RET for the retpoline.
> 
> No. .cfi_endproc is used by the unwinder to determine if an address falls
> within a function. Its location has nothing to do with where RET is but
> shall always be at the end of the whole function.
> 
> .cfi_def_cfa tells the unwinder where the call frame starts. At here, the
> call frame starts at %rbp+16 but not %rsp+8, so ".cfi_def_cfa %rbp, 16" is a
> must.

Ahh, I understand now, hopefully.  I was thinking the .cfi directives
would magically understand the control flow.  Thanks!

> >>>>>+.Lhandle_exception:
> >>>>>+	mov	0x18(%rbp), %rcx
> >>>>>+	test    %rcx, %rcx
> >>>>>+	je	.Lskip_exception_info
> >>>>

Patch
diff mbox series

diff --git a/arch/x86/entry/vdso/vsgx_enter_enclave.S b/arch/x86/entry/vdso/vsgx_enter_enclave.S
index de54e47c83f4..fc5622dcd2fa 100644
--- a/arch/x86/entry/vdso/vsgx_enter_enclave.S
+++ b/arch/x86/entry/vdso/vsgx_enter_enclave.S
@@ -85,75 +85,97 @@  ENTRY(__vdso_sgx_enter_enclave)
 	mov	%rsp, %rbp
 	.cfi_def_cfa_register	%rbp
 
-1:	/* EENTER <= leaf <= ERESUME */
+.Lenter_enclave:
+	/* EENTER <= leaf <= ERESUME */
 	cmp	$0x2, %eax
-	jb	6f
+	jb	.Linvalid_leaf
 	cmp	$0x3, %eax
-	ja	6f
+	ja	.Linvalid_leaf
 
 	/* Load TCS and AEP */
 	mov	0x10(%rbp), %rbx
-	lea	2f(%rip), %rcx
+	lea	.Lasync_exit_pointer(%rip), %rcx
 
 	/* Single ENCLU serving as both EENTER and AEP (ERESUME) */
-2:	enclu
+.Lasync_exit_pointer:
+.Lenclu_eenter_eresume:
+	enclu
 
-	/* EEXIT path */
+	/* EEXIT jumps here unless the enclave is doing something fancy. */
 	xor	%eax, %eax
-3:	mov	%eax, %ecx
-
-	/* Call the exit handler if supplied */
-	mov	0x20(%rbp), %rax
-	test	%rax, %rax
-	jz	7f
-	/* Align stack per x86_64 ABI. The original %rsp is saved in %rbx to be
-	 * restored after the exit handler returns. */
+
+	/* Invoke userspace's exit handler if one was provided. */
+.Lhandle_exit:
+	cmp	$0, 0x20(%rbp)
+	jne	.Linvoke_userspace_handler
+
+.Lout:
+	leave
+	.cfi_def_cfa		%rsp, 8
+	ret
+
+.Linvalid_leaf:
+	mov	$(-EINVAL), %eax
+	jmp	.Lout
+
+.Lhandle_exception:
+	mov	0x18(%rbp), %rcx
+	test    %rcx, %rcx
+	je	.Lskip_exception_info
+
+	/* Fill optional exception info. */
+	mov	%eax, EX_LEAF(%rcx)
+	mov	%di,  EX_TRAPNR(%rcx)
+	mov	%si,  EX_ERROR_CODE(%rcx)
+	mov	%rdx, EX_ADDRESS(%rcx)
+.Lskip_exception_info:
+	mov	$(-EFAULT), %eax
+	jmp	.Lhandle_exit
+
+.Linvoke_userspace_handler:
+	/*
+	 * Align stack per x86_64 ABI. Save the original %rsp in %rbx to be
+	 * restored after the callback returns.
+	 */
 	mov	%rsp, %rbx
 	and	$-0x10, %rsp
-	/* Clear RFLAGS.DF per x86_64 ABI */
-	cld
-	/* Parameters for the exit handler */
+
+	/* Push @e, u_rsp and @tcs as parameters to the callback. */
 	push	0x18(%rbp)
 	push	%rbx
 	push	0x10(%rbp)
-	/* Call *%rax via retpoline */
-	call	40f
-	/* Restore %rsp to its original value left off by the enclave from last
-	 * exit */
+
+	/* Pass the "return" value to the callback via %rcx. */
+	mov	%eax, %ecx
+
+	/* Clear RFLAGS.DF per x86_64 ABI */
+	cld
+
+	/* Load the callback pointer to %rax and invoke it via retpoline. */
+	mov	0x20(%rbp), %rax
+	call	.Lretpoline
+
+	/* Restore %rsp to its post-exit value. */
 	mov	%rbx, %rsp
-	/* Positive return value from the exit handler will be interpreted as
-	 * an ENCLU leaf, while a non-positive value will be interpreted as the
-	 * return value to be passed back to the caller. */
-	jmp	1b
-40:	/* retpoline */
-	call	42f
-41:	pause
-	lfence
-	jmp	41b
-42:	mov	%rax, (%rsp)
-	ret
 
-5:	/* Exception path */
-	mov	0x18(%rbp), %rcx
-	jrcxz	52f
-	mov	%eax, EX_LEAF(%rcx)
-	mov	%di,  EX_TRAPNR(%rcx)
-	mov	%si,  EX_ERROR_CODE(%rcx)
-	mov	%rdx, EX_ADDRESS(%rcx)
-52:	mov	$-EFAULT, %eax
-	jmp	3b
-
-6:	/* Unsupported ENCLU leaf */
+	/*
+	 * If the return from callback is zero or negative, return immediately,
+	 * else re-execute ENCLU with the postive return value interpreted as
+	 * the requested ENCLU leaf.
+	 */
 	cmp	$0, %eax
-	jle	7f
-	mov	$-EINVAL, %eax
+	jle	.Lout
+	jmp	.Lenter_enclave
 
-7:	/* Epilog */
-	leave
-	.cfi_def_cfa		%rsp, 8
+.Lretpoline:
+	call	2f
+1:	pause
+	lfence
+	jmp	1b
+2:	mov	%rax, (%rsp)
 	ret
 	.cfi_endproc
 
-_ASM_VDSO_EXTABLE_HANDLE(2b, 5b)
+_ASM_VDSO_EXTABLE_HANDLE(.Lenclu_eenter_eresume, .Lhandle_exception)
 
 ENDPROC(__vdso_sgx_enter_enclave)