diff mbox series

[07/10] x86/tdx: Extend TDX_MODULE_CALL to support more TDCALL/SEAMCALL leafs

Message ID ecfd84af9186aa5368acb40a2740afbf1d0d1b5d.1689151537.git.kai.huang@intel.com (mailing list archive)
State New, archived
Headers show
Series Unify TDCALL/SEAMCALL and TDVMCALL assembly | expand

Commit Message

Huang, Kai July 12, 2023, 8:55 a.m. UTC
The TDX guest live migration support (TDX 1.5) adds new TDCALL/SEAMCALL
leaf functions.  Those new TDCALLs/SEAMCALLs take additional registers
for input (R10-R13) and output (R12-R13).  TDG.SERVTD.RD is an example.

Also, the current TDX_MODULE_CALL doesn't aim to handle TDH.VP.ENTER
SEAMCALL, which monitors the TDG.VP.VMCALL in input/output registers
when it returns in case of VMCALL from TDX guest.

With those new TDCALLs/SEAMCALLs and the TDH.VP.ENTER covered, the
TDX_MODULE_CALL macro basically needs to handle the same input/output
registers as the TDX_HYPERCALL does.  And as a result, they also share
similar logic in the assembly, thus should be unified to use one common
assembly.

Extend the TDX_MODULE_CALL asm to support the new TDCALLs/SEAMCALLs and
also the TDH.VP.ENTER SEAMCALL.  Eventually it will be unified with the
TDX_HYPERCALL.

The new input/output registers fit with the "callee-saved" registers in
the x86 calling convention.  Add a new "saved" parameter to support
those new TDCALLs/SEAMCALLs and TDH.VP.ENTER and keep the existing
TDCALLs/SEAMCALLs minimally impacted.

For TDH.VP.ENTER, after it returns the registers shared by the guest
contain guest's values.  Explicitly clear them to prevent speculative
use of guest's values.

Note most TDX live migration related SEAMCALLs may also clobber AVX*
state ("AVX, AVX2 and AVX512 state: may be reset to the architectural
INIT state" -- see TDH.EXPORT.MEM for example).  And TDH.VP.ENTER also
clobbers XMM0-XMM15 when the corresponding bit is set in RCX.  Don't
handle them in the TDX_MODULE_CALL macro but let the caller save and
restore when needed.

This is basically based on Peter's code.

Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Kai Huang <kai.huang@intel.com>
---
 arch/x86/coco/tdx/tdcall.S      |   4 +-
 arch/x86/include/asm/tdx.h      |  12 +++-
 arch/x86/kernel/asm-offsets.c   |   7 ++
 arch/x86/virt/vmx/tdx/tdxcall.S | 121 ++++++++++++++++++++++++++++++--
 4 files changed, 134 insertions(+), 10 deletions(-)

Comments

Peter Zijlstra July 12, 2023, 4:53 p.m. UTC | #1
On Wed, Jul 12, 2023 at 08:55:21PM +1200, Kai Huang wrote:


> @@ -72,7 +142,46 @@
>  	movq %r9,  TDX_MODULE_r9(%rsi)
>  	movq %r10, TDX_MODULE_r10(%rsi)
>  	movq %r11, TDX_MODULE_r11(%rsi)
> -	.endif
> +	.endif	/* \ret */
> +
> +	.if \saved
> +	.if \ret && \host
> +	/*
> +	 * Clear registers shared by guest for VP.ENTER to prevent
> +	 * speculative use of guest's values, including those are
> +	 * restored from the stack.
> +	 *
> +	 * See arch/x86/kvm/vmx/vmenter.S:
> +	 *
> +	 * In theory, a L1 cache miss when restoring register from stack
> +	 * could lead to speculative execution with guest's values.
> +	 *
> +	 * Note: RBP/RSP are not used as shared register.  RSI has been
> +	 * restored already.
> +	 *
> +	 * XOR is cheap, thus unconditionally do for all leafs.
> +	 */
> +	xorq %rcx, %rcx
> +	xorq %rdx, %rdx
> +	xorq %r8,  %r8
> +	xorq %r9,  %r9
> +	xorq %r10, %r10
> +	xorq %r11, %r11

> +	xorq %r12, %r12
> +	xorq %r13, %r13
> +	xorq %r14, %r14
> +	xorq %r15, %r15
> +	xorq %rbx, %rbx

^ those are an instant pop below, seems daft to clear them.

> +	xorq %rdi, %rdi
> +	.endif	/* \ret && \host */
> +
> +	/* Restore callee-saved GPRs as mandated by the x86_64 ABI */
> +	popq	%r15
> +	popq	%r14
> +	popq	%r13
> +	popq	%r12
> +	popq	%rbx
> +	.endif	/* \saved */
>  
>  	FRAME_END
>  	RET
> -- 
> 2.41.0
>
Peter Zijlstra July 12, 2023, 4:59 p.m. UTC | #2
On Wed, Jul 12, 2023 at 06:53:37PM +0200, Peter Zijlstra wrote:
> On Wed, Jul 12, 2023 at 08:55:21PM +1200, Kai Huang wrote:
> 
> 
> > @@ -72,7 +142,46 @@
> >  	movq %r9,  TDX_MODULE_r9(%rsi)
> >  	movq %r10, TDX_MODULE_r10(%rsi)
> >  	movq %r11, TDX_MODULE_r11(%rsi)
> > -	.endif
> > +	.endif	/* \ret */
> > +
> > +	.if \saved
> > +	.if \ret && \host
> > +	/*
> > +	 * Clear registers shared by guest for VP.ENTER to prevent
> > +	 * speculative use of guest's values, including those are
> > +	 * restored from the stack.
> > +	 *
> > +	 * See arch/x86/kvm/vmx/vmenter.S:
> > +	 *
> > +	 * In theory, a L1 cache miss when restoring register from stack
> > +	 * could lead to speculative execution with guest's values.
> > +	 *
> > +	 * Note: RBP/RSP are not used as shared register.  RSI has been
> > +	 * restored already.
> > +	 *
> > +	 * XOR is cheap, thus unconditionally do for all leafs.
> > +	 */
> > +	xorq %rcx, %rcx
> > +	xorq %rdx, %rdx
> > +	xorq %r8,  %r8
> > +	xorq %r9,  %r9
> > +	xorq %r10, %r10
> > +	xorq %r11, %r11
> 
> > +	xorq %r12, %r12
> > +	xorq %r13, %r13
> > +	xorq %r14, %r14
> > +	xorq %r15, %r15
> > +	xorq %rbx, %rbx
> 
> ^ those are an instant pop below, seems daft to clear them.

Also, please use the 32bit variant:

	xorl	%ecx, %ecx

saves a RAX prefix each.

> > +	xorq %rdi, %rdi
> > +	.endif	/* \ret && \host */
> > +
> > +	/* Restore callee-saved GPRs as mandated by the x86_64 ABI */
> > +	popq	%r15
> > +	popq	%r14
> > +	popq	%r13
> > +	popq	%r12
> > +	popq	%rbx
> > +	.endif	/* \saved */
> >  
> >  	FRAME_END
> >  	RET
> > -- 
> > 2.41.0
> >
Peter Zijlstra July 12, 2023, 5:11 p.m. UTC | #3
On Wed, Jul 12, 2023 at 08:55:21PM +1200, Kai Huang wrote:
> @@ -65,6 +104,37 @@
>  	.endif
>  
>  	.if \ret
> +	.if \saved
> +	/*
> +	 * Restore the structure from stack to saved the output registers
> +	 *
> +	 * In case of VP.ENTER returns due to TDVMCALL, all registers are
> +	 * valid thus no register can be used as spare to restore the
> +	 * structure from the stack (see "TDH.VP.ENTER Output Operands
> +	 * Definition on TDCALL(TDG.VP.VMCALL) Following a TD Entry").
> +	 * For this case, need to make one register as spare by saving it
> +	 * to the stack and then manually load the structure pointer to
> +	 * the spare register.
> +	 *
> +	 * Note for other TDCALLs/SEAMCALLs there are spare registers
> +	 * thus no need for such hack but just use this for all for now.
> +	 */
> +	pushq	%rax		/* save the TDCALL/SEAMCALL return code */
> +	movq	8(%rsp), %rax	/* restore the structure pointer */
> +	movq	%rsi, TDX_MODULE_rsi(%rax)	/* save %rsi */
> +	movq	%rax, %rsi	/* use %rsi as structure pointer */
> +	popq	%rax		/* restore the return code */
> +	popq	%rsi		/* pop the structure pointer */

Urgghh... At least for the \host case you can simply pop %rsi, no?
VP.ENTER returns with 0 there IIRC.

	/* xor-swap (%rsp) and %rax */
	xorq	(%rsp), %rax
	xorq	%rax, (%rsp)
	xorq	(%rsp), %rax

	movq	%rsi, TDX_MODULE_rsi(%rax);
	movq	%rax, %rsi

	popq	%rax

Isn't much better is it :/ Too bad xchg implies lock prefix.

> +
> +	/* Copy additional output regs to the structure  */
> +	movq %r12, TDX_MODULE_r12(%rsi)
> +	movq %r13, TDX_MODULE_r13(%rsi)
> +	movq %r14, TDX_MODULE_r14(%rsi)
> +	movq %r15, TDX_MODULE_r15(%rsi)
> +	movq %rbx, TDX_MODULE_rbx(%rsi)
> +	movq %rdi, TDX_MODULE_rdi(%rsi)
> +	.endif	/* \saved */
> +
>  	/* Copy output regs to the structure */
>  	movq %rcx, TDX_MODULE_rcx(%rsi)
>  	movq %rdx, TDX_MODULE_rdx(%rsi)
Huang, Kai July 13, 2023, 7:48 a.m. UTC | #4
On Wed, 2023-07-12 at 18:53 +0200, Peter Zijlstra wrote:
> On Wed, Jul 12, 2023 at 08:55:21PM +1200, Kai Huang wrote:
> 
> 
> > @@ -72,7 +142,46 @@
> >  	movq %r9,  TDX_MODULE_r9(%rsi)
> >  	movq %r10, TDX_MODULE_r10(%rsi)
> >  	movq %r11, TDX_MODULE_r11(%rsi)
> > -	.endif
> > +	.endif	/* \ret */
> > +
> > +	.if \saved
> > +	.if \ret && \host
> > +	/*
> > +	 * Clear registers shared by guest for VP.ENTER to prevent
> > +	 * speculative use of guest's values, including those are
> > +	 * restored from the stack.
> > +	 *
> > +	 * See arch/x86/kvm/vmx/vmenter.S:
> > +	 *
> > +	 * In theory, a L1 cache miss when restoring register from stack
> > +	 * could lead to speculative execution with guest's values.
> > +	 *
> > +	 * Note: RBP/RSP are not used as shared register.  RSI has been
> > +	 * restored already.
> > +	 *
> > +	 * XOR is cheap, thus unconditionally do for all leafs.
> > +	 */
> > +	xorq %rcx, %rcx
> > +	xorq %rdx, %rdx
> > +	xorq %r8,  %r8
> > +	xorq %r9,  %r9
> > +	xorq %r10, %r10
> > +	xorq %r11, %r11
> 
> > +	xorq %r12, %r12
> > +	xorq %r13, %r13
> > +	xorq %r14, %r14
> > +	xorq %r15, %r15
> > +	xorq %rbx, %rbx
> 
> ^ those are an instant pop below, seems daft to clear them.
> > 

I found below comment in KVM code:

> +	 * See arch/x86/kvm/vmx/vmenter.S:
> +	 *
> +	 * In theory, a L1 cache miss when restoring register from stack
> +	 * could lead to speculative execution with guest's values.

And KVM explicitly does XOR for the registers that gets "pop"ed almost
instantly, so I followed.

But to be honest I don't quite understand this.  :-)
Huang, Kai July 13, 2023, 8:02 a.m. UTC | #5
On Wed, 2023-07-12 at 18:59 +0200, Peter Zijlstra wrote:
> On Wed, Jul 12, 2023 at 06:53:37PM +0200, Peter Zijlstra wrote:
> > On Wed, Jul 12, 2023 at 08:55:21PM +1200, Kai Huang wrote:
> > 
> > 
> > > @@ -72,7 +142,46 @@
> > >  	movq %r9,  TDX_MODULE_r9(%rsi)
> > >  	movq %r10, TDX_MODULE_r10(%rsi)
> > >  	movq %r11, TDX_MODULE_r11(%rsi)
> > > -	.endif
> > > +	.endif	/* \ret */
> > > +
> > > +	.if \saved
> > > +	.if \ret && \host
> > > +	/*
> > > +	 * Clear registers shared by guest for VP.ENTER to prevent
> > > +	 * speculative use of guest's values, including those are
> > > +	 * restored from the stack.
> > > +	 *
> > > +	 * See arch/x86/kvm/vmx/vmenter.S:
> > > +	 *
> > > +	 * In theory, a L1 cache miss when restoring register from stack
> > > +	 * could lead to speculative execution with guest's values.
> > > +	 *
> > > +	 * Note: RBP/RSP are not used as shared register.  RSI has been
> > > +	 * restored already.
> > > +	 *
> > > +	 * XOR is cheap, thus unconditionally do for all leafs.
> > > +	 */
> > > +	xorq %rcx, %rcx
> > > +	xorq %rdx, %rdx
> > > +	xorq %r8,  %r8
> > > +	xorq %r9,  %r9
> > > +	xorq %r10, %r10
> > > +	xorq %r11, %r11
> > 
> > > +	xorq %r12, %r12
> > > +	xorq %r13, %r13
> > > +	xorq %r14, %r14
> > > +	xorq %r15, %r15
> > > +	xorq %rbx, %rbx
> > 
> > ^ those are an instant pop below, seems daft to clear them.
> 
> Also, please use the 32bit variant:
> 
> 	xorl	%ecx, %ecx
> 
> saves a RAX prefix each.

Sorry I am ignorant here.  Won't "clearing ECX only" leave high bits of
registers still containing guest's value?

I see KVM code uses:

        xor %eax, %eax
        xor %ecx, %ecx
        xor %edx, %edx
        xor %ebp, %ebp
        xor %esi, %esi
        xor %edi, %edi
#ifdef CONFIG_X86_64
        xor %r8d,  %r8d
        xor %r9d,  %r9d
        xor %r10d, %r10d
        xor %r11d, %r11d
        xor %r12d, %r12d
        xor %r13d, %r13d
        xor %r14d, %r14d
        xor %r15d, %r15d
#endif

Which makes sense because KVM wants to support 32-bit too.

However for TDX is 64-bit only.

And I also see the current TDVMCALL code has:

        xor %r8d,  %r8d
        xor %r9d,  %r9d
        xor %r10d, %r10d                                                       
        xor %r11d, %r11d                                                       
        xor %rdi,  %rdi                                                        
        xor %rdx,  %rdx

Why does it need to use "d" postfix for all r* registers?

Sorry for those questions but I struggled when I wrote those assembly and am
hoping to get my mind cleared on this. :-)

Thanks!
Huang, Kai July 13, 2023, 8:09 a.m. UTC | #6
On Wed, 2023-07-12 at 19:11 +0200, Peter Zijlstra wrote:
> On Wed, Jul 12, 2023 at 08:55:21PM +1200, Kai Huang wrote:
> > @@ -65,6 +104,37 @@
> >  	.endif
> >  
> >  	.if \ret
> > +	.if \saved
> > +	/*
> > +	 * Restore the structure from stack to saved the output registers
> > +	 *
> > +	 * In case of VP.ENTER returns due to TDVMCALL, all registers are
> > +	 * valid thus no register can be used as spare to restore the
> > +	 * structure from the stack (see "TDH.VP.ENTER Output Operands
> > +	 * Definition on TDCALL(TDG.VP.VMCALL) Following a TD Entry").
> > +	 * For this case, need to make one register as spare by saving it
> > +	 * to the stack and then manually load the structure pointer to
> > +	 * the spare register.
> > +	 *
> > +	 * Note for other TDCALLs/SEAMCALLs there are spare registers
> > +	 * thus no need for such hack but just use this for all for now.
> > +	 */
> > +	pushq	%rax		/* save the TDCALL/SEAMCALL return code */
> > +	movq	8(%rsp), %rax	/* restore the structure pointer */
> > +	movq	%rsi, TDX_MODULE_rsi(%rax)	/* save %rsi */
> > +	movq	%rax, %rsi	/* use %rsi as structure pointer */
> > +	popq	%rax		/* restore the return code */
> > +	popq	%rsi		/* pop the structure pointer */
> 
> Urgghh... At least for the \host case you can simply pop %rsi, no?
> VP.ENTER returns with 0 there IIRC.

No VP.ENTER doesn't return 0 for RAX.  Firstly, VP.ENTER can return for many
reasons but not limited to TDVMCALL (e.g., due to interrupt), and for those
cases RAX contains valid non-zero value.  Secondly, even for TDVMCALL, the RAX
isn't 0:

Table 6.256 TDX 1.5 ABI spec:

RAX	SEAMCALL instruction return code
	The DETAILS_L2 field in bits 31:0 contain the VMCS exit reason, 
	indicating TDCALL (77).
Peter Zijlstra July 13, 2023, 8:43 a.m. UTC | #7
On Thu, Jul 13, 2023 at 08:02:54AM +0000, Huang, Kai wrote:

> Sorry I am ignorant here.  Won't "clearing ECX only" leave high bits of
> registers still containing guest's value?

architecture zero-extends 32bit stores

> I see KVM code uses:
> 
>         xor %eax, %eax
>         xor %ecx, %ecx
>         xor %edx, %edx
>         xor %ebp, %ebp
>         xor %esi, %esi
>         xor %edi, %edi
> #ifdef CONFIG_X86_64
>         xor %r8d,  %r8d
>         xor %r9d,  %r9d
>         xor %r10d, %r10d
>         xor %r11d, %r11d
>         xor %r12d, %r12d
>         xor %r13d, %r13d
>         xor %r14d, %r14d
>         xor %r15d, %r15d
> #endif
> 
> Which makes sense because KVM wants to support 32-bit too.

Encoding for the first lot is shorter, the 64bit regs obviously need the
RAX byte anyway.

> However for TDX is 64-bit only.
> 
> And I also see the current TDVMCALL code has:
> 
>         xor %r8d,  %r8d
>         xor %r9d,  %r9d
>         xor %r10d, %r10d                                                       
>         xor %r11d, %r11d                                                       
>         xor %rdi,  %rdi                                                        
>         xor %rdx,  %rdx
> 
> Why does it need to use "d" postfix for all r* registers?

That's the name of the 32bit subword, r#[bwd] for byte, word,
double-word. SDM v1 3.7.2.1 has the whole list, I couldn't quicky find
one for the zero-extention thing.

> Sorry for those questions but I struggled when I wrote those assembly and am
> hoping to get my mind cleared on this. :-)

No problem.
Peter Zijlstra July 13, 2023, 8:46 a.m. UTC | #8
On Thu, Jul 13, 2023 at 07:48:20AM +0000, Huang, Kai wrote:

> I found below comment in KVM code:
> 
> > +	 * See arch/x86/kvm/vmx/vmenter.S:
> > +	 *
> > +	 * In theory, a L1 cache miss when restoring register from stack
> > +	 * could lead to speculative execution with guest's values.
> 
> And KVM explicitly does XOR for the registers that gets "pop"ed almost
> instantly, so I followed.
> 
> But to be honest I don't quite understand this.  :-)

Urgh, I suppose that actually makes sense. Since pop is a load it might
continue speculation with the previous value. Whereas the xor-clear
idiom is impossible to speculate through.

Oh well...
Peter Zijlstra July 13, 2023, 9:01 a.m. UTC | #9
On Thu, Jul 13, 2023 at 08:09:33AM +0000, Huang, Kai wrote:
> On Wed, 2023-07-12 at 19:11 +0200, Peter Zijlstra wrote:
> > On Wed, Jul 12, 2023 at 08:55:21PM +1200, Kai Huang wrote:
> > > @@ -65,6 +104,37 @@
> > >  	.endif
> > >  
> > >  	.if \ret
> > > +	.if \saved
> > > +	/*
> > > +	 * Restore the structure from stack to saved the output registers
> > > +	 *
> > > +	 * In case of VP.ENTER returns due to TDVMCALL, all registers are
> > > +	 * valid thus no register can be used as spare to restore the
> > > +	 * structure from the stack (see "TDH.VP.ENTER Output Operands
> > > +	 * Definition on TDCALL(TDG.VP.VMCALL) Following a TD Entry").
> > > +	 * For this case, need to make one register as spare by saving it
> > > +	 * to the stack and then manually load the structure pointer to
> > > +	 * the spare register.
> > > +	 *
> > > +	 * Note for other TDCALLs/SEAMCALLs there are spare registers
> > > +	 * thus no need for such hack but just use this for all for now.
> > > +	 */
> > > +	pushq	%rax		/* save the TDCALL/SEAMCALL return code */
> > > +	movq	8(%rsp), %rax	/* restore the structure pointer */
> > > +	movq	%rsi, TDX_MODULE_rsi(%rax)	/* save %rsi */
> > > +	movq	%rax, %rsi	/* use %rsi as structure pointer */
> > > +	popq	%rax		/* restore the return code */
> > > +	popq	%rsi		/* pop the structure pointer */
> > 
> > Urgghh... At least for the \host case you can simply pop %rsi, no?
> > VP.ENTER returns with 0 there IIRC.
> 
> No VP.ENTER doesn't return 0 for RAX.  Firstly, VP.ENTER can return for many

No, but it *does* return 0 for: RBX,RSI,RDI,R10-R15.

So for \host you can simply do:

	pop	%rsi
	mov	$0, TDX_MODULE_rsi(%rsi)

and call it a day.
Huang, Kai July 13, 2023, 9:15 a.m. UTC | #10
On Thu, 2023-07-13 at 11:01 +0200, Peter Zijlstra wrote:
> On Thu, Jul 13, 2023 at 08:09:33AM +0000, Huang, Kai wrote:
> > On Wed, 2023-07-12 at 19:11 +0200, Peter Zijlstra wrote:
> > > On Wed, Jul 12, 2023 at 08:55:21PM +1200, Kai Huang wrote:
> > > > @@ -65,6 +104,37 @@
> > > >  	.endif
> > > >  
> > > >  	.if \ret
> > > > +	.if \saved
> > > > +	/*
> > > > +	 * Restore the structure from stack to saved the output registers
> > > > +	 *
> > > > +	 * In case of VP.ENTER returns due to TDVMCALL, all registers are
> > > > +	 * valid thus no register can be used as spare to restore the
> > > > +	 * structure from the stack (see "TDH.VP.ENTER Output Operands
> > > > +	 * Definition on TDCALL(TDG.VP.VMCALL) Following a TD Entry").
> > > > +	 * For this case, need to make one register as spare by saving it
> > > > +	 * to the stack and then manually load the structure pointer to
> > > > +	 * the spare register.
> > > > +	 *
> > > > +	 * Note for other TDCALLs/SEAMCALLs there are spare registers
> > > > +	 * thus no need for such hack but just use this for all for now.
> > > > +	 */
> > > > +	pushq	%rax		/* save the TDCALL/SEAMCALL return code */
> > > > +	movq	8(%rsp), %rax	/* restore the structure pointer */
> > > > +	movq	%rsi, TDX_MODULE_rsi(%rax)	/* save %rsi */
> > > > +	movq	%rax, %rsi	/* use %rsi as structure pointer */
> > > > +	popq	%rax		/* restore the return code */
> > > > +	popq	%rsi		/* pop the structure pointer */
> > > 
> > > Urgghh... At least for the \host case you can simply pop %rsi, no?
> > > VP.ENTER returns with 0 there IIRC.
> > 
> > No VP.ENTER doesn't return 0 for RAX.  Firstly, VP.ENTER can return for many
> 
> No, but it *does* return 0 for: RBX,RSI,RDI,R10-R15.
> 
> So for \host you can simply do:
> 
> 	pop	%rsi
> 	mov	$0, TDX_MODULE_rsi(%rsi)
> 
> and call it a day.

This isn't true for the case that VP.ENTER returns due to a TDVMCALL.  In that
case RCX contains the bitmap of shared registers, and RBX/RDX/RDI/RSI/R8-R15
contains guest value if the corresponding bit is set in RCX (RBP will be
excluded by updating the spec I assume).

Or are you suggesting we need to decode RAX to decide whether the VP.ENTER
return is due to TDVMCALL vs other reasons, and act differently?
Peter Zijlstra July 13, 2023, 9:25 a.m. UTC | #11
On Thu, Jul 13, 2023 at 09:15:49AM +0000, Huang, Kai wrote:
> On Thu, 2023-07-13 at 11:01 +0200, Peter Zijlstra wrote:
> > On Thu, Jul 13, 2023 at 08:09:33AM +0000, Huang, Kai wrote:
> > > On Wed, 2023-07-12 at 19:11 +0200, Peter Zijlstra wrote:
> > > > On Wed, Jul 12, 2023 at 08:55:21PM +1200, Kai Huang wrote:
> > > > > @@ -65,6 +104,37 @@
> > > > >  	.endif
> > > > >  
> > > > >  	.if \ret
> > > > > +	.if \saved
> > > > > +	/*
> > > > > +	 * Restore the structure from stack to saved the output registers
> > > > > +	 *
> > > > > +	 * In case of VP.ENTER returns due to TDVMCALL, all registers are
> > > > > +	 * valid thus no register can be used as spare to restore the
> > > > > +	 * structure from the stack (see "TDH.VP.ENTER Output Operands
> > > > > +	 * Definition on TDCALL(TDG.VP.VMCALL) Following a TD Entry").
> > > > > +	 * For this case, need to make one register as spare by saving it
> > > > > +	 * to the stack and then manually load the structure pointer to
> > > > > +	 * the spare register.
> > > > > +	 *
> > > > > +	 * Note for other TDCALLs/SEAMCALLs there are spare registers
> > > > > +	 * thus no need for such hack but just use this for all for now.
> > > > > +	 */
> > > > > +	pushq	%rax		/* save the TDCALL/SEAMCALL return code */
> > > > > +	movq	8(%rsp), %rax	/* restore the structure pointer */
> > > > > +	movq	%rsi, TDX_MODULE_rsi(%rax)	/* save %rsi */
> > > > > +	movq	%rax, %rsi	/* use %rsi as structure pointer */
> > > > > +	popq	%rax		/* restore the return code */
> > > > > +	popq	%rsi		/* pop the structure pointer */
> > > > 
> > > > Urgghh... At least for the \host case you can simply pop %rsi, no?
> > > > VP.ENTER returns with 0 there IIRC.
> > > 
> > > No VP.ENTER doesn't return 0 for RAX.  Firstly, VP.ENTER can return for many
> > 
> > No, but it *does* return 0 for: RBX,RSI,RDI,R10-R15.
> > 
> > So for \host you can simply do:
> > 
> > 	pop	%rsi
> > 	mov	$0, TDX_MODULE_rsi(%rsi)
> > 
> > and call it a day.
> 
> This isn't true for the case that VP.ENTER returns due to a TDVMCALL.  In that
> case RCX contains the bitmap of shared registers, and RBX/RDX/RDI/RSI/R8-R15
> contains guest value if the corresponding bit is set in RCX (RBP will be
> excluded by updating the spec I assume).
> 
> Or are you suggesting we need to decode RAX to decide whether the VP.ENTER
> return is due to TDVMCALL vs other reasons, and act differently?

Urgh, no I had missed there are *TWO* tables for output :/ Who does
something like that :-(

So yeah, sucks.
Huang, Kai July 13, 2023, 9:34 a.m. UTC | #12
On Thu, 2023-07-13 at 10:46 +0200, Peter Zijlstra wrote:
> On Thu, Jul 13, 2023 at 07:48:20AM +0000, Huang, Kai wrote:
> 
> > I found below comment in KVM code:
> > 
> > > +	 * See arch/x86/kvm/vmx/vmenter.S:
> > > +	 *
> > > +	 * In theory, a L1 cache miss when restoring register from stack
> > > +	 * could lead to speculative execution with guest's values.
> > 
> > And KVM explicitly does XOR for the registers that gets "pop"ed almost
> > instantly, so I followed.
> > 
> > But to be honest I don't quite understand this.  :-)
> 
> Urgh, I suppose that actually makes sense. Since pop is a load it might
> continue speculation with the previous value. Whereas the xor-clear
> idiom is impossible to speculate through.
> 
> Oh well...

Then should I keep those registers that are "pop"ed immediately afterwards?
Peter Zijlstra July 13, 2023, 9:40 a.m. UTC | #13
On Thu, Jul 13, 2023 at 09:34:24AM +0000, Huang, Kai wrote:
> On Thu, 2023-07-13 at 10:46 +0200, Peter Zijlstra wrote:
> > On Thu, Jul 13, 2023 at 07:48:20AM +0000, Huang, Kai wrote:
> > 
> > > I found below comment in KVM code:
> > > 
> > > > +	 * See arch/x86/kvm/vmx/vmenter.S:
> > > > +	 *
> > > > +	 * In theory, a L1 cache miss when restoring register from stack
> > > > +	 * could lead to speculative execution with guest's values.
> > > 
> > > And KVM explicitly does XOR for the registers that gets "pop"ed almost
> > > instantly, so I followed.
> > > 
> > > But to be honest I don't quite understand this.  :-)
> > 
> > Urgh, I suppose that actually makes sense. Since pop is a load it might
> > continue speculation with the previous value. Whereas the xor-clear
> > idiom is impossible to speculate through.
> > 
> > Oh well...
> 
> Then should I keep those registers that are "pop"ed immediately afterwards?

Yeah, I suppose so.
Huang, Kai July 13, 2023, 10:01 a.m. UTC | #14
On Thu, 2023-07-13 at 11:25 +0200, Peter Zijlstra wrote:
> On Thu, Jul 13, 2023 at 09:15:49AM +0000, Huang, Kai wrote:
> > On Thu, 2023-07-13 at 11:01 +0200, Peter Zijlstra wrote:
> > > On Thu, Jul 13, 2023 at 08:09:33AM +0000, Huang, Kai wrote:
> > > > On Wed, 2023-07-12 at 19:11 +0200, Peter Zijlstra wrote:
> > > > > On Wed, Jul 12, 2023 at 08:55:21PM +1200, Kai Huang wrote:
> > > > > > @@ -65,6 +104,37 @@
> > > > > >  	.endif
> > > > > >  
> > > > > >  	.if \ret
> > > > > > +	.if \saved
> > > > > > +	/*
> > > > > > +	 * Restore the structure from stack to saved the output registers
> > > > > > +	 *
> > > > > > +	 * In case of VP.ENTER returns due to TDVMCALL, all registers are
> > > > > > +	 * valid thus no register can be used as spare to restore the
> > > > > > +	 * structure from the stack (see "TDH.VP.ENTER Output Operands
> > > > > > +	 * Definition on TDCALL(TDG.VP.VMCALL) Following a TD Entry").
> > > > > > +	 * For this case, need to make one register as spare by saving it
> > > > > > +	 * to the stack and then manually load the structure pointer to
> > > > > > +	 * the spare register.
> > > > > > +	 *
> > > > > > +	 * Note for other TDCALLs/SEAMCALLs there are spare registers
> > > > > > +	 * thus no need for such hack but just use this for all for now.
> > > > > > +	 */
> > > > > > +	pushq	%rax		/* save the TDCALL/SEAMCALL return code */
> > > > > > +	movq	8(%rsp), %rax	/* restore the structure pointer */
> > > > > > +	movq	%rsi, TDX_MODULE_rsi(%rax)	/* save %rsi */
> > > > > > +	movq	%rax, %rsi	/* use %rsi as structure pointer */
> > > > > > +	popq	%rax		/* restore the return code */
> > > > > > +	popq	%rsi		/* pop the structure pointer */
> > > > > 
> > > > > Urgghh... At least for the \host case you can simply pop %rsi, no?
> > > > > VP.ENTER returns with 0 there IIRC.
> > > > 
> > > > No VP.ENTER doesn't return 0 for RAX.  Firstly, VP.ENTER can return for many
> > > 
> > > No, but it *does* return 0 for: RBX,RSI,RDI,R10-R15.
> > > 
> > > So for \host you can simply do:
> > > 
> > > 	pop	%rsi
> > > 	mov	$0, TDX_MODULE_rsi(%rsi)
> > > 
> > > and call it a day.
> > 
> > This isn't true for the case that VP.ENTER returns due to a TDVMCALL.  In that
> > case RCX contains the bitmap of shared registers, and RBX/RDX/RDI/RSI/R8-R15
> > contains guest value if the corresponding bit is set in RCX (RBP will be
> > excluded by updating the spec I assume).
> > 
> > Or are you suggesting we need to decode RAX to decide whether the VP.ENTER
> > return is due to TDVMCALL vs other reasons, and act differently?
> 
> Urgh, no I had missed there are *TWO* tables for output :/ Who does
> something like that :-(
> 
> So yeah, sucks.

Yeah.  I think for code simplicity we should just do the way the current patch
has implemented :-)
Huang, Kai July 13, 2023, 10:19 a.m. UTC | #15
On Thu, 2023-07-13 at 10:43 +0200, Peter Zijlstra wrote:
> On Thu, Jul 13, 2023 at 08:02:54AM +0000, Huang, Kai wrote:
> 
> > Sorry I am ignorant here.  Won't "clearing ECX only" leave high bits of
> > registers still containing guest's value?
> 
> architecture zero-extends 32bit stores

Sorry, where can I find this information? Looking at SDM I couldn't find :-(

> 
> > I see KVM code uses:
> > 
> >         xor %eax, %eax
> >         xor %ecx, %ecx
> >         xor %edx, %edx
> >         xor %ebp, %ebp
> >         xor %esi, %esi
> >         xor %edi, %edi
> > #ifdef CONFIG_X86_64
> >         xor %r8d,  %r8d
> >         xor %r9d,  %r9d
> >         xor %r10d, %r10d
> >         xor %r11d, %r11d
> >         xor %r12d, %r12d
> >         xor %r13d, %r13d
> >         xor %r14d, %r14d
> >         xor %r15d, %r15d
> > #endif
> > 
> > Which makes sense because KVM wants to support 32-bit too.
> 
> Encoding for the first lot is shorter, the 64bit regs obviously need the
> RAX byte anyway.
> 
> > However for TDX is 64-bit only.
> > 
> > And I also see the current TDVMCALL code has:
> > 
> >         xor %r8d,  %r8d
> >         xor %r9d,  %r9d
> >         xor %r10d, %r10d                                                       
> >         xor %r11d, %r11d                                                       
> >         xor %rdi,  %rdi                                                        
> >         xor %rdx,  %rdx
> > 
> > Why does it need to use "d" postfix for all r* registers?
> 
> That's the name of the 32bit subword, r#[bwd] for byte, word,
> double-word. SDM v1 3.7.2.1 has the whole list, I couldn't quicky find
> one for the zero-extention thing
> 
> > Sorry for those questions but I struggled when I wrote those assembly and am
> > hoping to get my mind cleared on this. :-)
> 
> No problem.
> 

I _think_ I understand now? In 64-bit mode

	xor %eax, %eax

equals to

	xor %rax, %rax

(due to "architecture zero-extends 32bit stores")

Thus using the former (plus using "d" for %r*) can save some memory?
Huang, Kai July 13, 2023, 10:24 a.m. UTC | #16
On Thu, 2023-07-13 at 10:19 +0000, Huang, Kai wrote:
> On Thu, 2023-07-13 at 10:43 +0200, Peter Zijlstra wrote:
> > On Thu, Jul 13, 2023 at 08:02:54AM +0000, Huang, Kai wrote:
> > 
> > > Sorry I am ignorant here.  Won't "clearing ECX only" leave high bits of
> > > registers still containing guest's value?
> > 
> > architecture zero-extends 32bit stores
> 
> Sorry, where can I find this information? Looking at SDM I couldn't find :-(
> 
> 

Hmm.. I think I found it -- it's in SDM vol 1:

3.4.1.1 General-Purpose Registers in 64-Bit Mode

32-bit operands generate a 32-bit result, zero-extended to a 64-bit result in
the destination general-purpose register.

Sorry for the noise!
Peter Zijlstra July 13, 2023, 10:37 a.m. UTC | #17
On Thu, Jul 13, 2023 at 10:19:49AM +0000, Huang, Kai wrote:
> On Thu, 2023-07-13 at 10:43 +0200, Peter Zijlstra wrote:
> > On Thu, Jul 13, 2023 at 08:02:54AM +0000, Huang, Kai wrote:
> > 
> > > Sorry I am ignorant here.  Won't "clearing ECX only" leave high bits of
> > > registers still containing guest's value?
> > 
> > architecture zero-extends 32bit stores
> 
> Sorry, where can I find this information? Looking at SDM I couldn't find :-(

Yeah, I couldn't find it in a hurry either, but bpetkov pasted me this
from the AMD document:

 "In 64-bit mode, the following general rules apply to instructions and their operands:
 “Promoted to 64 Bit”: If an instruction’s operand size (16-bit or 32-bit) in legacy and
 compatibility modes depends on the CS.D bit and the operand-size override prefix, then the
 operand-size choices in 64-bit mode are extended from 16-bit and 32-bit to include 64 bits (with a
 REX prefix), or the operand size is fixed at 64 bits. Such instructions are said to be “Promoted to
 64 bits” in Table B-1. However, byte-operand opcodes of such instructions are not promoted."

> I _think_ I understand now? In 64-bit mode
> 
> 	xor %eax, %eax
> 
> equals to
> 
> 	xor %rax, %rax
> 
> (due to "architecture zero-extends 32bit stores")
> 
> Thus using the former (plus using "d" for %r*) can save some memory?

Yes, 64bit wide instruction get a REX prefix 0x4X (somehow I keep typing
RAX) byte in front to tell it's a 64bit wide op.

   31 c0                   xor    %eax,%eax
   48 31 c0                xor    %rax,%rax

The REX byte will show up for rN usage, because then we need the actual
Register Extention part of that prefix irrespective of the width.

   45 31 d2                xor    %r10d,%r10d
   4d 31 d2                xor    %r10,%r10

x86 instruction encoding is 'fun' :-)

See SDM Vol 2 2.2.1.2 if you want to know more about the REX prefix.
Peter Zijlstra July 13, 2023, 10:39 a.m. UTC | #18
On Thu, Jul 13, 2023 at 10:24:48AM +0000, Huang, Kai wrote:
> On Thu, 2023-07-13 at 10:19 +0000, Huang, Kai wrote:
> > On Thu, 2023-07-13 at 10:43 +0200, Peter Zijlstra wrote:
> > > On Thu, Jul 13, 2023 at 08:02:54AM +0000, Huang, Kai wrote:
> > > 
> > > > Sorry I am ignorant here.  Won't "clearing ECX only" leave high bits of
> > > > registers still containing guest's value?
> > > 
> > > architecture zero-extends 32bit stores
> > 
> > Sorry, where can I find this information? Looking at SDM I couldn't find :-(
> > 
> > 
> 
> Hmm.. I think I found it -- it's in SDM vol 1:
> 
> 3.4.1.1 General-Purpose Registers in 64-Bit Mode
> 
> 32-bit operands generate a 32-bit result, zero-extended to a 64-bit result in
> the destination general-purpose register.

Yes, that's it.
Huang, Kai July 13, 2023, 10:47 a.m. UTC | #19
On Thu, 2023-07-13 at 12:37 +0200, Peter Zijlstra wrote:
> On Thu, Jul 13, 2023 at 10:19:49AM +0000, Huang, Kai wrote:
> > On Thu, 2023-07-13 at 10:43 +0200, Peter Zijlstra wrote:
> > > On Thu, Jul 13, 2023 at 08:02:54AM +0000, Huang, Kai wrote:
> > > 
> > > > Sorry I am ignorant here.  Won't "clearing ECX only" leave high bits of
> > > > registers still containing guest's value?
> > > 
> > > architecture zero-extends 32bit stores
> > 
> > Sorry, where can I find this information? Looking at SDM I couldn't find :-(
> 
> Yeah, I couldn't find it in a hurry either, but bpetkov pasted me this
> from the AMD document:
> 
>  "In 64-bit mode, the following general rules apply to instructions and their operands:
>  “Promoted to 64 Bit”: If an instruction’s operand size (16-bit or 32-bit) in legacy and
>  compatibility modes depends on the CS.D bit and the operand-size override prefix, then the
>  operand-size choices in 64-bit mode are extended from 16-bit and 32-bit to include 64 bits (with a
>  REX prefix), or the operand size is fixed at 64 bits. Such instructions are said to be “Promoted to
>  64 bits” in Table B-1. However, byte-operand opcodes of such instructions are not promoted."
> 
> > I _think_ I understand now? In 64-bit mode
> > 
> > 	xor %eax, %eax
> > 
> > equals to
> > 
> > 	xor %rax, %rax
> > 
> > (due to "architecture zero-extends 32bit stores")
> > 
> > Thus using the former (plus using "d" for %r*) can save some memory?
> 
> Yes, 64bit wide instruction get a REX prefix 0x4X (somehow I keep typing
> RAX) byte in front to tell it's a 64bit wide op.
> 
>    31 c0                   xor    %eax,%eax
>    48 31 c0                xor    %rax,%rax
> 
> The REX byte will show up for rN usage, because then we need the actual
> Register Extention part of that prefix irrespective of the width.
> 
>    45 31 d2                xor    %r10d,%r10d
>    4d 31 d2                xor    %r10,%r10
> 
> x86 instruction encoding is 'fun' :-)
> 
> See SDM Vol 2 2.2.1.2 if you want to know more about the REX prefix.

Learned something new.  Appreciate your time! :-)
Andrew Cooper July 13, 2023, 11:22 a.m. UTC | #20
On Thu, 13 Jul 2023 10:47:44 +0000, Huang, Kai wrote:
> On Thu, 2023-07-13 at 12:37 +0200, Peter Zijlstra wrote:
> > On Thu, Jul 13, 2023 at 10:19:49AM +0000, Huang, Kai wrote:
> > > On Thu, 2023-07-13 at 10:43 +0200, Peter Zijlstra wrote:
> > > > On Thu, Jul 13, 2023 at 08:02:54AM +0000, Huang, Kai wrote:
> > > >
> > > > > Sorry I am ignorant here.  Won't "clearing ECX only" leave high bits of
> > > > > registers still containing guest's value?
> > > >
> > > > architecture zero-extends 32bit stores
> > >
> > > Sorry, where can I find this information? Looking at SDM I couldn't find :-(
> >
> > Yeah, I couldn't find it in a hurry either, but bpetkov pasted me this
> > from the AMD document:
> >
> >  "In 64-bit mode, the following general rules apply to instructions and their operands:
> >  “Promoted to 64 Bit”: If an instruction’s operand size (16-bit or 32-bit) in legacy and
> >  compatibility modes depends on the CS.D bit and the operand-size override prefix, then the
> >  operand-size choices in 64-bit mode are extended from 16-bit and 32-bit to include 64 bits (with a
> >  REX prefix), or the operand size is fixed at 64 bits. Such instructions are said to be “Promoted to
> >  64 bits” in Table B-1. However, byte-operand opcodes of such instructions are not promoted."
> >
> > > I _think_ I understand now? In 64-bit mode
> > >
> > > 	xor %eax, %eax
> > >
> > > equals to
> > >
> > > 	xor %rax, %rax
> > >
> > > (due to "architecture zero-extends 32bit stores")
> > >
> > > Thus using the former (plus using "d" for %r*) can save some memory?
> >
> > Yes, 64bit wide instruction get a REX prefix 0x4X (somehow I keep typing
> > RAX) byte in front to tell it's a 64bit wide op.
> >
> >    31 c0                   xor    %eax,%eax
> >    48 31 c0                xor    %rax,%rax
> >
> > The REX byte will show up for rN usage, because then we need the actual
> > Register Extention part of that prefix irrespective of the width.
> >
> >    45 31 d2                xor    %r10d,%r10d
> >    4d 31 d2                xor    %r10,%r10
> >
> > x86 instruction encoding is 'fun' :-)
> >
> > See SDM Vol 2 2.2.1.2 if you want to know more about the REX prefix.
>
> Learned something new.  Appreciate your time! :-)

And now for the extra fun...

The Silvermont uarch is 64bit, but only recognises 32bit XORs as zeroing
idioms.

So for best performance on as many uarches as possible, you should *always*
use the 32bit forms, even for %r8-15.

~Andrew
Huang, Kai July 13, 2023, 11:40 a.m. UTC | #21
On Thu, 2023-07-13 at 12:22 +0100, Andrew Cooper wrote:
> On Thu, 13 Jul 2023 10:47:44 +0000, Huang, Kai wrote:
> > On Thu, 2023-07-13 at 12:37 +0200, Peter Zijlstra wrote:
> > > On Thu, Jul 13, 2023 at 10:19:49AM +0000, Huang, Kai wrote:
> > > > On Thu, 2023-07-13 at 10:43 +0200, Peter Zijlstra wrote:
> > > > > On Thu, Jul 13, 2023 at 08:02:54AM +0000, Huang, Kai wrote:
> > > > > 
> > > > > > Sorry I am ignorant here.  Won't "clearing ECX only" leave high bits of
> > > > > > registers still containing guest's value?
> > > > > 
> > > > > architecture zero-extends 32bit stores
> > > > 
> > > > Sorry, where can I find this information? Looking at SDM I couldn't find :-(
> > > 
> > > Yeah, I couldn't find it in a hurry either, but bpetkov pasted me this
> > > from the AMD document:
> > > 
> > >  "In 64-bit mode, the following general rules apply to instructions and their operands:
> > >  “Promoted to 64 Bit”: If an instruction’s operand size (16-bit or 32-bit) in legacy and
> > >  compatibility modes depends on the CS.D bit and the operand-size override prefix, then the
> > >  operand-size choices in 64-bit mode are extended from 16-bit and 32-bit to include 64 bits (with a
> > >  REX prefix), or the operand size is fixed at 64 bits. Such instructions are said to be “Promoted to
> > >  64 bits” in Table B-1. However, byte-operand opcodes of such instructions are not promoted."
> > > 
> > > > I _think_ I understand now? In 64-bit mode
> > > > 
> > > > 	xor %eax, %eax
> > > > 
> > > > equals to
> > > > 
> > > > 	xor %rax, %rax
> > > > 
> > > > (due to "architecture zero-extends 32bit stores")
> > > > 
> > > > Thus using the former (plus using "d" for %r*) can save some memory?
> > > 
> > > Yes, 64bit wide instruction get a REX prefix 0x4X (somehow I keep typing
> > > RAX) byte in front to tell it's a 64bit wide op.
> > > 
> > >    31 c0                   xor    %eax,%eax
> > >    48 31 c0                xor    %rax,%rax
> > > 
> > > The REX byte will show up for rN usage, because then we need the actual
> > > Register Extention part of that prefix irrespective of the width.
> > > 
> > >    45 31 d2                xor    %r10d,%r10d
> > >    4d 31 d2                xor    %r10,%r10
> > > 
> > > x86 instruction encoding is 'fun' :-)
> > > 
> > > See SDM Vol 2 2.2.1.2 if you want to know more about the REX prefix.
> > 
> > Learned something new.  Appreciate your time! :-)
> 
> And now for the extra fun...
> 
> The Silvermont uarch is 64bit, but only recognises 32bit XORs as zeroing
> idioms.
> 
> So for best performance on as many uarches as possible, you should *always*
> use the 32bit forms, even for %r8-15.
> 
> 

Ah thanks Andrew for the tip :)
diff mbox series

Patch

diff --git a/arch/x86/coco/tdx/tdcall.S b/arch/x86/coco/tdx/tdcall.S
index 46847e85c372..d5693330ee9b 100644
--- a/arch/x86/coco/tdx/tdcall.S
+++ b/arch/x86/coco/tdx/tdcall.S
@@ -51,7 +51,7 @@ 
  * Return status of TDCALL via RAX.
  */
 SYM_FUNC_START(__tdcall)
-	TDX_MODULE_CALL host=0 ret=0
+	TDX_MODULE_CALL host=0 ret=0 saved=0
 SYM_FUNC_END(__tdcall)
 
 /*
@@ -66,7 +66,7 @@  SYM_FUNC_END(__tdcall)
  * Return status of TDCALL via RAX.
  */
 SYM_FUNC_START(__tdcall_ret)
-	TDX_MODULE_CALL host=0 ret=1
+	TDX_MODULE_CALL host=0 ret=1 saved=0
 SYM_FUNC_END(__tdcall_ret)
 
 /*
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index e353c7ec5f24..a549bcc77f4f 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -27,14 +27,22 @@ 
  * This is a software only structure and not part of the TDX module/VMM ABI.
  */
 struct tdx_module_args {
-	/* input/output */
+	/* callee-clobbered */
 	u64 rcx;
 	u64 rdx;
 	u64 r8;
 	u64 r9;
-	/* additional output */
+	/* extra callee-clobbered */
 	u64 r10;
 	u64 r11;
+	/* callee-saved + rdi/rsi */
+	u64 r12;
+	u64 r13;
+	u64 r14;
+	u64 r15;
+	u64 rbx;
+	u64 rdi;
+	u64 rsi;
 };
 
 /*
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 50383bc46dd7..1581564a67b7 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -74,6 +74,13 @@  static void __used common(void)
 	OFFSET(TDX_MODULE_r9,  tdx_module_args, r9);
 	OFFSET(TDX_MODULE_r10, tdx_module_args, r10);
 	OFFSET(TDX_MODULE_r11, tdx_module_args, r11);
+	OFFSET(TDX_MODULE_r12, tdx_module_args, r12);
+	OFFSET(TDX_MODULE_r13, tdx_module_args, r13);
+	OFFSET(TDX_MODULE_r14, tdx_module_args, r14);
+	OFFSET(TDX_MODULE_r15, tdx_module_args, r15);
+	OFFSET(TDX_MODULE_rbx, tdx_module_args, rbx);
+	OFFSET(TDX_MODULE_rdi, tdx_module_args, rdi);
+	OFFSET(TDX_MODULE_rsi, tdx_module_args, rsi);
 
 	BLANK();
 	OFFSET(TDX_HYPERCALL_r8,  tdx_hypercall_args, r8);
diff --git a/arch/x86/virt/vmx/tdx/tdxcall.S b/arch/x86/virt/vmx/tdx/tdxcall.S
index 0af1374ad8f5..389f4ec5eee8 100644
--- a/arch/x86/virt/vmx/tdx/tdxcall.S
+++ b/arch/x86/virt/vmx/tdx/tdxcall.S
@@ -23,17 +23,25 @@ 
  *-------------------------------------------------------------------------
  * Input Registers:
  *
- * RAX                 - TDCALL/SEAMCALL Leaf number.
- * RCX,RDX,R8-R9       - TDCALL/SEAMCALL Leaf specific input registers.
+ * RAX                        - TDCALL/SEAMCALL Leaf number.
+ * RCX,RDX,RDI,RSI,RBX,R8-R15 - TDCALL/SEAMCALL Leaf specific input registers.
  *
  * Output Registers:
  *
- * RAX                 - TDCALL/SEAMCALL instruction error code.
- * RCX,RDX,R8-R11      - TDCALL/SEAMCALL Leaf specific output registers.
+ * RAX                        - TDCALL/SEAMCALL Leaf number.
+ * RCX,RDX,RDI,RSI,RBX,R8-R15 - TDCALL/SEAMCALL Leaf specific output registers.
  *
  *-------------------------------------------------------------------------
+ *
+ * So while the common core (RAX,RCX,RDX,R8-R11) fits nicely in the
+ * callee-clobbered registers and even leaves RDI,RSI free to act as a
+ * base pointer, some leafs (e.g., VP.ENTER) make a giant mess of things.
+ *
+ * For simplicity, assume that anything that needs the callee-saved regs
+ * also tramples on RDI,RSI.  This isn't strictly true, see for example
+ * TDH.EXPORT.MEM.
  */
-.macro TDX_MODULE_CALL host:req ret:req
+.macro TDX_MODULE_CALL host:req ret:req saved:req
 	FRAME_BEGIN
 
 	/* Move Leaf ID to RAX */
@@ -44,6 +52,37 @@ 
 	movq	TDX_MODULE_rdx(%rsi), %rdx
 	movq	TDX_MODULE_r8(%rsi), %r8
 	movq	TDX_MODULE_r9(%rsi), %r9
+	movq	TDX_MODULE_r10(%rsi), %r10
+	movq	TDX_MODULE_r11(%rsi), %r11
+
+	.if \saved
+	/*
+	 * Move additional input regs from the structure.  For simplicity
+	 * assume that anything needs the callee-saved regs also tramples
+	 * on %rdi/%rsi (see VP.ENTER).
+	 */
+	/* Save those callee-saved GPRs as mandated by the x86_64 ABI */
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+	movq	TDX_MODULE_r12(%rsi), %r12
+	movq	TDX_MODULE_r13(%rsi), %r13
+	movq	TDX_MODULE_r14(%rsi), %r14
+	movq	TDX_MODULE_r15(%rsi), %r15
+	movq	TDX_MODULE_rbx(%rsi), %rbx
+
+	.if \ret
+	/* Save the structure pointer as %rsi is about to be clobbered */
+	pushq	%rsi
+	.endif
+
+	movq	TDX_MODULE_rdi(%rsi), %rdi
+	/* %rsi needs to be done at last */
+	movq	TDX_MODULE_rsi(%rsi), %rsi
+	.endif	/* \saved */
 
 	.if \host
 	seamcall
@@ -65,6 +104,37 @@ 
 	.endif
 
 	.if \ret
+	.if \saved
+	/*
+	 * Restore the structure from stack to saved the output registers
+	 *
+	 * In case of VP.ENTER returns due to TDVMCALL, all registers are
+	 * valid thus no register can be used as spare to restore the
+	 * structure from the stack (see "TDH.VP.ENTER Output Operands
+	 * Definition on TDCALL(TDG.VP.VMCALL) Following a TD Entry").
+	 * For this case, need to make one register as spare by saving it
+	 * to the stack and then manually load the structure pointer to
+	 * the spare register.
+	 *
+	 * Note for other TDCALLs/SEAMCALLs there are spare registers
+	 * thus no need for such hack but just use this for all for now.
+	 */
+	pushq	%rax		/* save the TDCALL/SEAMCALL return code */
+	movq	8(%rsp), %rax	/* restore the structure pointer */
+	movq	%rsi, TDX_MODULE_rsi(%rax)	/* save %rsi */
+	movq	%rax, %rsi	/* use %rsi as structure pointer */
+	popq	%rax		/* restore the return code */
+	popq	%rsi		/* pop the structure pointer */
+
+	/* Copy additional output regs to the structure  */
+	movq %r12, TDX_MODULE_r12(%rsi)
+	movq %r13, TDX_MODULE_r13(%rsi)
+	movq %r14, TDX_MODULE_r14(%rsi)
+	movq %r15, TDX_MODULE_r15(%rsi)
+	movq %rbx, TDX_MODULE_rbx(%rsi)
+	movq %rdi, TDX_MODULE_rdi(%rsi)
+	.endif	/* \saved */
+
 	/* Copy output regs to the structure */
 	movq %rcx, TDX_MODULE_rcx(%rsi)
 	movq %rdx, TDX_MODULE_rdx(%rsi)
@@ -72,7 +142,46 @@ 
 	movq %r9,  TDX_MODULE_r9(%rsi)
 	movq %r10, TDX_MODULE_r10(%rsi)
 	movq %r11, TDX_MODULE_r11(%rsi)
-	.endif
+	.endif	/* \ret */
+
+	.if \saved
+	.if \ret && \host
+	/*
+	 * Clear registers shared by guest for VP.ENTER to prevent
+	 * speculative use of guest's values, including those are
+	 * restored from the stack.
+	 *
+	 * See arch/x86/kvm/vmx/vmenter.S:
+	 *
+	 * In theory, a L1 cache miss when restoring register from stack
+	 * could lead to speculative execution with guest's values.
+	 *
+	 * Note: RBP/RSP are not used as shared register.  RSI has been
+	 * restored already.
+	 *
+	 * XOR is cheap, thus unconditionally do for all leafs.
+	 */
+	xorq %rcx, %rcx
+	xorq %rdx, %rdx
+	xorq %r8,  %r8
+	xorq %r9,  %r9
+	xorq %r10, %r10
+	xorq %r11, %r11
+	xorq %r12, %r12
+	xorq %r13, %r13
+	xorq %r14, %r14
+	xorq %r15, %r15
+	xorq %rbx, %rbx
+	xorq %rdi, %rdi
+	.endif	/* \ret && \host */
+
+	/* Restore callee-saved GPRs as mandated by the x86_64 ABI */
+	popq	%r15
+	popq	%r14
+	popq	%r13
+	popq	%r12
+	popq	%rbx
+	.endif	/* \saved */
 
 	FRAME_END
 	RET