
[v9,05/18] x86/virt/tdx: Add SEAMCALL infrastructure

Message ID: dd18d6b42768e0107d212fdebedae92cfd72cfe1.1676286526.git.kai.huang@intel.com
Series: TDX host kernel support

Commit Message

Huang, Kai Feb. 13, 2023, 11:59 a.m. UTC
TDX introduces a new CPU mode: Secure Arbitration Mode (SEAM).  This
mode runs only the TDX module itself or other code to load the TDX
module.

The host kernel communicates with SEAM software via a new SEAMCALL
instruction.  This is conceptually similar to a guest->host hypercall,
except it is made from the host to SEAM software instead.  The TDX
module establishes a new SEAMCALL ABI which allows the host to
initialize the module and to manage VMs.

Add infrastructure to make SEAMCALLs.  The SEAMCALL ABI is very similar
to the TDCALL ABI and leverages much of the TDCALL infrastructure.

SEAMCALL instruction causes #GP when TDX isn't BIOS enabled, and #UD
when CPU is not in VMX operation.  The current TDX_MODULE_CALL macro
doesn't handle any of them.  There's no way to check whether the CPU is
in VMX operation or not.

Initializing the TDX module is done at runtime on demand, and it depends
on the caller to ensure CPU is in VMX operation before making SEAMCALL.
To avoid an Oops when the caller mistakenly tries to initialize the
TDX module while the CPU is not in VMX operation, extend the TDX_MODULE_CALL
macro to handle #UD (and opportunistically #GP, since they share the same
assembly).

Introduce two new TDX error codes for #UD and #GP respectively so the
caller can distinguish them.  Also, opportunistically guard the new TDX
error codes and the existing TDX_SEAMCALL_VMFAILINVALID with the
CONFIG_INTEL_TDX_HOST Kconfig option, as they are only used when it is on.
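
(Not part of the patch, just for illustration: given the definitions in
<asm/tdx.h>, a caller could distinguish the codes like this.)

	u64 sret = __seamcall(fn, rcx, rdx, r8, r9, out);

	if (sret == TDX_SEAMCALL_UD) {
		/* CPU is not in VMX operation (VMXON not done) */
	} else if (sret == TDX_SEAMCALL_GP) {
		/* TDX is not enabled by the BIOS */
	} else if (sret == TDX_SEAMCALL_VMFAILINVALID) {
		/* The SEAMCALL itself failed: TDX module not loaded */
	}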

Any failure during the module initialization is not recoverable for now.
When a SEAMCALL fails, print out an error message based on the error code
to help the user understand what went wrong.

Signed-off-by: Kai Huang <kai.huang@intel.com>
---

v8 -> v9:
 - Changed patch title (Dave).
 - Enhanced seamcall() to include the cpu id in the error message when
   SEAMCALL fails.

v7 -> v8:
 - Improved changelog (Dave):
   - Trim down some sentences (Dave).
   - Removed __seamcall() and seamcall() function name and changed
     accordingly (Dave).
   - Improved the sentence explaining why to handle #GP (Dave).
 - Added code to print out error message in seamcall(), following
   the idea that tdx_enable() returns a universal error and prints out
   error messages to make clear what's going wrong (Dave).  Also mention
   this in the changelog.

v6 -> v7:
 - No change.

v5 -> v6:
 - Added code to handle #UD and #GP (Dave).
 - Moved the seamcall() wrapper function to this patch, and used a
   temporary __always_unused to avoid compile warning (Dave).

v3 -> v5 (no feedback on v4):
 - Explicitly state that TDX_SEAMCALL_VMFAILINVALID is returned if the
   SEAMCALL itself fails.
 - Improve the changelog.

---
 arch/x86/include/asm/tdx.h       |  9 +++++
 arch/x86/virt/vmx/tdx/Makefile   |  2 +-
 arch/x86/virt/vmx/tdx/seamcall.S | 52 +++++++++++++++++++++++++++
 arch/x86/virt/vmx/tdx/tdx.c      | 60 ++++++++++++++++++++++++++++++++
 arch/x86/virt/vmx/tdx/tdx.h      |  5 +++
 arch/x86/virt/vmx/tdx/tdxcall.S  | 19 ++++++++--
 6 files changed, 144 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/virt/vmx/tdx/seamcall.S

Comments

Dave Hansen Feb. 13, 2023, 5:48 p.m. UTC | #1
On 2/13/23 03:59, Kai Huang wrote:
> diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
> index 4a3ee64c1ca7..5c5ecfddb15b 100644
> --- a/arch/x86/include/asm/tdx.h
> +++ b/arch/x86/include/asm/tdx.h
> @@ -8,6 +8,10 @@
>  #include <asm/ptrace.h>
>  #include <asm/shared/tdx.h>
>  
> +#ifdef CONFIG_INTEL_TDX_HOST
...
> +#define TDX_SEAMCALL_GP			(TDX_SW_ERROR | X86_TRAP_GP)
> +#define TDX_SEAMCALL_UD			(TDX_SW_ERROR | X86_TRAP_UD)
> +
> +#endif

All these kinds of header #ifdefs make it harder to write code in
.c files without matching #ifdefs.  Think of code like this completely
made-up example:

	if (!tdx_enable()) {
		// Success!  Make a seamcall:
		int something = tdx_seamcall();
		if (something == TDX_SEAMCALL_UD)
			// oh no!
	}

tdx_enable() can never return 0 if CONFIG_INTEL_TDX_HOST=n, so the
entire if() block is optimized away by the compiler.  *BUT*, if you've
#ifdef'd away TDX_SEAMCALL_UD, you'll get a compile error.  People
usually fix the compile error like this:

	if (!tdx_enable()) {
#ifdef CONFIG_INTEL_TDX_HOST
		// Success!  Make a seamcall:
		int something = tdx_seamcall();
		if (something == TDX_SEAMCALL_UD)
			// oh no!
#endif
	}

Which isn't great.

Defining things unconditionally in header files is *FINE*, as long as
the #ifdefs are there somewhere to make the code go away at compile time.

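For example, the usual kernel pattern (a sketch, not this series' exact
code) keeps the constants visible unconditionally and compiles the *callers*
away via a stub:

	/* Header: error codes always visible, no #ifdef needed. */
	#define TDX_SEAMCALL_UD		(TDX_SW_ERROR | X86_TRAP_UD)

	/* Entry point stubbed out when the feature is compiled out. */
	#ifdef CONFIG_INTEL_TDX_HOST
	int tdx_enable(void);
	#else
	static inline int tdx_enable(void) { return -ENODEV; }
	#endif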
Please post an updated (and tested) patch as a reply to this.
Huang, Kai Feb. 13, 2023, 9:21 p.m. UTC | #2
> On 2/13/23 03:59, Kai Huang wrote:
> > diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
> > index 4a3ee64c1ca7..5c5ecfddb15b 100644
> > --- a/arch/x86/include/asm/tdx.h
> > +++ b/arch/x86/include/asm/tdx.h
> > @@ -8,6 +8,10 @@
> >  #include <asm/ptrace.h>
> >  #include <asm/shared/tdx.h>
> >
> > +#ifdef CONFIG_INTEL_TDX_HOST
> ...
> > +#define TDX_SEAMCALL_GP			(TDX_SW_ERROR | X86_TRAP_GP)
> > +#define TDX_SEAMCALL_UD			(TDX_SW_ERROR | X86_TRAP_UD)
> > +
> > +#endif
> 
> All these kinds of header #ifdefs make it harder to write code in .c files
> without matching #ifdefs.  Think of code like this completely made-up example:
> 
> 	if (!tdx_enable()) {
> 		// Success!  Make a seamcall:
> 		int something = tdx_seamcall();
> 		if (something == TDX_SEAMCALL_UD)
> 			// oh no!
> 	}
> 
> tdx_enable() can never return 0 if CONFIG_INTEL_TDX_HOST=n, so the entire if()
> block is optimized away by the compiler.  *BUT*, if you've #ifdef'd away
> TDX_SEAMCALL_UD, you'll get a compile error.  People usually fix the compile
> error like this:
> 
> 	if (!tdx_enable()) {
> #ifdef CONFIG_INTEL_TDX_HOST
> 		// Success!  Make a seamcall:
> 		int something = tdx_seamcall();
> 		if (something == TDX_SEAMCALL_UD)
> 			// oh no!
> #endif
> 	}
> 
> Which isn't great.
> 
> Defining things unconditionally in header files is *FINE*, as long as the #ifdefs
> are there somewhere to make the code go away at compile time.

Thanks for the explanation above!

> 
> Please post an updated (and tested) patch as a reply to this.

Will do.
Dave Hansen Feb. 13, 2023, 10:39 p.m. UTC | #3
On 2/13/23 03:59, Kai Huang wrote:
> SEAMCALL instruction causes #GP when TDX isn't BIOS enabled, and #UD
> when CPU is not in VMX operation.  The current TDX_MODULE_CALL macro
> doesn't handle any of them.  There's no way to check whether the CPU is
> in VMX operation or not.

Really?  ... *REALLY*?

Like, there's no possible way for the kernel to record whether it has
executed VMXON or not?

I think what you're saying here is that there's no architecturally
visible flag that tells you whether you're at spot #1 or #2 in the following code:

static int kvm_cpu_vmxon(u64 vmxon_pointer)
{
        u64 msr;

        cr4_set_bits(X86_CR4_VMXE);
// spot #1
        asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
                          _ASM_EXTABLE(1b, %l[fault])
                          : : [vmxon_pointer] "m"(vmxon_pointer)
                          : : fault);
// spot #2

That's _maybe_ technically correct (I don't know enough about VMX
enabling to tell you).  But, what I *DO* know is that it's nonsense to
say that it's impossible in the *kernel* to tell whether we're on a CPU
that's successfully executed VMXON or not.

kvm_cpu_vmxon() has two paths through it:

  1. Successfully executes VMXON and leaves with X86_CR4_VMXE=1
  2. Fails VMXON and leaves with X86_CR4_VMXE=0

Guess what?  CR4 is rather architecturally visible.  From what I can
tell, it's *ENTIRELY* plausible to assume that X86_CR4_VMXE==1 means
that VMXON has been done.  Even if that's wrong, it's only a cpumask and
a cpumask_set() away from becoming plausible.  Like so:

static int kvm_cpu_vmxon(u64 vmxon_pointer)
{
        u64 msr;

        cr4_set_bits(X86_CR4_VMXE);

        asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
                          _ASM_EXTABLE(1b, %l[fault])
                          : : [vmxon_pointer] "m"(vmxon_pointer)
                          : : fault);
	// set cpumask bit here
        return 0;

fault:
	// clear cpu bit here
        cr4_clear_bits(X86_CR4_VMXE);

        return -EFAULT;
}
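
(Filling in those placeholders -- a sketch with an invented mask name:)

	/* Invented per-CPU VMXON tracking mask, not in the kernel today. */
	static struct cpumask vmxon_done_cpus;

	/* set cpumask bit here: */
	cpumask_set_cpu(smp_processor_id(), &vmxon_done_cpus);

	/* clear cpu bit here: */
	cpumask_clear_cpu(smp_processor_id(), &vmxon_done_cpus);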

How many design decisions down the line in this series were predicated
on the idea that:

	There's no way to check whether the CPU is
	in VMX operation or not.

?
Huang, Kai Feb. 13, 2023, 11:22 p.m. UTC | #4
On Mon, 2023-02-13 at 14:39 -0800, Dave Hansen wrote:
> On 2/13/23 03:59, Kai Huang wrote:
> > SEAMCALL instruction causes #GP when TDX isn't BIOS enabled, and #UD
> > when CPU is not in VMX operation.  The current TDX_MODULE_CALL macro
> > doesn't handle any of them.  There's no way to check whether the CPU is
> > in VMX operation or not.
> 
> Really?  ... *REALLY*?
> 
> Like, there's no possible way for the kernel to record whether it has
> executed VMXON or not?
> 
> I think what you're saying here is that there's no architecturally
> visible flag that tells you whether you're at spot #1 or #2 in the following code:
> 
> static int kvm_cpu_vmxon(u64 vmxon_pointer)
> {
>         u64 msr;
> 
>         cr4_set_bits(X86_CR4_VMXE);
> // spot #1
>         asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
>                           _ASM_EXTABLE(1b, %l[fault])
>                           : : [vmxon_pointer] "m"(vmxon_pointer)
>                           : : fault);
> // spot #2
> 

Yes, I was talking about an architectural flag rather than a kernel-defined
software tracking mechanism.

> That's _maybe_ technically correct (I don't know enough about VMX
> enabling to tell you).  But, what I *DO* know is that it's nonsense to
> say that it's impossible in the *kernel* to tell whether we're on a CPU
> that's successfully executed VMXON or not.
> 
> kvm_cpu_vmxon() has two paths through it:
> 
>   1. Successfully executes VMXON and leaves with X86_CR4_VMXE=1
>   2. Fails VMXON and leaves with X86_CR4_VMXE=0
> 
> Guess what?  CR4 is rather architecturally visible.  From what I can
> tell, it's *ENTIRELY* plausible to assume that X86_CR4_VMXE==1 means
> that VMXON has been done.  
> 

Yes, the CR4.VMXE bit can be used to check.  This is what KVM does.

Architecturally, the CR4.VMXE bit only indicates whether VMX is enabled, not
whether VMXON has been done, but in the current kernel implementation they
are always done together.

So checking CR4 is fine.

> Even if that's wrong, it's only a cpumask and
> a cpumask_set() away from becoming plausible.  Like so:
> 
> static int kvm_cpu_vmxon(u64 vmxon_pointer)
> {
>         u64 msr;
> 
>         cr4_set_bits(X86_CR4_VMXE);
> 
>         asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
>                           _ASM_EXTABLE(1b, %l[fault])
>                           : : [vmxon_pointer] "m"(vmxon_pointer)
>                           : : fault);
> 	// set cpumask bit here
>         return 0;
> 
> fault:
> 	// clear cpu bit here
>         cr4_clear_bits(X86_CR4_VMXE);
> 
>         return -EFAULT;
> }
> 
> How many design decisions down the line in this series were predicated
> on the idea that:
> 
> 	There's no way to check whether the CPU is
> 	in VMX operation or not.
> 
> ?

Only the assembly code to handle TDX_SEAMCALL_UD in this patch is.  

Whether we have a definitive way to _check_ whether VMXON has been done
doesn't matter.  What impacts the design decisions is that the (non-KVM)
kernel doesn't support doing VMXON, so we depend on KVM to do it (which is
also a design decision).

We can remove the assembly code which returns TDX_SEAMCALL_{UD|GP} and
replace it with a check like the one below in seamcall():

	static int seamcall(...)
	{
		cpu = get_cpu();

		if (!(cr4_read_shadow() & X86_CR4_VMXE)) {
			WARN_ONCE(1, "VMXON isn't done for cpu ...\n");
			ret = -EINVAL;
			goto out;
		}

		...

	out:
		put_cpu();
		return ret;
	}

But this was actually discussed in v5, in which IIUC you preferred having
the assembly code return the additional TDX_SEAMCALL_UD rather than having
the above CR4 check:

https://lore.kernel.org/all/77c90075-79d4-7cc7-f266-1b67e586513b@intel.com/
Huang, Kai Feb. 14, 2023, 8:57 a.m. UTC | #5
On Mon, 2023-02-13 at 23:22 +0000, Huang, Kai wrote:
> On Mon, 2023-02-13 at 14:39 -0800, Dave Hansen wrote:
> > On 2/13/23 03:59, Kai Huang wrote:
> > > SEAMCALL instruction causes #GP when TDX isn't BIOS enabled, and #UD
> > > when CPU is not in VMX operation.  The current TDX_MODULE_CALL macro
> > > doesn't handle any of them.  There's no way to check whether the CPU is
> > > in VMX operation or not.
> > 
> > Really?  ... *REALLY*?
> > 
> > Like, there's no possible way for the kernel to record whether it has
> > executed VMXON or not?
> > 
> > I think what you're saying here is that there's no architecturally
> > visible flag that tells you whether you're at spot #1 or #2 in the following code:
> > 
> > static int kvm_cpu_vmxon(u64 vmxon_pointer)
> > {
> >         u64 msr;
> > 
> >         cr4_set_bits(X86_CR4_VMXE);
> > // spot #1
> >         asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
> >                           _ASM_EXTABLE(1b, %l[fault])
> >                           : : [vmxon_pointer] "m"(vmxon_pointer)
> >                           : : fault);
> > // spot #2
> > 
> 
> Yes, I was talking about an architectural flag rather than a kernel-defined
> software tracking mechanism.
> 
> > That's _maybe_ technically correct (I don't know enough about VMX
> > enabling to tell you).  But, what I *DO* know is that it's nonsense to
> > say that it's impossible in the *kernel* to tell whether we're on a CPU
> > that's successfully executed VMXON or not.
> > 
> > kvm_cpu_vmxon() has two paths through it:
> > 
> >   1. Successfully executes VMXON and leaves with X86_CR4_VMXE=1
> >   2. Fails VMXON and leaves with X86_CR4_VMXE=0
> > 
> > Guess what?  CR4 is rather architecturally visible.  From what I can
> > tell, it's *ENTIRELY* plausible to assume that X86_CR4_VMXE==1 means
> > that VMXON has been done.  
> > 
> 
> Yes, the CR4.VMXE bit can be used to check.  This is what KVM does.
> 
> Architecturally, the CR4.VMXE bit only indicates whether VMX is enabled, not
> whether VMXON has been done, but in the current kernel implementation they
> are always done together.
> 
> So checking CR4 is fine.
> 
> > Even if that's wrong, it's only a cpumask and
> > a cpumask_set() away from becoming plausible.  Like so:
> > 
> > static int kvm_cpu_vmxon(u64 vmxon_pointer)
> > {
> >         u64 msr;
> > 
> >         cr4_set_bits(X86_CR4_VMXE);
> > 
> >         asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
> >                           _ASM_EXTABLE(1b, %l[fault])
> >                           : : [vmxon_pointer] "m"(vmxon_pointer)
> >                           : : fault);
> > 	// set cpumask bit here
> >         return 0;
> > 
> > fault:
> > 	// clear cpu bit here
> >         cr4_clear_bits(X86_CR4_VMXE);
> > 
> >         return -EFAULT;
> > }
> > 
> > How many design decisions down the line in this series were predicated
> > on the idea that:
> > 
> > 	There's no way to check whether the CPU is
> > 	in VMX operation or not.
> > 
> > ?
> 
> Only the assembly code to handle TDX_SEAMCALL_UD in this patch is.  
> 
> Whether we have a definitive way to _check_ whether VMXON has been done
> doesn't matter.  What impacts the design decisions is that the (non-KVM)
> kernel doesn't support doing VMXON, so we depend on KVM to do it (which is
> also a design decision).
> 
> We can remove the assembly code which returns TDX_SEAMCALL_{UD|GP} and
> replace it with a check like the one below in seamcall():
> 
> 	static int seamcall(...)
> 	{
> 		cpu = get_cpu();
> 
> 		if (!(cr4_read_shadow() & X86_CR4_VMXE)) {
> 			WARN_ONCE(1, "VMXON isn't done for cpu ...\n");
> 			ret = -EINVAL;
> 			goto out;
> 		}
> 	
> 		...
> 
> 	out:
> 		put_cpu();
> 		return ret;
> 	}
> 
> But this was actually discussed in v5, in which IIUC you preferred having
> the assembly code return the additional TDX_SEAMCALL_UD rather than having
> the above CR4 check:
> 
> https://lore.kernel.org/all/77c90075-79d4-7cc7-f266-1b67e586513b@intel.com/
> 
> 

Hmm, I replied too quickly.  If we need to consider other non-KVM kernel
components mistakenly calling tdx_enable() without doing VMXON on all online
cpus first, there's one issue with using CR4.VMXE to check whether VMXON has
been done (or even whether VMX has been enabled) in my pseudo seamcall()
implementation above.

The problem is that the above seamcall() code isn't IRQ-safe between the
CR4.VMXE check and the actual SEAMCALL.

KVM does VMXON/VMXOFF for all online cpus via IPI:

	// when the first VM is created
	on_each_cpu(hardware_enable, NULL, 1);

	// when the last VM is destroyed
	on_each_cpu(hardware_disable, NULL, 1);

Consider this case:

	1) KVM does VMXON for all online cpus (a VM created)
	2) Another kernel component is calling tdx_enable()
	3) KVM does VMXOFF for all online cpus (last VM is destroyed)

When 2) and 3) happen in parallel on different cpus, the race below can happen:

	CPU 0 (CR4.VMXE enabled)		CPU 1 (CR4.VMXE enabled)

	non-KVM thread calling seamcall()	KVM thread doing VMXOFF via IPI

	Check CR4.VMXE	<- pass			
						on_each_cpu(hardware_disable)

						send IPI to CPU 0 to do VMXOFF
						<-------
		// Interrupted
		// IRQ handler to do VMXOFF
		
		VMXOFF 
		clear CR4.VMXE

		// IRQ done.
		// Resume to seamcall()
	
	SEAMCALL <-- #UD
	
So we do need to handle #UD in the assembly if we want tdx_enable() to be
safe in general (i.e., it doesn't cause an Oops even when mistakenly used
outside of KVM).

However, in the TDX CPU online callback, checking CR4.VMXE to know whether
VMXON has been done is fine, since KVM will never send an IPI to those
"to-be-online" cpus.
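
(For completeness, a minimal sketch of the "disable interrupts around the
check" alternative: keeping IRQs off across the CR4.VMXE check and the
SEAMCALL closes the race window against a concurrent VMXOFF IPI.)

	static int seamcall(...)
	{
		unsigned long flags;
		int ret = 0;

		/* No VMXOFF IPI can land while IRQs are off */
		local_irq_save(flags);

		if (!(cr4_read_shadow() & X86_CR4_VMXE)) {
			ret = -EINVAL;
			goto out;
		}

		/* ... __seamcall() and error handling ... */
	out:
		local_irq_restore(flags);
		return ret;
	}
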
Peter Zijlstra Feb. 14, 2023, 12:42 p.m. UTC | #6
On Tue, Feb 14, 2023 at 12:59:12AM +1300, Kai Huang wrote:
> +/*
> + * Wrapper of __seamcall() to convert SEAMCALL leaf function error code
> + * to kernel error code.  @seamcall_ret and @out contain the SEAMCALL
> + * leaf function return code and the additional output respectively if
> + * not NULL.
> + */
> +static int __always_unused seamcall(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
> +				    u64 *seamcall_ret,
> +				    struct tdx_module_output *out)
> +{
> +	int cpu, ret = 0;
> +	u64 sret;
> +
> +	/* Need a stable CPU id for printing error message */
> +	cpu = get_cpu();
> +
> +	sret = __seamcall(fn, rcx, rdx, r8, r9, out);
> +
> +	/* Save SEAMCALL return code if the caller wants it */
> +	if (seamcall_ret)
> +		*seamcall_ret = sret;
> +
> +	/* SEAMCALL was successful */
> +	if (!sret)
> +		goto out;

I'm thinking you want if (likely(!sret)), here. That whole switch thing
should end up in cold storage.
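
(I.e., a sketch of the suggested change:)

	/* SEAMCALL was successful -- keep the common case on the hot path */
	if (likely(!sret))
		goto out;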

> +
> +	switch (sret) {
> +	case TDX_SEAMCALL_GP:
> +		/*
> +		 * tdx_enable() has already checked that BIOS has
> +		 * enabled TDX at the very beginning before going
> +		 * forward.  It's likely a firmware bug if the
> +		 * SEAMCALL still caused #GP.
> +		 */
> +		pr_err_once("[firmware bug]: TDX is not enabled by BIOS.\n");
> +		ret = -ENODEV;
> +		break;
> +	case TDX_SEAMCALL_VMFAILINVALID:
> +		pr_err_once("TDX module is not loaded.\n");
> +		ret = -ENODEV;
> +		break;
> +	case TDX_SEAMCALL_UD:
> +		pr_err_once("SEAMCALL failed: CPU %d is not in VMX operation.\n",
> +				cpu);
> +		ret = -EINVAL;
> +		break;
> +	default:
> +		pr_err_once("SEAMCALL failed: CPU %d: leaf %llu, error 0x%llx.\n",
> +				cpu, fn, sret);
> +		if (out)
> +			pr_err_once("additional output: rcx 0x%llx, rdx 0x%llx, r8 0x%llx, r9 0x%llx, r10 0x%llx, r11 0x%llx.\n",
> +					out->rcx, out->rdx, out->r8,
> +					out->r9, out->r10, out->r11);
> +		ret = -EIO;
> +	}
> +out:
> +	put_cpu();
> +	return ret;
> +}
Dave Hansen Feb. 14, 2023, 5:27 p.m. UTC | #7
On 2/14/23 00:57, Huang, Kai wrote:
> Consider this case:
> 
>         1) KVM does VMXON for all online cpus (a VM created)
>         2) Another kernel component is calling tdx_enable()
>         3) KVM does VMXOFF for all online cpus (last VM is destroyed)

Doctor, it hurts when I...

Then let's just call tdx_enable() from other kernel components.

Kai, I'm worried that this is, again, making things more complicated
than they have to be.
Huang, Kai Feb. 14, 2023, 9:02 p.m. UTC | #8
On Tue, 2023-02-14 at 13:42 +0100, Peter Zijlstra wrote:
> On Tue, Feb 14, 2023 at 12:59:12AM +1300, Kai Huang wrote:
> > +/*
> > + * Wrapper of __seamcall() to convert SEAMCALL leaf function error code
> > + * to kernel error code.  @seamcall_ret and @out contain the SEAMCALL
> > + * leaf function return code and the additional output respectively if
> > + * not NULL.
> > + */
> > +static int __always_unused seamcall(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
> > +				    u64 *seamcall_ret,
> > +				    struct tdx_module_output *out)
> > +{
> > +	int cpu, ret = 0;
> > +	u64 sret;
> > +
> > +	/* Need a stable CPU id for printing error message */
> > +	cpu = get_cpu();
> > +
> > +	sret = __seamcall(fn, rcx, rdx, r8, r9, out);
> > +
> > +	/* Save SEAMCALL return code if the caller wants it */
> > +	if (seamcall_ret)
> > +		*seamcall_ret = sret;
> > +
> > +	/* SEAMCALL was successful */
> > +	if (!sret)
> > +		goto out;
> 
> I'm thinking you want if (likely(!sret)), here. That whole switch thing
> should end up in cold storage.
> 

Thanks Peter.  Will do.

> > +
> > +	switch (sret) {
> > +	case TDX_SEAMCALL_GP:
> > +		/*
> > +		 * tdx_enable() has already checked that BIOS has
> > +		 * enabled TDX at the very beginning before going
> > +		 * forward.  It's likely a firmware bug if the
> > +		 * SEAMCALL still caused #GP.
> > +		 */
> > +		pr_err_once("[firmware bug]: TDX is not enabled by BIOS.\n");
> > +		ret = -ENODEV;
> > +		break;
> > +	case TDX_SEAMCALL_VMFAILINVALID:
> > +		pr_err_once("TDX module is not loaded.\n");
> > +		ret = -ENODEV;
> > +		break;
> > +	case TDX_SEAMCALL_UD:
> > +		pr_err_once("SEAMCALL failed: CPU %d is not in VMX operation.\n",
> > +				cpu);
> > +		ret = -EINVAL;
> > +		break;
> > +	default:
> > +		pr_err_once("SEAMCALL failed: CPU %d: leaf %llu, error 0x%llx.\n",
> > +				cpu, fn, sret);
> > +		if (out)
> > +			pr_err_once("additional output: rcx 0x%llx, rdx 0x%llx, r8 0x%llx, r9 0x%llx, r10 0x%llx, r11 0x%llx.\n",
> > +					out->rcx, out->rdx, out->r8,
> > +					out->r9, out->r10, out->r11);
> > +		ret = -EIO;
> > +	}
> > +out:
> > +	put_cpu();
> > +	return ret;
> > +}
>
Huang, Kai Feb. 14, 2023, 10:17 p.m. UTC | #9
On Tue, 2023-02-14 at 09:27 -0800, Dave Hansen wrote:
> On 2/14/23 00:57, Huang, Kai wrote:
> > Consider this case:
> > 
> >         1) KVM does VMXON for all online cpus (a VM created)
> >         2) Another kernel component is calling tdx_enable()
> >         3) KVM does VMXOFF for all online cpus (last VM is destroyed)
> 
> Doctor, it hurts when I...
> 
> Then let's just call tdx_enable() from other kernel components.
> 
> Kai, I'm worried that this is, again, making things more complicated
> than they have to be.

The handling of #UD/#GP itself only takes ~10 LoC.  All the complicated
logic comes from the fact that we depend on the caller of TDX to ensure
VMXON has been done.

AFAICT we have the options below:

1) Don't support VMXON in the core-kernel, then
  1.a Handle #UD/#GP in assembly as shown in this patch; or
  1.b Disable interrupts from the CR4.VMXE check until the SEAMCALL is done
      in seamcall().

2) Support VMXON in the core-kernel (by moving VMXON from KVM to core-x86),
then we get rid of all of the above.  We explicitly do VMXON (if it hasn't
been done) inside tdx_enable() to make sure SEAMCALL doesn't cause #UD.  No
#UD/#GP handling is needed in assembly.  No interrupt disabling is needed in
seamcall().

(Well, #GP can theoretically still happen if the BIOS is buggy; we can keep
the assembly code change if that's better -- it's just ~10 LoC.)

Supporting VMXON in the core-kernel also has other advantages:

1) We can get rid of the logic that always tries to do LP.INIT for all
online cpus.  LP.INIT can just be done: a) during module initialization;
b) in the TDX CPU hotplug callback.

2) The TDX CPU hotplug callback can just do VMXON and LP.INIT.  No CR4.VMXE
check is needed.  And it can be put before the hotplug callback of KVM (and
any other TDX users).  A sketch of such a callback follows below.
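
(A sketch, with invented names: tdx_cpu_vmxon() is a hypothetical core-x86
helper, and TDH_SYS_LP_INIT stands for the TDH.SYS.LP.INIT leaf number.)

	static int tdx_online_cpu(unsigned int cpu)
	{
		int ret;

		/* Hypothetical helper doing VMXON on this cpu */
		ret = tdx_cpu_vmxon();
		if (ret)
			return ret;

		/* Per-cpu TDX module initialization (TDH.SYS.LP.INIT) */
		return seamcall(TDH_SYS_LP_INIT, 0, 0, 0, 0, NULL, NULL);
	}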

The downsides of supporting VMXON in the core-kernel:

1) Need patch(es) to change KVM, so those patches need to be reviewed by KVM
maintainers.
2) No other cons.

Logically, supporting VMXON in the core-kernel makes things simple.  And in
the long term, I _think_ we will need it to support future TDX features.

The effort to support VMXON in the core-kernel would be ~300 LoC.  I can
already utilize some old patches, but I need to polish them and do some
testing.

What's your thinking?

Patch

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 4a3ee64c1ca7..5c5ecfddb15b 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -8,6 +8,10 @@ 
 #include <asm/ptrace.h>
 #include <asm/shared/tdx.h>
 
+#ifdef CONFIG_INTEL_TDX_HOST
+
+#include <asm/trapnr.h>
+
 /*
  * SW-defined error codes.
  *
@@ -18,6 +22,11 @@ 
 #define TDX_SW_ERROR			(TDX_ERROR | GENMASK_ULL(47, 40))
 #define TDX_SEAMCALL_VMFAILINVALID	(TDX_SW_ERROR | _UL(0xFFFF0000))
 
+#define TDX_SEAMCALL_GP			(TDX_SW_ERROR | X86_TRAP_GP)
+#define TDX_SEAMCALL_UD			(TDX_SW_ERROR | X86_TRAP_UD)
+
+#endif
+
 #ifndef __ASSEMBLY__
 
 /* TDX supported page sizes from the TDX module ABI. */
diff --git a/arch/x86/virt/vmx/tdx/Makefile b/arch/x86/virt/vmx/tdx/Makefile
index 93ca8b73e1f1..38d534f2c113 100644
--- a/arch/x86/virt/vmx/tdx/Makefile
+++ b/arch/x86/virt/vmx/tdx/Makefile
@@ -1,2 +1,2 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
-obj-y += tdx.o
+obj-y += tdx.o seamcall.o
diff --git a/arch/x86/virt/vmx/tdx/seamcall.S b/arch/x86/virt/vmx/tdx/seamcall.S
new file mode 100644
index 000000000000..f81be6b9c133
--- /dev/null
+++ b/arch/x86/virt/vmx/tdx/seamcall.S
@@ -0,0 +1,52 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#include "tdxcall.S"
+
+/*
+ * __seamcall() - Host-side interface functions to SEAM software module
+ *		  (the P-SEAMLDR or the TDX module).
+ *
+ * Transform function call register arguments into the SEAMCALL register
+ * ABI.  Return TDX_SEAMCALL_VMFAILINVALID if the SEAMCALL itself fails,
+ * or the completion status of the SEAMCALL leaf function.  Additional
+ * output operands are saved in @out (if it is provided by the caller).
+ *
+ *-------------------------------------------------------------------------
+ * SEAMCALL ABI:
+ *-------------------------------------------------------------------------
+ * Input Registers:
+ *
+ * RAX                 - SEAMCALL Leaf number.
+ * RCX,RDX,R8-R9       - SEAMCALL Leaf specific input registers.
+ *
+ * Output Registers:
+ *
+ * RAX                 - SEAMCALL completion status code.
+ * RCX,RDX,R8-R11      - SEAMCALL Leaf specific output registers.
+ *
+ *-------------------------------------------------------------------------
+ *
+ * __seamcall() function ABI:
+ *
+ * @fn  (RDI)          - SEAMCALL Leaf number, moved to RAX
+ * @rcx (RSI)          - Input parameter 1, moved to RCX
+ * @rdx (RDX)          - Input parameter 2, moved to RDX
+ * @r8  (RCX)          - Input parameter 3, moved to R8
+ * @r9  (R8)           - Input parameter 4, moved to R9
+ *
+ * @out (R9)           - struct tdx_module_output pointer
+ *			 stored temporarily in R12 (not
+ *			 used by the P-SEAMLDR or the TDX
+ *			 module). It can be NULL.
+ *
+ * Return (via RAX) the completion status of the SEAMCALL, or
+ * TDX_SEAMCALL_VMFAILINVALID.
+ */
+SYM_FUNC_START(__seamcall)
+	FRAME_BEGIN
+	TDX_MODULE_CALL host=1
+	FRAME_END
+	RET
+SYM_FUNC_END(__seamcall)
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index f5a20d56097c..5ae3d71b70b4 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -110,6 +110,66 @@  bool platform_tdx_enabled(void)
 	return !!tdx_global_keyid;
 }
 
+/*
+ * Wrapper of __seamcall() to convert SEAMCALL leaf function error code
+ * to kernel error code.  @seamcall_ret and @out contain the SEAMCALL
+ * leaf function return code and the additional output respectively if
+ * not NULL.
+ */
+static int __always_unused seamcall(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
+				    u64 *seamcall_ret,
+				    struct tdx_module_output *out)
+{
+	int cpu, ret = 0;
+	u64 sret;
+
+	/* Need a stable CPU id for printing error message */
+	cpu = get_cpu();
+
+	sret = __seamcall(fn, rcx, rdx, r8, r9, out);
+
+	/* Save SEAMCALL return code if the caller wants it */
+	if (seamcall_ret)
+		*seamcall_ret = sret;
+
+	/* SEAMCALL was successful */
+	if (!sret)
+		goto out;
+
+	switch (sret) {
+	case TDX_SEAMCALL_GP:
+		/*
+		 * tdx_enable() has already checked that BIOS has
+		 * enabled TDX at the very beginning before going
+		 * forward.  It's likely a firmware bug if the
+		 * SEAMCALL still caused #GP.
+		 */
+		pr_err_once("[firmware bug]: TDX is not enabled by BIOS.\n");
+		ret = -ENODEV;
+		break;
+	case TDX_SEAMCALL_VMFAILINVALID:
+		pr_err_once("TDX module is not loaded.\n");
+		ret = -ENODEV;
+		break;
+	case TDX_SEAMCALL_UD:
+		pr_err_once("SEAMCALL failed: CPU %d is not in VMX operation.\n",
+				cpu);
+		ret = -EINVAL;
+		break;
+	default:
+		pr_err_once("SEAMCALL failed: CPU %d: leaf %llu, error 0x%llx.\n",
+				cpu, fn, sret);
+		if (out)
+			pr_err_once("additional output: rcx 0x%llx, rdx 0x%llx, r8 0x%llx, r9 0x%llx, r10 0x%llx, r11 0x%llx.\n",
+					out->rcx, out->rdx, out->r8,
+					out->r9, out->r10, out->r11);
+		ret = -EIO;
+	}
+out:
+	put_cpu();
+	return ret;
+}
+
 static int init_tdx_module(void)
 {
 	/*
diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h
index 881cca276956..931a50f0f44c 100644
--- a/arch/x86/virt/vmx/tdx/tdx.h
+++ b/arch/x86/virt/vmx/tdx/tdx.h
@@ -2,6 +2,8 @@ 
 #ifndef _X86_VIRT_TDX_H
 #define _X86_VIRT_TDX_H
 
+#include <linux/types.h>
+
 /* Kernel defined TDX module status during module initialization. */
 enum tdx_module_status_t {
 	TDX_MODULE_UNKNOWN,
@@ -9,4 +11,7 @@  enum tdx_module_status_t {
 	TDX_MODULE_ERROR
 };
 
+struct tdx_module_output;
+u64 __seamcall(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
+	       struct tdx_module_output *out);
 #endif
diff --git a/arch/x86/virt/vmx/tdx/tdxcall.S b/arch/x86/virt/vmx/tdx/tdxcall.S
index 49a54356ae99..757b0c34be10 100644
--- a/arch/x86/virt/vmx/tdx/tdxcall.S
+++ b/arch/x86/virt/vmx/tdx/tdxcall.S
@@ -1,6 +1,7 @@ 
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <asm/asm-offsets.h>
 #include <asm/tdx.h>
+#include <asm/asm.h>
 
 /*
  * TDCALL and SEAMCALL are supported in Binutils >= 2.36.
@@ -45,6 +46,7 @@ 
 	/* Leave input param 2 in RDX */
 
 	.if \host
+1:
 	seamcall
 	/*
 	 * SEAMCALL instruction is essentially a VMExit from VMX root
@@ -57,10 +59,23 @@ 
 	 * This value will never be used as actual SEAMCALL error code as
 	 * it is from the Reserved status code class.
 	 */
-	jnc .Lno_vmfailinvalid
+	jnc .Lseamcall_out
 	mov $TDX_SEAMCALL_VMFAILINVALID, %rax
-.Lno_vmfailinvalid:
+	jmp .Lseamcall_out
+2:
+	/*
+	 * SEAMCALL caused #GP or #UD.  By reaching here %eax contains
+	 * the trap number.  Convert the trap number to the TDX error
+	 * code by setting TDX_SW_ERROR to the high 32-bits of %rax.
+	 *
+	 * Note cannot OR TDX_SW_ERROR directly to %rax as OR instruction
+	 * only accepts 32-bit immediate at most.
+	 */
+	mov $TDX_SW_ERROR, %r12
+	orq %r12, %rax
 
+	_ASM_EXTABLE_FAULT(1b, 2b)
+.Lseamcall_out:
 	.else
 	tdcall
 	.endif