diff mbox series

[v14,004/113] KVM: TDX: Initialize the TDX module when loading the KVM intel kernel module

Message ID e628e2d235d9b6c00b9bd5d81bb69136b77d13c4.1685333727.git.isaku.yamahata@intel.com (mailing list archive)
State New, archived
Headers show
Series KVM TDX basic feature support | expand

Commit Message

Isaku Yamahata May 29, 2023, 4:18 a.m. UTC
From: Isaku Yamahata <isaku.yamahata@intel.com>

TDX requires several initialization steps for KVM to create guest TDs.
Detect CPU feature, enable VMX (TDX is based on VMX) on all online CPUs,
detect the TDX module availability, initialize it and disable VMX.

To enable/disable VMX on all online CPUs, utilize
vmx_hardware_enable/disable().  The method also initializes each CPU for
TDX.  TDX requires calling a TDX initialization function per logical
processor (LP) before the LP uses TDX.  When the CPU is becoming online,
call the TDX LP initialization API.  If TDX initialization fails, refuse to
bring the CPU online, for simplicity, instead of having TDX avoid the failed LP.

There are several options on when to initialize the TDX module.  A.) kernel
module loading time, B.) the first guest TD creation time.  A.) was chosen.
With B.), a user may hit an error during TDX initialization when trying to
create the first guest TD.  A machine that fails to initialize the TDX
module can't boot any guest TD at all.  Such a failure is undesirable and a
surprise because the user expects the machine to be able to accommodate
guest TDs, but it cannot.  So A.) is better than B.).

Introduce a module parameter, kvm_intel.tdx, to explicitly enable TDX KVM
support.  It's off by default to keep the same behavior for those who don't
use TDX.  Implement hardware_setup method to detect TDX feature of CPU and
initialize TDX module.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>

---
Changes v13 -> v14:
- Use on_each_cpu(vmx_hardware_enable)
---
 arch/x86/kvm/Makefile      |  1 +
 arch/x86/kvm/vmx/main.c    | 34 ++++++++++++++++++--
 arch/x86/kvm/vmx/tdx.c     | 63 ++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/x86_ops.h |  8 +++++
 4 files changed, 104 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/kvm/vmx/tdx.c

Comments

Zhi Wang May 30, 2023, 2:35 p.m. UTC | #1
On Sun, 28 May 2023 21:18:46 -0700
isaku.yamahata@intel.com wrote:

> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> TDX requires several initialization steps for KVM to create guest TDs.
> Detect CPU feature, enable VMX (TDX is based on VMX) on all online CPUs,
> detect the TDX module availability, initialize it and disable VMX.
> 
> To enable/disable VMX on all online CPUs, utilize
> vmx_hardware_enable/disable().  The method also initializes each CPU for
> TDX.  TDX requires calling a TDX initialization function per logical
> processor (LP) before the LP uses TDX.  When the CPU is becoming online,
> call the TDX LP initialization API.  If it fails to initialize TDX, refuse
> CPU online for simplicity instead of TDX avoiding the failed LP.
> 
> There are several options on when to initialize the TDX module.  A.) kernel
> module loading time, B.) the first guest TD creation time.  A.) was chosen.
> With B.), a user may hit an error of the TDX initialization when trying to
> create the first guest TD.  The machine that fails to initialize the TDX
> module can't boot any guest TD further.  Such failure is undesirable and a
> surprise because the user expects that the machine can accommodate guest
> TD, but not.  So A.) is better than B.).
> 
> Introduce a module parameter, kvm_intel.tdx, to explicitly enable TDX KVM
> support.  It's off by default to keep the same behavior for those who don't
> use TDX.  Implement hardware_setup method to detect TDX feature of CPU and
> initialize TDX module.
> 
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> ---
> Changes v13 -> v14:
> - Use on_each_cpu(vmx_hardware_enable)
> ---
>  arch/x86/kvm/Makefile      |  1 +
>  arch/x86/kvm/vmx/main.c    | 34 ++++++++++++++++++--
>  arch/x86/kvm/vmx/tdx.c     | 63 ++++++++++++++++++++++++++++++++++++++
>  arch/x86/kvm/vmx/x86_ops.h |  8 +++++
>  4 files changed, 104 insertions(+), 2 deletions(-)
>  create mode 100644 arch/x86/kvm/vmx/tdx.c
> 
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index 0e894ae23cbc..4b01ab842ab7 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -25,6 +25,7 @@ kvm-$(CONFIG_KVM_SMM)	+= smm.o
>  kvm-intel-y		+= vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
>  			   vmx/hyperv.o vmx/nested.o vmx/posted_intr.o vmx/main.o
>  kvm-intel-$(CONFIG_X86_SGX_KVM)	+= vmx/sgx.o
> +kvm-intel-$(CONFIG_INTEL_TDX_HOST)	+= vmx/tdx.o
>  
>  kvm-amd-y		+= svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \
>  			   svm/sev.o svm/hyperv.o
> diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> index 791ee271393d..2638b344864c 100644
> --- a/arch/x86/kvm/vmx/main.c
> +++ b/arch/x86/kvm/vmx/main.c
> @@ -6,6 +6,36 @@
>  #include "nested.h"
>  #include "pmu.h"
>  
> +static bool enable_tdx __ro_after_init;
> +module_param_named(tdx, enable_tdx, bool, 0444);
> +
> +static int vt_hardware_enable(void)
> +{
> +	int ret;
> +
> +	ret = vmx_hardware_enable();
> +	if (ret || !enable_tdx)
> +		return ret;
> +
> +	ret = tdx_cpu_enable();
> +	if (ret)
> +		vmx_hardware_disable();
> +	return ret;
> +}
> +
> +static __init int vt_hardware_setup(void)
> +{
> +	int ret;
> +
> +	ret = vmx_hardware_setup();
> +	if (ret)
> +		return ret;
> +
> +	enable_tdx = enable_tdx && !tdx_hardware_setup(&vt_x86_ops);
> +
> +	return 0;
> +}
> +
>  #define VMX_REQUIRED_APICV_INHIBITS		       \
>  (						       \
>         BIT(APICV_INHIBIT_REASON_DISABLE)|	       \
> @@ -24,7 +54,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
>  
>  	.hardware_unsetup = vmx_hardware_unsetup,
>  
> -	.hardware_enable = vmx_hardware_enable,
> +	.hardware_enable = vt_hardware_enable,
>  	.hardware_disable = vmx_hardware_disable,
>  	.has_emulated_msr = vmx_has_emulated_msr,
>  
> @@ -159,7 +189,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
>  };
>  
>  struct kvm_x86_init_ops vt_init_ops __initdata = {
> -	.hardware_setup = vmx_hardware_setup,
> +	.hardware_setup = vt_hardware_setup,
>  	.handle_intel_pt_intr = NULL,
>  
>  	.runtime_ops = &vt_x86_ops,
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> new file mode 100644
> index 000000000000..965545a308ad
> --- /dev/null
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -0,0 +1,63 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/cpu.h>
> +
> +#include <asm/tdx.h>
> +
> +#include "capabilities.h"
> +#include "x86_ops.h"
> +#include "x86.h"
> +
> +#undef pr_fmt
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +static int __init tdx_module_setup(void)
> +{
> +	int ret;
> +
> +	ret = tdx_enable();
> +	if (ret) {
> +		pr_info("Failed to initialize TDX module.\n");
> +		return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +static void __init vmx_tdx_on(void *info)
> +{
> +       atomic_t *err = info;
> +       int r;
> +
> +       r = vmx_hardware_enable();
> +       if (!r)
> +	       r = tdx_cpu_enable();
> +       if (r)
> +	       atomic_set(err, r);
> +}
> +
> +static void __init vmx_off(void *unused)
> +{
> +       vmx_hardware_disable();
> +}
> +
> +int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
> +{
> +	atomic_t err = ATOMIC_INIT(0);
> +	int r = 0;
> +
> +	if (!enable_ept) {
> +		pr_warn("Cannot enable TDX with EPT disabled\n");
> +		return -EINVAL;
> +	}
> +
> +	/* tdx_enable() in tdx_module_setup() requires cpus lock. */
> +	cpus_read_lock();
> +	on_each_cpu(vmx_tdx_on, &err, true);	/* TDX requires vmxon. */
> +	r = atomic_read(&err);
> +	if (!r)
> +		r = tdx_module_setup();
> +	on_each_cpu(vmx_off, NULL, true);

Out of curiosity, why VMX has to be turned off after tdx_module_setup()?

> +	cpus_read_unlock();
> +
> +	return r;
> +}
> diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
> index 051b5c4b5c2f..f59e5197836a 100644
> --- a/arch/x86/kvm/vmx/x86_ops.h
> +++ b/arch/x86/kvm/vmx/x86_ops.h
> @@ -20,6 +20,8 @@ bool kvm_is_vmx_supported(void);
>  int __init vmx_init(void);
>  void vmx_exit(void);
>  
> +__init int vmx_hardware_setup(void);
> +
>  extern struct kvm_x86_ops vt_x86_ops __initdata;
>  extern struct kvm_x86_init_ops vt_init_ops __initdata;
>  
> @@ -133,4 +135,10 @@ void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu);
>  #endif
>  void vmx_setup_mce(struct kvm_vcpu *vcpu);
>  
> +#ifdef CONFIG_INTEL_TDX_HOST
> +int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops);
> +#else
> +static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return -ENOSYS; }
> +#endif
> +
>  #endif /* __KVM_X86_VMX_X86_OPS_H */
Sean Christopherson May 30, 2023, 5:14 p.m. UTC | #2
On Tue, May 30, 2023, Zhi Wang wrote:
> On Sun, 28 May 2023 21:18:46 -0700 isaku.yamahata@intel.com wrote:
> > +	/* tdx_enable() in tdx_module_setup() requires cpus lock. */
> > +	cpus_read_lock();
> > +	on_each_cpu(vmx_tdx_on, &err, true);	/* TDX requires vmxon. */
> > +	r = atomic_read(&err);
> > +	if (!r)
> > +		r = tdx_module_setup();
> > +	on_each_cpu(vmx_off, NULL, true);
> 
> Out of curiosity, why VMX has to be turned off after tdx_module_setup()?

KVM has historically enabled VMX if and only if KVM has active VMs.  Whether or
not it still makes sense to do dynamic enabling is debatable, but that's a
discussion for another day.
Huang, Kai June 6, 2023, 4:19 a.m. UTC | #3
On Sun, 2023-05-28 at 21:18 -0700, Yamahata, Isaku wrote:
> +static void __init vmx_tdx_on(void *info)
> +{
> +       atomic_t *err = info;
> +       int r;
> +
> +       r = vmx_hardware_enable();
> +       if (!r)
> +	       r = tdx_cpu_enable();
> +       if (r)
> +	       atomic_set(err, r);
> +}
> +
> +static void __init vmx_off(void *unused)
> +{
> +       vmx_hardware_disable();
> +}
> +
> +int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
> +{
> +	atomic_t err = ATOMIC_INIT(0);
> +	int r = 0;
> +
> +	if (!enable_ept) {
> +		pr_warn("Cannot enable TDX with EPT disabled\n");
> +		return -EINVAL;
> +	}
> +
> +	/* tdx_enable() in tdx_module_setup() requires cpus lock. */
> +	cpus_read_lock();
> +	on_each_cpu(vmx_tdx_on, &err, true);	/* TDX requires vmxon. */
> +	r = atomic_read(&err);
> +	if (!r)
> +		r = tdx_module_setup();
> +	on_each_cpu(vmx_off, NULL, true);
> +	cpus_read_unlock();
> +
> +	return r;
> +}

As we discussed in v13, this code doesn't track which CPUs have run
vmx_hardware_enable() successfully.  Thus if ...

	on_each_cpu(vmx_tdx_on, &err, true);	/* TDX requires vmxon. */

... fails on some cpu due to whatever reason, in ...
	
	on_each_cpu(vmx_off, NULL, true);

... vmx_hardware_disable() will fail to do VMXOFF for those cpus that haven't
done VMXON successfully yet, resulting in BUG_ON(!kvm_rebooting) being triggered
in kvm_spurious_fault().

We need a per-cpu flag to track whether cpu has done VMXON successfully.
Isaku Yamahata June 7, 2023, 6:06 p.m. UTC | #4
On Tue, Jun 06, 2023 at 04:19:33AM +0000,
"Huang, Kai" <kai.huang@intel.com> wrote:

> On Sun, 2023-05-28 at 21:18 -0700, Yamahata, Isaku wrote:
> > +static void __init vmx_tdx_on(void *info)
> > +{
> > +       atomic_t *err = info;
> > +       int r;
> > +
> > +       r = vmx_hardware_enable();
> > +       if (!r)
> > +	       r = tdx_cpu_enable();
> > +       if (r)
> > +	       atomic_set(err, r);
> > +}
> > +
> > +static void __init vmx_off(void *unused)
> > +{
> > +       vmx_hardware_disable();
> > +}
> > +
> > +int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
> > +{
> > +	atomic_t err = ATOMIC_INIT(0);
> > +	int r = 0;
> > +
> > +	if (!enable_ept) {
> > +		pr_warn("Cannot enable TDX with EPT disabled\n");
> > +		return -EINVAL;
> > +	}
> > +
> > +	/* tdx_enable() in tdx_module_setup() requires cpus lock. */
> > +	cpus_read_lock();
> > +	on_each_cpu(vmx_tdx_on, &err, true);	/* TDX requires vmxon. */
> > +	r = atomic_read(&err);
> > +	if (!r)
> > +		r = tdx_module_setup();
> > +	on_each_cpu(vmx_off, NULL, true);
> > +	cpus_read_unlock();
> > +
> > +	return r;
> > +}
> 
> As we discussed in v13, this code doesn't track which CPUs have run
> vmx_hardware_enable() successfully.  Thus if ...
> 
> 	on_each_cpu(vmx_tdx_on, &err, true);	/* TDX requires vmxon. */
> 
> ... fails on some cpu due to whatever reason, in ...
> 	
> 	on_each_cpu(vmx_off, NULL, true);
> 
> ... vmx_hardware_disable() will fail to do VMXOFF for those cpus that haven't
> done VMXON successfully yet, resulting in BUG_ON(!kvm_rebooting) being triggered
> in kvm_spurious_fault().
> 
> We need a per-cpu flag to track whether cpu has done VMXON successfully.

Thanks for pointing it out. The following is the fix.

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 8a1d0755d275..b0d3f646afb1 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -4499,26 +4499,39 @@ u64 tdx_non_arch_field_switch(u64 field)
 	}
 }
 
-static void __init vmx_tdx_on(void *info)
+struct vmx_tdx_enabled {
+	cpumask_var_t vmx_enabled;
+	atomic_t *err;
+};
+
+static void __init vmx_tdx_on(void *_vmx_tdx_on)
 {
-	atomic_t *err = info;
+	struct vmx_tdx_enabled *vmx_tdx = _vmx_tdx_on;
 	int r;
 
 	r = vmx_hardware_enable();
-	if (!r)
+	if (!r) {
+		cpumask_set_cpu(smp_processor_id(), vmx_tdx->vmx_enabled);
 		r = tdx_cpu_enable();
+	}
 	if (r)
-		atomic_set(err, r);
+		atomic_set(vmx_tdx->err, r);
 }
 
-static void __init vmx_off(void *unused)
+static void __init vmx_off(void *_vmx_enabled)
 {
-	vmx_hardware_disable();
+	cpumask_var_t vmx_enabled = *(cpumask_var_t *)_vmx_enabled;
+
+	if (cpumask_test_cpu(smp_processor_id(), vmx_enabled))
+		vmx_hardware_disable();
 }
 
 int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
 {
 	atomic_t err = ATOMIC_INIT(0);
+	struct vmx_tdx_enabled vmx_tdx = {
+		.err = &err,
+	};
 	int max_pkgs;
 	int r = 0;
 	int i;
@@ -4582,6 +4595,11 @@ int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
 	for (i = 0; i < max_pkgs; i++)
 		mutex_init(&tdx_mng_key_config_lock[i]);
 
+	if (!zalloc_cpumask_var(&vmx_tdx.vmx_enabled, GFP_KERNEL)) {
+		r = -ENOMEM;
+		goto out;
+	}
+
 	/* tdx_enable() in tdx_module_setup() requires cpus lock. */
 	cpus_read_lock();
 	/*
@@ -4592,12 +4610,15 @@ int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
 	 */
 	if (!cpumask_equal(cpu_online_mask, cpu_present_mask))
 		pr_warn("The old TDX module requires all present CPUs to be online to initialize.\n");
-	on_each_cpu(vmx_tdx_on, &err, true);	/* TDX requires vmxon. */
-	r = atomic_read(&err);
+	on_each_cpu(vmx_tdx_on, &vmx_tdx, true);	/* TDX requires vmxon. */
+	r = atomic_read(vmx_tdx.err);
 	if (!r)
 		r = tdx_module_setup();
-	on_each_cpu(vmx_off, NULL, true);
+	else
+		r = -EIO;
+	on_each_cpu(vmx_off, vmx_tdx.vmx_enabled, true);
 	cpus_read_unlock();
+	free_cpumask_var(vmx_tdx.vmx_enabled);
 	if (r)
 		goto out;
Huang, Kai June 12, 2023, 11:55 p.m. UTC | #5
On Wed, 2023-06-07 at 11:06 -0700, Isaku Yamahata wrote:
> Thanks for pointing it out. The following is the fix.
> 
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 8a1d0755d275..b0d3f646afb1 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -4499,26 +4499,39 @@ u64 tdx_non_arch_field_switch(u64 field)
>  	}
>  }
>  
> -static void __init vmx_tdx_on(void *info)
> +struct vmx_tdx_enabled {
> +	cpumask_var_t vmx_enabled;
> +	atomic_t *err;
> +};
> +

Sorry for late reply.

I think you just need to mimic hardware_enable_all() -- using a per-cpu
variable.  In this way you can get rid of this structure.

But again, we have listed a couple of options in the v13 discussion [1]:

1) Call kvm_ops_update() twice before and after hardware_setup() in order to use
hardware_enable_all() directly.

2) Expose kvm_x86_ops as symbol so VMX can set hardware_{enable|disable}()
callback before hardware_setup() in order to use hardware_enable_all().

3) Implement VMX's own hardware_enable_all() logic as shown in this patch.

4) ???

I think it would be better if Sean can provide some comments here, but until he
does, we can keep using option 3) (this patch).

[1]
https://lore.kernel.org/lkml/5dc84a2601a47ccc29ef43200cf3ec0d1b485d23.camel@intel.com/
Isaku Yamahata June 13, 2023, 5:38 p.m. UTC | #6
On Mon, Jun 12, 2023 at 11:55:14PM +0000,
"Huang, Kai" <kai.huang@intel.com> wrote:

> On Wed, 2023-06-07 at 11:06 -0700, Isaku Yamahata wrote:
> > Thanks for pointing it out. The following is the fix.
> > 
> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index 8a1d0755d275..b0d3f646afb1 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -4499,26 +4499,39 @@ u64 tdx_non_arch_field_switch(u64 field)
> >  	}
> >  }
> >  
> > -static void __init vmx_tdx_on(void *info)
> > +struct vmx_tdx_enabled {
> > +	cpumask_var_t vmx_enabled;
> > +	atomic_t *err;
> > +};
> > +
> 
> Sorry for late reply.
> 
> I think you just need to mimic hardware_enable_all() -- using a per-cpu
> variable.  In this way you can get rid of this structure.
> 
> But again, we have listed a couple of options in the v13 discussion [1]:
> 
> 1) Call kvm_ops_update() twice before and after hardware_setup() in order to use
> hardware_enable_all() directly.
> 
> 2) Expose kvm_x86_ops as symbol so VMX can set hardware_{enable|disable}()
> callback before hardware_setup() in order to use hardware_enable_all().
> 
> 3) Implement VMX's own hardware_enable_all() logic as shown in this patch.
> 
> 4) ???
> 
> I think it would be better if Sean can provide some comments here, but until he
> does, we can keep using option 3) (this patch).
> 
> [1]
> https://lore.kernel.org/lkml/5dc84a2601a47ccc29ef43200cf3ec0d1b485d23.camel@intel.com/

Ok, makes sense. Here is the updated version with the fix for the error you
pointed out.  Introduce a CPU bitmap to track which CPUs enabled VMX (VMXON)
successfully.  Do VMXOFF only for the CPUs whose bit is set.


From 01dbb58b50e3119da9c3d639a10eb5d0029ee944 Mon Sep 17 00:00:00 2001
Message-Id: <01dbb58b50e3119da9c3d639a10eb5d0029ee944.1686677692.git.isaku.yamahata@intel.com>
From: Isaku Yamahata <isaku.yamahata@intel.com>
Date: Tue, 22 Feb 2022 14:44:15 -0800
Subject: [PATCH] KVM: TDX: Initialize the TDX module when loading the KVM
 intel kernel module

TDX requires several initialization steps for KVM to create guest TDs.
Detect CPU feature, enable VMX (TDX is based on VMX) on all online CPUs,
detect the TDX module availability, initialize it and disable VMX.

To enable/disable VMX on all online CPUs, utilize
vmx_hardware_enable/disable().  The method also initializes each CPU for
TDX.  TDX requires calling a TDX initialization function per logical
processor (LP) before the LP uses TDX.  When the CPU is becoming online,
call the TDX LP initialization API.  If TDX initialization fails, refuse to
bring the CPU online, for simplicity, instead of having TDX avoid the failed LP.

There are several options on when to initialize the TDX module.  A.) kernel
module loading time, B.) the first guest TD creation time.  A.) was chosen.
With B.), a user may hit an error during TDX initialization when trying to
create the first guest TD.  A machine that fails to initialize the TDX
module can't boot any guest TD at all.  Such a failure is undesirable and a
surprise because the user expects the machine to be able to accommodate
guest TDs, but it cannot.  So A.) is better than B.).

Introduce a module parameter, kvm_intel.tdx, to explicitly enable TDX KVM
support.  It's off by default to keep the same behavior for those who don't
use TDX.  Implement hardware_setup method to detect TDX feature of CPU and
initialize TDX module.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 arch/x86/kvm/Makefile      |  1 +
 arch/x86/kvm/vmx/main.c    | 34 ++++++++++++++-
 arch/x86/kvm/vmx/tdx.c     | 84 ++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/x86_ops.h |  8 ++++
 4 files changed, 125 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/kvm/vmx/tdx.c

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 0e894ae23cbc..4b01ab842ab7 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -25,6 +25,7 @@ kvm-$(CONFIG_KVM_SMM)	+= smm.o
 kvm-intel-y		+= vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
 			   vmx/hyperv.o vmx/nested.o vmx/posted_intr.o vmx/main.o
 kvm-intel-$(CONFIG_X86_SGX_KVM)	+= vmx/sgx.o
+kvm-intel-$(CONFIG_INTEL_TDX_HOST)	+= vmx/tdx.o
 
 kvm-amd-y		+= svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \
 			   svm/sev.o svm/hyperv.o
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index bbeb32d40b8b..af037d1367e7 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -6,6 +6,36 @@
 #include "nested.h"
 #include "pmu.h"
 
+static bool enable_tdx __ro_after_init;
+module_param_named(tdx, enable_tdx, bool, 0444);
+
+static int vt_hardware_enable(void)
+{
+	int ret;
+
+	ret = vmx_hardware_enable();
+	if (ret || !enable_tdx)
+		return ret;
+
+	ret = tdx_cpu_enable();
+	if (ret)
+		vmx_hardware_disable();
+	return ret;
+}
+
+static __init int vt_hardware_setup(void)
+{
+	int ret;
+
+	ret = vmx_hardware_setup();
+	if (ret)
+		return ret;
+
+	enable_tdx = enable_tdx && !tdx_hardware_setup(&vt_x86_ops);
+
+	return 0;
+}
+
 #define VMX_REQUIRED_APICV_INHIBITS				\
 	(BIT(APICV_INHIBIT_REASON_DISABLE)|			\
 	 BIT(APICV_INHIBIT_REASON_ABSENT) |			\
@@ -22,7 +52,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
 
 	.hardware_unsetup = vmx_hardware_unsetup,
 
-	.hardware_enable = vmx_hardware_enable,
+	.hardware_enable = vt_hardware_enable,
 	.hardware_disable = vmx_hardware_disable,
 	.has_emulated_msr = vmx_has_emulated_msr,
 
@@ -157,7 +187,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
 };
 
 struct kvm_x86_init_ops vt_init_ops __initdata = {
-	.hardware_setup = vmx_hardware_setup,
+	.hardware_setup = vt_hardware_setup,
 	.handle_intel_pt_intr = NULL,
 
 	.runtime_ops = &vt_x86_ops,
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
new file mode 100644
index 000000000000..8a378fb6f1d4
--- /dev/null
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpu.h>
+
+#include <asm/tdx.h>
+
+#include "capabilities.h"
+#include "x86_ops.h"
+#include "x86.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+static int __init tdx_module_setup(void)
+{
+	int ret;
+
+	ret = tdx_enable();
+	if (ret) {
+		pr_info("Failed to initialize TDX module.\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+struct vmx_tdx_enabled {
+	cpumask_var_t vmx_enabled;
+	atomic_t err;
+};
+
+static void __init vmx_tdx_on(void *_vmx_tdx)
+{
+	struct vmx_tdx_enabled *vmx_tdx = _vmx_tdx;
+	int r;
+
+	r = vmx_hardware_enable();
+	if (!r) {
+		cpumask_set_cpu(smp_processor_id(), vmx_tdx->vmx_enabled);
+		r = tdx_cpu_enable();
+	}
+	if (r)
+		atomic_set(&vmx_tdx->err, r);
+}
+
+static void __init vmx_off(void *_vmx_enabled)
+{
+	cpumask_var_t *vmx_enabled = (cpumask_var_t *)_vmx_enabled;
+
+	if (cpumask_test_cpu(smp_processor_id(), *vmx_enabled))
+		vmx_hardware_disable();
+}
+
+int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
+{
+	struct vmx_tdx_enabled vmx_tdx = {
+		.err = ATOMIC_INIT(0),
+	};
+	int r = 0;
+
+	if (!enable_ept) {
+		pr_warn("Cannot enable TDX with EPT disabled\n");
+		return -EINVAL;
+	}
+
+	if (!zalloc_cpumask_var(&vmx_tdx.vmx_enabled, GFP_KERNEL)) {
+		r = -ENOMEM;
+		goto out;
+	}
+
+	/* tdx_enable() in tdx_module_setup() requires cpus lock. */
+	cpus_read_lock();
+	on_each_cpu(vmx_tdx_on, &vmx_tdx, true);	/* TDX requires vmxon. */
+	r = atomic_read(&vmx_tdx.err);
+	if (!r)
+		r = tdx_module_setup();
+	else
+		r = -EIO;
+	on_each_cpu(vmx_off, &vmx_tdx.vmx_enabled, true);
+	cpus_read_unlock();
+	free_cpumask_var(vmx_tdx.vmx_enabled);
+
+out:
+	return r;
+}
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index 051b5c4b5c2f..a30683c6d822 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -20,6 +20,8 @@ bool kvm_is_vmx_supported(void);
 int __init vmx_init(void);
 void vmx_exit(void);
 
+__init int vmx_hardware_setup(void);
+
 extern struct kvm_x86_ops vt_x86_ops __initdata;
 extern struct kvm_x86_init_ops vt_init_ops __initdata;
 
@@ -133,4 +135,10 @@ void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu);
 #endif
 void vmx_setup_mce(struct kvm_vcpu *vcpu);
 
+#ifdef CONFIG_INTEL_TDX_HOST
+int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops);
+#else
+static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return -EOPNOTSUPP; }
+#endif
+
 #endif /* __KVM_X86_VMX_X86_OPS_H */
Huang, Kai June 14, 2023, 9:41 a.m. UTC | #7
On Tue, 2023-06-13 at 10:38 -0700, Isaku Yamahata wrote:
> On Mon, Jun 12, 2023 at 11:55:14PM +0000,
> "Huang, Kai" <kai.huang@intel.com> wrote:
> 
> > On Wed, 2023-06-07 at 11:06 -0700, Isaku Yamahata wrote:
> > > Thanks for pointing it out. The following is the fix.
> > > 
> > > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > > index 8a1d0755d275..b0d3f646afb1 100644
> > > --- a/arch/x86/kvm/vmx/tdx.c
> > > +++ b/arch/x86/kvm/vmx/tdx.c
> > > @@ -4499,26 +4499,39 @@ u64 tdx_non_arch_field_switch(u64 field)
> > >  	}
> > >  }
> > >  
> > > -static void __init vmx_tdx_on(void *info)
> > > +struct vmx_tdx_enabled {
> > > +	cpumask_var_t vmx_enabled;
> > > +	atomic_t *err;
> > > +};
> > > +
> > 
> > Sorry for late reply.
> > 
> > I think you just need to mimic hardware_enable_all() -- using a per-cpu
> > variable.  In this way you can get rid of this structure.
> > 
> > But again, we have listed a couple of options in the v13 discussion [1]:
> > 
> > 1) Call kvm_ops_update() twice before and after hardware_setup() in order to use
> > hardware_enable_all() directly.
> > 
> > 2) Expose kvm_x86_ops as symbol so VMX can set hardware_{enable|disable}()
> > callback before hardware_setup() in order to use hardware_enable_all().
> > 
> > 3) Implement VMX's own hardware_enable_all() logic as shown in this patch.
> > 
> > 4) ???
> > 
> > I think it would be better if Sean can provide some comments here, but until he
> > does, we can keep using option 3) (this patch).
> > 
> > [1]
> > https://lore.kernel.org/lkml/5dc84a2601a47ccc29ef43200cf3ec0d1b485d23.camel@intel.com/
> 
> Ok, makes sense. Here is the updated version with the fix for the error you
> pointed out.  Introduce cpu bitmap to track which cpu enable VMX(VMXON)
> successfully.  Disable VMX off only for cpu with bit set.
> 
> 
[...]

> +struct vmx_tdx_enabled {
> +	cpumask_var_t vmx_enabled;
> +	atomic_t err;
> +};
> +

Again (and again), why not just mimic hardware_enable_all() to use a per-cpu
variable instead of a cpumask, so that you can get rid of this structure?
Isaku Yamahata June 14, 2023, 4:05 p.m. UTC | #8
On Wed, Jun 14, 2023 at 09:41:58AM +0000,
"Huang, Kai" <kai.huang@intel.com> wrote:

> On Tue, 2023-06-13 at 10:38 -0700, Isaku Yamahata wrote:
> > On Mon, Jun 12, 2023 at 11:55:14PM +0000,
> > "Huang, Kai" <kai.huang@intel.com> wrote:
> > 
> > > On Wed, 2023-06-07 at 11:06 -0700, Isaku Yamahata wrote:
> > > > Thanks for pointing it out. The following is the fix.
> > > > 
> > > > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > > > index 8a1d0755d275..b0d3f646afb1 100644
> > > > --- a/arch/x86/kvm/vmx/tdx.c
> > > > +++ b/arch/x86/kvm/vmx/tdx.c
> > > > @@ -4499,26 +4499,39 @@ u64 tdx_non_arch_field_switch(u64 field)
> > > >  	}
> > > >  }
> > > >  
> > > > -static void __init vmx_tdx_on(void *info)
> > > > +struct vmx_tdx_enabled {
> > > > +	cpumask_var_t vmx_enabled;
> > > > +	atomic_t *err;
> > > > +};
> > > > +
> > > 
> > > Sorry for late reply.
> > > 
> > > I think you just need to mimic hardware_enable_all() -- using a per-cpu
> > > variable.  In this way you can get rid of this structure.
> > > 
> > > But again, we have listed a couple of options in the v13 discussion [1]:
> > > 
> > > 1) Call kvm_ops_update() twice before and after hardware_setup() in order to use
> > > hardware_enable_all() directly.
> > > 
> > > 2) Expose kvm_x86_ops as symbol so VMX can set hardware_{enable|disable}()
> > > callback before hardware_setup() in order to use hardware_enable_all().
> > > 
> > > 3) Implement VMX's own hardware_enable_all() logic as shown in this patch.
> > > 
> > > 4) ???
> > > 
> > > I think it would be better if Sean can provide some comments here, but until he
> > > does, we can keep using option 3) (this patch).
> > > 
> > > [1]
> > > https://lore.kernel.org/lkml/5dc84a2601a47ccc29ef43200cf3ec0d1b485d23.camel@intel.com/
> > 
> > Ok, makes sense. Here is the updated version with the fix for the error you
> > pointed out.  Introduce cpu bitmap to track which cpu enable VMX(VMXON)
> > successfully.  Disable VMX off only for cpu with bit set.
> > 
> > 
> [...]
> 
> > +struct vmx_tdx_enabled {
> > +	cpumask_var_t vmx_enabled;
> > +	atomic_t err;
> > +};
> > +
> 
> Again (and again), why not just mimic hardware_enable_all() to use a per-cpu
> variable instead of a cpumask, so that you can get rid of this structure?

Do you mean having __hardware_enable_nolock() use a per-cpu variable?
Because hardware setup is a one-shot operation during initialization, we don't
want to allocate the variable statically. Anyway, the following is a patch that
uses a per-cpu variable with dynamic allocation.  Which version do you prefer?


diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 95be6b2fba83..40a3c9c01ac6 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -4511,38 +4511,34 @@ u64 tdx_non_arch_field_switch(u64 field)
 }
 
 struct vmx_tdx_enabled {
-	cpumask_var_t vmx_enabled;
-	atomic_t err;
+	bool vmx_enabled;
+	int err;
 };
 
 static void __init vmx_tdx_on(void *_vmx_tdx)
 {
-	struct vmx_tdx_enabled *vmx_tdx = _vmx_tdx;
-	int r;
+	struct vmx_tdx_enabled *vmx_tdx = this_cpu_ptr(_vmx_tdx);
 
-	r = vmx_hardware_enable();
-	if (!r) {
-		cpumask_set_cpu(smp_processor_id(), vmx_tdx->vmx_enabled);
-		r = tdx_cpu_enable();
+	vmx_tdx->err = vmx_hardware_enable();
+	if (!vmx_tdx->err) {
+		vmx_tdx->vmx_enabled = true;
+		vmx_tdx->err = tdx_cpu_enable();
 	}
-	if (r)
-		atomic_set(&vmx_tdx->err, r);
 }
 
-static void __init vmx_off(void *_vmx_enabled)
+static void __init vmx_off(void *_vmx_tdx)
 {
-	cpumask_var_t *vmx_enabled = (cpumask_var_t *)_vmx_enabled;
+	struct vmx_tdx_enabled *vmx_tdx = this_cpu_ptr(_vmx_tdx);
 
-	if (cpumask_test_cpu(smp_processor_id(), *vmx_enabled))
+	if (vmx_tdx->vmx_enabled)
 		vmx_hardware_disable();
 }
 
 int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
 {
-	struct vmx_tdx_enabled vmx_tdx = {
-		.err = ATOMIC_INIT(0),
-	};
+	struct vmx_tdx_enabled __percpu *vmx_tdx_enabled;
 	int max_pkgs;
+	int cpu;
 	int r = 0;
 	int i;
 
@@ -4603,7 +4599,8 @@ int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
 	for (i = 0; i < max_pkgs; i++)
 		mutex_init(&tdx_mng_key_config_lock[i]);
 
-	if (!zalloc_cpumask_var(&vmx_tdx.vmx_enabled, GFP_KERNEL)) {
+	vmx_tdx_enabled = alloc_percpu_gfp(struct vmx_tdx_enabled, GFP_KERNEL | __GFP_ZERO);
+	if (!vmx_tdx_enabled) {
 		r = -ENOMEM;
 		goto out;
 	}
@@ -4618,15 +4615,21 @@ int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
 	 */
 	if (!cpumask_equal(cpu_online_mask, cpu_present_mask))
 		pr_warn("The old TDX module requires all present CPUs to be online to initialize.\n");
-	on_each_cpu(vmx_tdx_on, &vmx_tdx, true);	/* TDX requires vmxon. */
-	r = atomic_read(&vmx_tdx.err);
+	on_each_cpu(vmx_tdx_on, vmx_tdx_enabled, true);	/* TDX requires vmxon. */
+	for_each_present_cpu(cpu) {
+		struct vmx_tdx_enabled *vmx_tdx = per_cpu_ptr(vmx_tdx_enabled, cpu);
+		if (vmx_tdx->err) {
+			r = vmx_tdx->err;
+			break;
+		}
+	}
 	if (!r)
 		r = tdx_module_setup();
 	else
 		r = -EIO;
-	on_each_cpu(vmx_off, &vmx_tdx.vmx_enabled, true);
+	on_each_cpu(vmx_off, vmx_tdx_enabled, true);
 	cpus_read_unlock();
-	free_cpumask_var(vmx_tdx.vmx_enabled);
+	free_percpu(vmx_tdx_enabled);
 	if (r)
 		goto out;
Huang, Kai June 14, 2023, 11:14 p.m. UTC | #9
On Wed, 2023-06-14 at 09:05 -0700, Isaku Yamahata wrote:
> On Wed, Jun 14, 2023 at 09:41:58AM +0000,
> "Huang, Kai" <kai.huang@intel.com> wrote:
> 
> > On Tue, 2023-06-13 at 10:38 -0700, Isaku Yamahata wrote:
> > > On Mon, Jun 12, 2023 at 11:55:14PM +0000,
> > > "Huang, Kai" <kai.huang@intel.com> wrote:
> > > 
> > > > On Wed, 2023-06-07 at 11:06 -0700, Isaku Yamahata wrote:
> > > > > Thanks for pointing it out. The following is the fix.
> > > > > 
> > > > > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > > > > index 8a1d0755d275..b0d3f646afb1 100644
> > > > > --- a/arch/x86/kvm/vmx/tdx.c
> > > > > +++ b/arch/x86/kvm/vmx/tdx.c
> > > > > @@ -4499,26 +4499,39 @@ u64 tdx_non_arch_field_switch(u64 field)
> > > > >  	}
> > > > >  }
> > > > >  
> > > > > -static void __init vmx_tdx_on(void *info)
> > > > > +struct vmx_tdx_enabled {
> > > > > +	cpumask_var_t vmx_enabled;
> > > > > +	atomic_t *err;
> > > > > +};
> > > > > +
> > > > 
> > > > Sorry for late reply.
> > > > 
> > > > I think you just need to mimic hardware_enable_all() -- using a per-cpu
> > > > variable.  In this way you can get rid of this structure.
> > > > 
> > > > But again, we have listed a couple of options in the v13 discussion [1]:
> > > > 
> > > > 1) Call kvm_ops_update() twice before and after hardware_setup() in order to use
> > > > hardware_enable_all() directly.
> > > > 
> > > > 2) Expose kvm_x86_ops as symbol so VMX can set hardware_{enable|disable}()
> > > > callback before hardware_setup() in order to use hardware_enable_all().
> > > > 
> > > > 3) Implement VMX's own hardware_enable_all() logic as shown in this patch.
> > > > 
> > > > 4) ???
> > > > 
> > > > I think it would be better if Sean can provide some comments here, but until he
> > > > does, we can keep using option 3) (this patch).
> > > > 
> > > > [1]
> > > > https://lore.kernel.org/lkml/5dc84a2601a47ccc29ef43200cf3ec0d1b485d23.camel@intel.com/
> > > 
> > > Ok, makes sense. Here is the updated version with the fix for the error you
> > > pointed out.  Introduce a CPU bitmap to track which CPUs enabled VMX (VMXON)
> > > successfully.  Disable VMX only for CPUs whose bit is set.
> > > 
> > > 
> > [...]
> > 
> > > +struct vmx_tdx_enabled {
> > > +	cpumask_var_t vmx_enabled;
> > > +	atomic_t err;
> > > +};
> > > +
> > 
> > Again (and again), why not just mimic hardware_enable_all() to use a per-cpu
> > variable instead of a cpumask, so that you can get rid of this structure?
> 
> Do you mean that __hardware_enable_nolock() uses a per-cpu variable?

Yes.

> Because hardware setup is a one-shot operation at initialization, we don't want to
> allocate the variable statically. Anyway, the following is a patch that uses a
> per-cpu variable with dynamic allocation.  Which version do you prefer?

This looks over-complicated.  My preference is to mimic
__hardware_enable_nolock() to avoid the 'struct vmx_tdx_enabled'.  But if you
are concerned about allocating the variable statically, then your current
cpumask_var_t approach looks cleaner to me.

Anyway, since you have mentioned the reason, and I understand your concern now,
I will leave this to you.
diff mbox series

Patch

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 0e894ae23cbc..4b01ab842ab7 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -25,6 +25,7 @@  kvm-$(CONFIG_KVM_SMM)	+= smm.o
 kvm-intel-y		+= vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
 			   vmx/hyperv.o vmx/nested.o vmx/posted_intr.o vmx/main.o
 kvm-intel-$(CONFIG_X86_SGX_KVM)	+= vmx/sgx.o
+kvm-intel-$(CONFIG_INTEL_TDX_HOST)	+= vmx/tdx.o
 
 kvm-amd-y		+= svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \
 			   svm/sev.o svm/hyperv.o
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 791ee271393d..2638b344864c 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -6,6 +6,36 @@ 
 #include "nested.h"
 #include "pmu.h"
 
+static bool enable_tdx __ro_after_init;
+module_param_named(tdx, enable_tdx, bool, 0444);
+
+static int vt_hardware_enable(void)
+{
+	int ret;
+
+	ret = vmx_hardware_enable();
+	if (ret || !enable_tdx)
+		return ret;
+
+	ret = tdx_cpu_enable();
+	if (ret)
+		vmx_hardware_disable();
+	return ret;
+}
+
+static __init int vt_hardware_setup(void)
+{
+	int ret;
+
+	ret = vmx_hardware_setup();
+	if (ret)
+		return ret;
+
+	enable_tdx = enable_tdx && !tdx_hardware_setup(&vt_x86_ops);
+
+	return 0;
+}
+
 #define VMX_REQUIRED_APICV_INHIBITS		       \
 (						       \
        BIT(APICV_INHIBIT_REASON_DISABLE)|	       \
@@ -24,7 +54,7 @@  struct kvm_x86_ops vt_x86_ops __initdata = {
 
 	.hardware_unsetup = vmx_hardware_unsetup,
 
-	.hardware_enable = vmx_hardware_enable,
+	.hardware_enable = vt_hardware_enable,
 	.hardware_disable = vmx_hardware_disable,
 	.has_emulated_msr = vmx_has_emulated_msr,
 
@@ -159,7 +189,7 @@  struct kvm_x86_ops vt_x86_ops __initdata = {
 };
 
 struct kvm_x86_init_ops vt_init_ops __initdata = {
-	.hardware_setup = vmx_hardware_setup,
+	.hardware_setup = vt_hardware_setup,
 	.handle_intel_pt_intr = NULL,
 
 	.runtime_ops = &vt_x86_ops,
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
new file mode 100644
index 000000000000..965545a308ad
--- /dev/null
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -0,0 +1,63 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpu.h>
+
+#include <asm/tdx.h>
+
+#include "capabilities.h"
+#include "x86_ops.h"
+#include "x86.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+static int __init tdx_module_setup(void)
+{
+	int ret;
+
+	ret = tdx_enable();
+	if (ret) {
+		pr_info("Failed to initialize TDX module.\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static void __init vmx_tdx_on(void *info)
+{
+       atomic_t *err = info;
+       int r;
+
+       r = vmx_hardware_enable();
+       if (!r)
+	       r = tdx_cpu_enable();
+       if (r)
+	       atomic_set(err, r);
+}
+
+static void __init vmx_off(void *unused)
+{
+       vmx_hardware_disable();
+}
+
+int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
+{
+	atomic_t err = ATOMIC_INIT(0);
+	int r = 0;
+
+	if (!enable_ept) {
+		pr_warn("Cannot enable TDX with EPT disabled\n");
+		return -EINVAL;
+	}
+
+	/* tdx_enable() in tdx_module_setup() requires cpus lock. */
+	cpus_read_lock();
+	on_each_cpu(vmx_tdx_on, &err, true);	/* TDX requires vmxon. */
+	r = atomic_read(&err);
+	if (!r)
+		r = tdx_module_setup();
+	on_each_cpu(vmx_off, NULL, true);
+	cpus_read_unlock();
+
+	return r;
+}
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index 051b5c4b5c2f..f59e5197836a 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -20,6 +20,8 @@  bool kvm_is_vmx_supported(void);
 int __init vmx_init(void);
 void vmx_exit(void);
 
+__init int vmx_hardware_setup(void);
+
 extern struct kvm_x86_ops vt_x86_ops __initdata;
 extern struct kvm_x86_init_ops vt_init_ops __initdata;
 
@@ -133,4 +135,10 @@  void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu);
 #endif
 void vmx_setup_mce(struct kvm_vcpu *vcpu);
 
+#ifdef CONFIG_INTEL_TDX_HOST
+int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops);
+#else
+static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return -ENOSYS; }
+#endif
+
 #endif /* __KVM_X86_VMX_X86_OPS_H */