
[v19,023/130] KVM: TDX: Initialize the TDX module when loading the KVM intel kernel module

Message ID f028d43abeadaa3134297d28fb99f283445c0333.1708933498.git.isaku.yamahata@intel.com (mailing list archive)
State New, archived
Series [v19,001/130] x86/virt/tdx: Rename _offset to _member for TD_SYSINFO_MAP() macro

Commit Message

Isaku Yamahata Feb. 26, 2024, 8:25 a.m. UTC
From: Isaku Yamahata <isaku.yamahata@intel.com>

TDX requires several initialization steps for KVM to create guest TDs.
Detect CPU feature, enable VMX (TDX is based on VMX) on all online CPUs,
detect the TDX module availability, initialize it and disable VMX.

To enable/disable VMX on all online CPUs, utilize
vmx_hardware_enable/disable().  The method also initializes each CPU for
TDX.  TDX requires calling a TDX initialization function per logical
processor (LP) before the LP uses TDX.  When the CPU is becoming online,
call the TDX LP initialization API.  If it fails to initialize TDX, refuse
CPU online for simplicity instead of TDX avoiding the failed LP.

There are several options on when to initialize the TDX module.  A.) kernel
module loading time, B.) the first guest TD creation time.  A.) was chosen.
With B.), a user may hit an error of the TDX initialization when trying to
create the first guest TD.  The machine that fails to initialize the TDX
module can't boot any guest TD further.  Such failure is undesirable and a
surprise because the user expects that the machine can accommodate guest
TD, but not.  So A.) is better than B.).

Introduce a module parameter, kvm_intel.tdx, to explicitly enable TDX KVM
support.  It's off by default to keep the same behavior for those who don't
use TDX.  Implement hardware_setup method to detect TDX feature of CPU and
initialize TDX module.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
v19:
- fixed vt_hardware_enable() to use vmx_hardware_enable()
- renamed vmx_tdx_enabled => tdx_enabled
- renamed vmx_tdx_on() => tdx_on()

v18:
- Added comment in vt_hardware_enable() by Binbin.

Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 arch/x86/kvm/Makefile      |  1 +
 arch/x86/kvm/vmx/main.c    | 19 ++++++++-
 arch/x86/kvm/vmx/tdx.c     | 84 ++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/x86_ops.h |  6 +++
 4 files changed, 109 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/kvm/vmx/tdx.c
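
For reference, the new module parameter is read-only at runtime (mode 0444),
so TDX support is opted in when kvm-intel is loaded; a usage sketch based on
the module_param_named() in this patch:

	modprobe kvm_intel tdx=1

or kvm_intel.tdx=1 on the kernel command line.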

Comments

Binbin Wu March 14, 2024, 2:05 a.m. UTC | #1
On 2/26/2024 4:25 PM, isaku.yamahata@intel.com wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
>
> TDX requires several initialization steps for KVM to create guest TDs.
> Detect CPU feature, enable VMX (TDX is based on VMX) on all online CPUs,
> detect the TDX module availability, initialize it and disable VMX.
>
> To enable/disable VMX on all online CPUs, utilize
> vmx_hardware_enable/disable().  The method also initializes each CPU for
> TDX.  TDX requires calling a TDX initialization function per logical
> processor (LP) before the LP uses TDX.  When the CPU is becoming online,
> call the TDX LP initialization API.  If it fails to initialize TDX, refuse
> CPU online for simplicity instead of TDX avoiding the failed LP.
>
> There are several options on when to initialize the TDX module.  A.) kernel
> module loading time, B.) the first guest TD creation time.  A.) was chosen.
> With B.), a user may hit an error of the TDX initialization when trying to
> create the first guest TD.  The machine that fails to initialize the TDX
> module can't boot any guest TD further.  Such failure is undesirable and a
> surprise because the user expects that the machine can accommodate guest
> TD, but not.  So A.) is better than B.).
>
> Introduce a module parameter, kvm_intel.tdx, to explicitly enable TDX KVM
> support.  It's off by default to keep the same behavior for those who don't
> use TDX.  Implement hardware_setup method to detect TDX feature of CPU and
> initialize TDX module.
>
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
> v19:
> - fixed vt_hardware_enable() to use vmx_hardware_enable()
> - renamed vmx_tdx_enabled => tdx_enabled
> - renamed vmx_tdx_on() => tdx_on()
>
> v18:
> - Added comment in vt_hardware_enable() by Binbin.
>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>   arch/x86/kvm/Makefile      |  1 +
>   arch/x86/kvm/vmx/main.c    | 19 ++++++++-
>   arch/x86/kvm/vmx/tdx.c     | 84 ++++++++++++++++++++++++++++++++++++++
>   arch/x86/kvm/vmx/x86_ops.h |  6 +++
>   4 files changed, 109 insertions(+), 1 deletion(-)
>   create mode 100644 arch/x86/kvm/vmx/tdx.c
>
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index 274df24b647f..5b85ef84b2e9 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -24,6 +24,7 @@ kvm-intel-y		+= vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
>   
>   kvm-intel-$(CONFIG_X86_SGX_KVM)	+= vmx/sgx.o
>   kvm-intel-$(CONFIG_KVM_HYPERV)	+= vmx/hyperv.o vmx/hyperv_evmcs.o
> +kvm-intel-$(CONFIG_INTEL_TDX_HOST)	+= vmx/tdx.o
>   
>   kvm-amd-y		+= svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \
>   			   svm/sev.o
> diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> index 18cecf12c7c8..18aef6e23aab 100644
> --- a/arch/x86/kvm/vmx/main.c
> +++ b/arch/x86/kvm/vmx/main.c
> @@ -6,6 +6,22 @@
>   #include "nested.h"
>   #include "pmu.h"
>   
> +static bool enable_tdx __ro_after_init;
> +module_param_named(tdx, enable_tdx, bool, 0444);
> +
> +static __init int vt_hardware_setup(void)
> +{
> +	int ret;
> +
> +	ret = vmx_hardware_setup();
> +	if (ret)
> +		return ret;
> +
> +	enable_tdx = enable_tdx && !tdx_hardware_setup(&vt_x86_ops);
> +
> +	return 0;
> +}
> +
>   #define VMX_REQUIRED_APICV_INHIBITS				\
>   	(BIT(APICV_INHIBIT_REASON_DISABLE)|			\
>   	 BIT(APICV_INHIBIT_REASON_ABSENT) |			\
> @@ -22,6 +38,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
>   
>   	.hardware_unsetup = vmx_hardware_unsetup,
>   
> +	/* TDX cpu enablement is done by tdx_hardware_setup(). */

How about if there are some LPs that are offline?
In tdx_hardware_setup(), only online LPs are initialized for TDX, right?
Then when an offline LP becomes online, it doesn't have a chance to call
tdx_cpu_enable()?

>   	.hardware_enable = vmx_hardware_enable,
>   	.hardware_disable = vmx_hardware_disable,
>   	.has_emulated_msr = vmx_has_emulated_msr,
> @@ -161,7 +178,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
>   };
>   
>   struct kvm_x86_init_ops vt_init_ops __initdata = {
> -	.hardware_setup = vmx_hardware_setup,
> +	.hardware_setup = vt_hardware_setup,
>   	.handle_intel_pt_intr = NULL,
>   
>   	.runtime_ops = &vt_x86_ops,
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> new file mode 100644
> index 000000000000..43c504fb4fed
> --- /dev/null
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -0,0 +1,84 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/cpu.h>
> +
> +#include <asm/tdx.h>
> +
> +#include "capabilities.h"
> +#include "x86_ops.h"
> +#include "x86.h"
> +
> +#undef pr_fmt
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +static int __init tdx_module_setup(void)
> +{
> +	int ret;
> +
> +	ret = tdx_enable();
> +	if (ret) {
> +		pr_info("Failed to initialize TDX module.\n");
> +		return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +struct tdx_enabled {
> +	cpumask_var_t enabled;
> +	atomic_t err;
> +};
> +
> +static void __init tdx_on(void *_enable)
> +{
> +	struct tdx_enabled *enable = _enable;
> +	int r;
> +
> +	r = vmx_hardware_enable();
> +	if (!r) {
> +		cpumask_set_cpu(smp_processor_id(), enable->enabled);
> +		r = tdx_cpu_enable();
> +	}
> +	if (r)
> +		atomic_set(&enable->err, r);
> +}
> +
> +static void __init vmx_off(void *_enabled)
> +{
> +	cpumask_var_t *enabled = (cpumask_var_t *)_enabled;
> +
> +	if (cpumask_test_cpu(smp_processor_id(), *enabled))
> +		vmx_hardware_disable();
> +}
> +
> +int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
> +{
> +	struct tdx_enabled enable = {
> +		.err = ATOMIC_INIT(0),
> +	};
> +	int r = 0;
> +
> +	if (!enable_ept) {
> +		pr_warn("Cannot enable TDX with EPT disabled\n");
> +		return -EINVAL;
> +	}
> +
> +	if (!zalloc_cpumask_var(&enable.enabled, GFP_KERNEL)) {
> +		r = -ENOMEM;
> +		goto out;
> +	}
> +
> +	/* tdx_enable() in tdx_module_setup() requires cpus lock. */
> +	cpus_read_lock();
> +	on_each_cpu(tdx_on, &enable, true); /* TDX requires vmxon. */
> +	r = atomic_read(&enable.err);
> +	if (!r)
> +		r = tdx_module_setup();
> +	else
> +		r = -EIO;
> +	on_each_cpu(vmx_off, &enable.enabled, true);
> +	cpus_read_unlock();
> +	free_cpumask_var(enable.enabled);
> +
> +out:
> +	return r;
> +}
> diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
> index b936388853ab..346289a2a01c 100644
> --- a/arch/x86/kvm/vmx/x86_ops.h
> +++ b/arch/x86/kvm/vmx/x86_ops.h
> @@ -135,4 +135,10 @@ void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu);
>   #endif
>   void vmx_setup_mce(struct kvm_vcpu *vcpu);
>   
> +#ifdef CONFIG_INTEL_TDX_HOST
> +int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops);
> +#else
> +static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return -EOPNOTSUPP; }
> +#endif
> +
>   #endif /* __KVM_X86_VMX_X86_OPS_H */
Isaku Yamahata March 14, 2024, 4:27 p.m. UTC | #2
On Thu, Mar 14, 2024 at 10:05:35AM +0800,
Binbin Wu <binbin.wu@linux.intel.com> wrote:

> > diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> > index 18cecf12c7c8..18aef6e23aab 100644
> > --- a/arch/x86/kvm/vmx/main.c
> > +++ b/arch/x86/kvm/vmx/main.c
> > @@ -6,6 +6,22 @@
> >   #include "nested.h"
> >   #include "pmu.h"
> > +static bool enable_tdx __ro_after_init;
> > +module_param_named(tdx, enable_tdx, bool, 0444);
> > +
> > +static __init int vt_hardware_setup(void)
> > +{
> > +	int ret;
> > +
> > +	ret = vmx_hardware_setup();
> > +	if (ret)
> > +		return ret;
> > +
> > +	enable_tdx = enable_tdx && !tdx_hardware_setup(&vt_x86_ops);
> > +
> > +	return 0;
> > +}
> > +
> >   #define VMX_REQUIRED_APICV_INHIBITS				\
> >   	(BIT(APICV_INHIBIT_REASON_DISABLE)|			\
> >   	 BIT(APICV_INHIBIT_REASON_ABSENT) |			\
> > @@ -22,6 +38,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
> >   	.hardware_unsetup = vmx_hardware_unsetup,
> > +	/* TDX cpu enablement is done by tdx_hardware_setup(). */
> 
> How about if there are some LPs that are offline?
> In tdx_hardware_setup(), only online LPs are initialized for TDX, right?

Correct.


> Then when an offline LP becomes online, it doesn't have a chance to call
> tdx_cpu_enable()?

KVM registers kvm_online/offline_cpu() @ kvm_main.c as cpu hotplug callbacks.
Eventually the x86 kvm hardware_enable() is called on online/offline events.
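
(For context, a sketch of that registration in kvm_init()/kvm_main.c; the
state name matches the one visible later in this thread, other details may
vary by kernel version:)

	int r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
					  kvm_online_cpu, kvm_offline_cpu);
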
Binbin Wu March 15, 2024, 4:44 a.m. UTC | #3
On 3/15/2024 12:27 AM, Isaku Yamahata wrote:
> On Thu, Mar 14, 2024 at 10:05:35AM +0800,
> Binbin Wu <binbin.wu@linux.intel.com> wrote:
>
>>> diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
>>> index 18cecf12c7c8..18aef6e23aab 100644
>>> --- a/arch/x86/kvm/vmx/main.c
>>> +++ b/arch/x86/kvm/vmx/main.c
>>> @@ -6,6 +6,22 @@
>>>    #include "nested.h"
>>>    #include "pmu.h"
>>> +static bool enable_tdx __ro_after_init;
>>> +module_param_named(tdx, enable_tdx, bool, 0444);
>>> +
>>> +static __init int vt_hardware_setup(void)
>>> +{
>>> +	int ret;
>>> +
>>> +	ret = vmx_hardware_setup();
>>> +	if (ret)
>>> +		return ret;
>>> +
>>> +	enable_tdx = enable_tdx && !tdx_hardware_setup(&vt_x86_ops);
>>> +
>>> +	return 0;
>>> +}
>>> +
>>>    #define VMX_REQUIRED_APICV_INHIBITS				\
>>>    	(BIT(APICV_INHIBIT_REASON_DISABLE)|			\
>>>    	 BIT(APICV_INHIBIT_REASON_ABSENT) |			\
>>> @@ -22,6 +38,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
>>>    	.hardware_unsetup = vmx_hardware_unsetup,
>>> +	/* TDX cpu enablement is done by tdx_hardware_setup(). */
>> How about if there are some LPs that are offline?
>> In tdx_hardware_setup(), only online LPs are initialized for TDX, right?
> Correct.
>
>
>> Then when an offline LP becomes online, it doesn't have a chance to call
>> tdx_cpu_enable()?
> KVM registers kvm_online/offline_cpu() @ kvm_main.c as cpu hotplug callbacks.
> Eventually the x86 kvm hardware_enable() is called on online/offline events.

Yes, hardware_enable() will be called when online,
but hardware_enable() now is vmx_hardware_enable(), right?
It doesn't call tdx_cpu_enable() during the online path.
Isaku Yamahata March 15, 2024, 11:25 p.m. UTC | #4
On Fri, Mar 15, 2024 at 12:44:46PM +0800,
Binbin Wu <binbin.wu@linux.intel.com> wrote:

> On 3/15/2024 12:27 AM, Isaku Yamahata wrote:
> > On Thu, Mar 14, 2024 at 10:05:35AM +0800,
> > Binbin Wu <binbin.wu@linux.intel.com> wrote:
> > 
> > > > diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> > > > index 18cecf12c7c8..18aef6e23aab 100644
> > > > --- a/arch/x86/kvm/vmx/main.c
> > > > +++ b/arch/x86/kvm/vmx/main.c
> > > > @@ -6,6 +6,22 @@
> > > >    #include "nested.h"
> > > >    #include "pmu.h"
> > > > +static bool enable_tdx __ro_after_init;
> > > > +module_param_named(tdx, enable_tdx, bool, 0444);
> > > > +
> > > > +static __init int vt_hardware_setup(void)
> > > > +{
> > > > +	int ret;
> > > > +
> > > > +	ret = vmx_hardware_setup();
> > > > +	if (ret)
> > > > +		return ret;
> > > > +
> > > > +	enable_tdx = enable_tdx && !tdx_hardware_setup(&vt_x86_ops);
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > >    #define VMX_REQUIRED_APICV_INHIBITS				\
> > > >    	(BIT(APICV_INHIBIT_REASON_DISABLE)|			\
> > > >    	 BIT(APICV_INHIBIT_REASON_ABSENT) |			\
> > > > @@ -22,6 +38,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
> > > >    	.hardware_unsetup = vmx_hardware_unsetup,
> > > > +	/* TDX cpu enablement is done by tdx_hardware_setup(). */
> > > How about if there are some LPs that are offline?
> > > In tdx_hardware_setup(), only online LPs are initialized for TDX, right?
> > Correct.
> > 
> > 
> > > Then when an offline LP becomes online, it doesn't have a chance to call
> > > tdx_cpu_enable()?
> > KVM registers kvm_online/offline_cpu() @ kvm_main.c as cpu hotplug callbacks.
> > Eventually the x86 kvm hardware_enable() is called on online/offline events.
> 
> Yes, hardware_enable() will be called when online,
> but hardware_enable() now is vmx_hardware_enable(), right?
> It doesn't call tdx_cpu_enable() during the online path.

The TDX module requires TDH.SYS.LP.INIT() on all logical processors (LPs).  If
we successfully initialized the TDX module, we don't need further action for
TDX on cpu online/offline.

If some LPs are not online when loading kvm_intel.ko, KVM fails to initialize
the TDX module and TDX support is disabled.  We don't bother to attempt it.
Leave it to the admin of the machine.
Huang, Kai March 21, 2024, 12:39 p.m. UTC | #5
On Fri, 2024-03-15 at 16:25 -0700, Isaku Yamahata wrote:
> > > > How about if there are some LPs that are offline?
> > > > In tdx_hardware_setup(), only online LPs are initialized for TDX, right?
> > > Correct.
> > > 
> > > 
> > > > Then when an offline LP becomes online, it doesn't have a chance to call
> > > > tdx_cpu_enable()?
> > > KVM registers kvm_online/offline_cpu() @ kvm_main.c as cpu hotplug callbacks.
> > > Eventually the x86 kvm hardware_enable() is called on online/offline events.
> > 
> > Yes, hardware_enable() will be called when online,
> > but hardware_enable() now is vmx_hardware_enable(), right?
> > It doesn't call tdx_cpu_enable() during the online path.
> 
> The TDX module requires TDH.SYS.LP.INIT() on all logical processors (LPs).  If
> we successfully initialized the TDX module, we don't need further action for
> TDX on cpu online/offline.
> 
> If some LPs are not online when loading kvm_intel.ko, KVM fails to initialize
> the TDX module and TDX support is disabled.  We don't bother to attempt it.
> Leave it to the admin of the machine.

No.  We have relaxed this.  Now the TDX module can be initialized on a subset of
all logical cpus, with an arbitrary number of cpus offline.

Those cpus can become online after module initialization, and TDH.SYS.LP.INIT on
them won't fail.
Huang, Kai March 21, 2024, 1:07 p.m. UTC | #6
On Mon, 2024-02-26 at 00:25 -0800, isaku.yamahata@intel.com wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> TDX requires several initialization steps for KVM to create guest TDs.
> Detect CPU feature, enable VMX (TDX is based on VMX) on all online CPUs,
> detect the TDX module availability, initialize it and disable VMX.

Before KVM can use TDX to create and run TDX guests, the kernel needs to
initialize TDX from two perspectives:

1) Initialize the TDX module.
2) Do the "per-cpu initialization" on any logical cpu before running any TDX
code on that cpu.

The host kernel provides two functions to do them respectively: tdx_enable()
and tdx_cpu_enable().

Currently, tdx_enable() requires all online cpus to be in VMX operation with CPU
hotplug disabled, and tdx_cpu_enable() needs to be called on the local cpu with
that cpu in VMX operation and IRQs disabled.
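
A minimal sketch of that calling convention (illustration only;
vmxon_and_tdx_cpu_enable() and cpu_vmxoff_wrapper() are hypothetical helpers,
not code from this patch):

	atomic_t err = ATOMIC_INIT(0);	/* hypothetical error accumulator */
	int ret = 0;

	cpus_read_lock();	/* tdx_enable() needs CPU hotplug disabled */
	/* on_each_cpu() runs the callback with IRQs disabled on each cpu,
	 * which is what tdx_cpu_enable() requires. */
	on_each_cpu(vmxon_and_tdx_cpu_enable, &err, true);
	if (!atomic_read(&err))
		ret = tdx_enable();
	on_each_cpu(cpu_vmxoff_wrapper, NULL, true);
	cpus_read_unlock();
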

> 
> To enable/disable VMX on all online CPUs, utilize
> vmx_hardware_enable/disable().  The method also initializes each CPU for
> TDX.  
> 

I don't understand what you are saying here.

Did you mean you put tdx_cpu_enable() inside vmx_hardware_enable()?

> TDX requires calling a TDX initialization function per logical
> processor (LP) before the LP uses TDX.  
> 

[...]

> When the CPU is becoming online,
> call the TDX LP initialization API.  If it fails to initialize TDX, refuse
> CPU online for simplicity instead of TDX avoiding the failed LP.

Unless I am missing something, I don't see this has been done in the code.

> 
> There are several options on when to initialize the TDX module.  A.) kernel
> module loading time, B.) the first guest TD creation time.  A.) was chosen.

A.) was chosen -> Choose A).

Describe your change in "imperative mood".

> With B.), a user may hit an error of the TDX initialization when trying to
> create the first guest TD.  The machine that fails to initialize the TDX
> module can't boot any guest TD further.  Such failure is undesirable and a
> surprise because the user expects that the machine can accommodate guest
> TD, but not.  So A.) is better than B.).
> 
> Introduce a module parameter, kvm_intel.tdx, to explicitly enable TDX KVM

You don't have to say the name of the new parameter.  It's shown in the code.

> support.  It's off by default to keep the same behavior for those who don't
> use TDX.  
> 

[...]


> Implement hardware_setup method to detect TDX feature of CPU and
> initialize TDX module.

You are not detecting TDX feature anymore.

And put this in a separate paragraph (at a better place), as I don't see how
this is connected to "introduce a module parameter".

> 
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
> v19:
> - fixed vt_hardware_enable() to use vmx_hardware_enable()
> - renamed vmx_tdx_enabled => tdx_enabled
> - renamed vmx_tdx_on() => tdx_on()
> 
> v18:
> - Added comment in vt_hardware_enable() by Binbin.
> 
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>  arch/x86/kvm/Makefile      |  1 +
>  arch/x86/kvm/vmx/main.c    | 19 ++++++++-
>  arch/x86/kvm/vmx/tdx.c     | 84 ++++++++++++++++++++++++++++++++++++++
>  arch/x86/kvm/vmx/x86_ops.h |  6 +++
>  4 files changed, 109 insertions(+), 1 deletion(-)
>  create mode 100644 arch/x86/kvm/vmx/tdx.c
> 
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index 274df24b647f..5b85ef84b2e9 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -24,6 +24,7 @@ kvm-intel-y		+= vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
>  
>  kvm-intel-$(CONFIG_X86_SGX_KVM)	+= vmx/sgx.o
>  kvm-intel-$(CONFIG_KVM_HYPERV)	+= vmx/hyperv.o vmx/hyperv_evmcs.o
> +kvm-intel-$(CONFIG_INTEL_TDX_HOST)	+= vmx/tdx.o
>  
>  kvm-amd-y		+= svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \
>  			   svm/sev.o
> diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> index 18cecf12c7c8..18aef6e23aab 100644
> --- a/arch/x86/kvm/vmx/main.c
> +++ b/arch/x86/kvm/vmx/main.c
> @@ -6,6 +6,22 @@
>  #include "nested.h"
>  #include "pmu.h"
>  
> +static bool enable_tdx __ro_after_init;
> +module_param_named(tdx, enable_tdx, bool, 0444);
> +
> +static __init int vt_hardware_setup(void)
> +{
> +	int ret;
> +
> +	ret = vmx_hardware_setup();
> +	if (ret)
> +		return ret;
> +
> +	enable_tdx = enable_tdx && !tdx_hardware_setup(&vt_x86_ops);
> +
> +	return 0;
> +}
> +
>  #define VMX_REQUIRED_APICV_INHIBITS				\
>  	(BIT(APICV_INHIBIT_REASON_DISABLE)|			\
>  	 BIT(APICV_INHIBIT_REASON_ABSENT) |			\
> @@ -22,6 +38,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
>  
>  	.hardware_unsetup = vmx_hardware_unsetup,
>  
> +	/* TDX cpu enablement is done by tdx_hardware_setup(). */

What's the point of this comment?  I don't understand it either.

>  	.hardware_enable = vmx_hardware_enable,
>  	.hardware_disable = vmx_hardware_disable,

Shouldn't you also implement vt_hardware_enable(), which also does
tdx_cpu_enable()? 

Because I don't see vmx_hardware_enable() is changed to call tdx_cpu_enable() to
make CPU hotplug work with TDX.
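
E.g., roughly (a sketch of the suggestion, not code from this patch):

	static int vt_hardware_enable(void)
	{
		int ret;

		ret = vmx_hardware_enable();
		if (ret)
			return ret;

		/* Per-cpu TDX init; needs this cpu to be in VMX operation. */
		if (enable_tdx) {
			ret = tdx_cpu_enable();
			if (ret)
				vmx_hardware_disable();
		}

		return ret;
	}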

>  	.has_emulated_msr = vmx_has_emulated_msr,
> @@ -161,7 +178,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
>  };
>  
>  struct kvm_x86_init_ops vt_init_ops __initdata = {
> -	.hardware_setup = vmx_hardware_setup,
> +	.hardware_setup = vt_hardware_setup,
>  	.handle_intel_pt_intr = NULL,
>  
>  	.runtime_ops = &vt_x86_ops,
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> new file mode 100644
> index 000000000000..43c504fb4fed
> --- /dev/null
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -0,0 +1,84 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/cpu.h>
> +
> +#include <asm/tdx.h>
> +
> +#include "capabilities.h"
> +#include "x86_ops.h"
> +#include "x86.h"
> +
> +#undef pr_fmt
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +static int __init tdx_module_setup(void)
> +{
> +	int ret;
> +
> +	ret = tdx_enable();
> +	if (ret) {
> +		pr_info("Failed to initialize TDX module.\n");

As I commented before, tdx_enable() itself will print a similar message when it
fails, so no need to print again.

> +		return ret;
> +	}
> +
> +	return 0;
> +}

That being said, I don't think tdx_module_setup() is necessary.  Just call
tdx_enable() directly.

> +
> +struct tdx_enabled {
> +	cpumask_var_t enabled;
> +	atomic_t err;
> +};

struct cpu_tdx_init_ctx {
	cpumask_var_t vmx_enabled_cpumask;
	atomic_t err;
};

?

> +
> +static void __init tdx_on(void *_enable)

tdx_on() -> cpu_tdx_init(), or cpu_tdx_on()?

> +{
> +	struct tdx_enabled *enable = _enable;
> +	int r;
> +
> +	r = vmx_hardware_enable();
> +	if (!r) {
> +		cpumask_set_cpu(smp_processor_id(), enable->enabled);
> +		r = tdx_cpu_enable();
> +	}
> +	if (r)
> +		atomic_set(&enable->err, r);
> +}
> +
> +static void __init vmx_off(void *_enabled)

cpu_vmx_off() ?

> +{
> +	cpumask_var_t *enabled = (cpumask_var_t *)_enabled;
> +
> +	if (cpumask_test_cpu(smp_processor_id(), *enabled))
> +		vmx_hardware_disable();
> +}
> +
> +int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)

Why do you need the 'x86_ops' function argument?  I don't see it being used.

> +{
> +	struct tdx_enabled enable = {
> +		.err = ATOMIC_INIT(0),
> +	};
> +	int r = 0;
> +
> +	if (!enable_ept) {
> +		pr_warn("Cannot enable TDX with EPT disabled\n");
> +		return -EINVAL;
> +	}
> +
> +	if (!zalloc_cpumask_var(&enable.enabled, GFP_KERNEL)) {
> +		r = -ENOMEM;
> +		goto out;
> +	}
> +
> +	/* tdx_enable() in tdx_module_setup() requires cpus lock. */

	/* tdx_enable() must be called with CPU hotplug disabled */

> +	cpus_read_lock();
> +	on_each_cpu(tdx_on, &enable, true); /* TDX requires vmxon. */

I don't think you need this comment _here_.

If you want to keep it, move it to tdx_on(), where the code does what this
comment says.

> +	r = atomic_read(&enable.err);
> +	if (!r)
> +		r = tdx_module_setup();
> +	else
> +		r = -EIO;
> +	on_each_cpu(vmx_off, &enable.enabled, true);
> +	cpus_read_unlock();
> +	free_cpumask_var(enable.enabled);
> +
> +out:
> +	return r;
> +}

Lastly, I think there's one problem here:

KVM actually only registers CPU hotplug callback in kvm_init(), which happens
way after tdx_hardware_setup().

What happens if any CPU goes online *BETWEEN* tdx_hardware_setup() and
kvm_init()?

Looks like we have two options:

1) move registering CPU hotplug callback before tdx_hardware_setup(), or
2) we need to disable CPU hotplug until callbacks have been registered.

Perhaps the second one is easier, because for the first one we need to make sure
the kvm_cpu_online() is ready to be called right after tdx_hardware_setup().

And no one cares if CPU hotplug is disabled during KVM module loading.

That being said, we can even just disable CPU hotplug during the entire
vt_init(), if in this way the code change is simple?

But anyway, to make this patch complete, I think you need to replace
vmx_hardware_enable() with vt_hardware_enable() and do tdx_cpu_enable() to handle
TDX vs CPU hotplug in _this_ patch.
Isaku Yamahata March 22, 2024, 6:01 p.m. UTC | #7
On Thu, Mar 21, 2024 at 12:39:34PM +0000,
"Huang, Kai" <kai.huang@intel.com> wrote:

> On Fri, 2024-03-15 at 16:25 -0700, Isaku Yamahata wrote:
> > > > > How about if there are some LPs that are offline?
> > > > > In tdx_hardware_setup(), only online LPs are initialized for TDX, right?
> > > > Correct.
> > > > 
> > > > 
> > > > > Then when an offline LP becomes online, it doesn't have a chance to call
> > > > > tdx_cpu_enable()?
> > > > KVM registers kvm_online/offline_cpu() @ kvm_main.c as cpu hotplug callbacks.
> > > > Eventually the x86 kvm hardware_enable() is called on online/offline events.
> > > 
> > > Yes, hardware_enable() will be called when online,
> > > but hardware_enable() now is vmx_hardware_enable(), right?
> > > It doesn't call tdx_cpu_enable() during the online path.
> > 
> > The TDX module requires TDH.SYS.LP.INIT() on all logical processors (LPs).  If
> > we successfully initialized the TDX module, we don't need further action for
> > TDX on cpu online/offline.
> > 
> > If some LPs are not online when loading kvm_intel.ko, KVM fails to initialize
> > the TDX module and TDX support is disabled.  We don't bother to attempt it.
> > Leave it to the admin of the machine.
> 
> No.  We have relaxed this.  Now the TDX module can be initialized on a subset of
> all logical cpus, with an arbitrary number of cpus offline.
> 
> Those cpus can become online after module initialization, and TDH.SYS.LP.INIT on
> them won't fail.

Ah, you're right. So we need to call tdx_cpu_enable() on cpu online.  For
offline, KVM has to do nothing.  It's another story to shut down the TDX module.
Isaku Yamahata March 22, 2024, 9:23 p.m. UTC | #8
On Thu, Mar 21, 2024 at 01:07:27PM +0000,
"Huang, Kai" <kai.huang@intel.com> wrote:

> On Mon, 2024-02-26 at 00:25 -0800, isaku.yamahata@intel.com wrote:
> > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > 
> > TDX requires several initialization steps for KVM to create guest TDs.
> > Detect CPU feature, enable VMX (TDX is based on VMX) on all online CPUs,
> > detect the TDX module availability, initialize it and disable VMX.
> 
> Before KVM can use TDX to create and run TDX guests, the kernel needs to
> initialize TDX from two perspectives:
> 
> 1) Initialize the TDX module.
> 2) Do the "per-cpu initialization" on any logical cpu before running any TDX
> code on that cpu.
> 
> The host kernel provides two functions to do them respectively: tdx_enable()
> and tdx_cpu_enable().
> 
> Currently, tdx_enable() requires all online cpus to be in VMX operation with CPU
> hotplug disabled, and tdx_cpu_enable() needs to be called on the local cpu with
> that cpu in VMX operation and IRQs disabled.
> 
> > 
> > To enable/disable VMX on all online CPUs, utilize
> > vmx_hardware_enable/disable().  The method also initializes each CPU for
> > TDX.  
> > 
> 
> I don't understand what you are saying here.
> 
> Did you mean you put tdx_cpu_enable() inside vmx_hardware_enable()?

Now the section doesn't make sense. Will remove it.


> > TDX requires calling a TDX initialization function per logical
> > processor (LP) before the LP uses TDX.  
> > 
> 
> [...]
> 
> > When the CPU is becoming online,
> > call the TDX LP initialization API.  If it fails to initialize TDX, refuse
> > CPU online for simplicity instead of TDX avoiding the failed LP.
> 
> Unless I am missing something, I don't see this has been done in the code.

You're right. Somehow the code was lost.  Let me revive it in the next
version.


> > There are several options on when to initialize the TDX module.  A.) kernel
> > module loading time, B.) the first guest TD creation time.  A.) was chosen.
> 
> A.) was chosen -> Choose A).
> 
> Describe your change in "imperative mood".
> 
> > With B.), a user may hit an error of the TDX initialization when trying to
> > create the first guest TD.  The machine that fails to initialize the TDX
> > module can't boot any guest TD further.  Such failure is undesirable and a
> > surprise because the user expects that the machine can accommodate guest
> > TD, but not.  So A.) is better than B.).
> > 
> > Introduce a module parameter, kvm_intel.tdx, to explicitly enable TDX KVM
> 
> You don't have to say the name of the new parameter.  It's shown in the code.
> 
> > support.  It's off by default to keep the same behavior for those who don't
> > use TDX.  
> > 
> 
> [...]
> 
> 
> > Implement hardware_setup method to detect TDX feature of CPU and
> > initialize TDX module.
> 
> You are not detecting TDX feature anymore.
> 
> And put this in a separate paragraph (at a better place), as I don't see how
> this is connected to "introduce a module parameter".

Let me update those sentences.


> > Suggested-by: Sean Christopherson <seanjc@google.com>
> > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> > ---
> > v19:
> > - fixed vt_hardware_enable() to use vmx_hardware_enable()
> > - renamed vmx_tdx_enabled => tdx_enabled
> > - renamed vmx_tdx_on() => tdx_on()
> > 
> > v18:
> > - Added comment in vt_hardware_enable() by Binbin.
> > 
> > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> > ---
> >  arch/x86/kvm/Makefile      |  1 +
> >  arch/x86/kvm/vmx/main.c    | 19 ++++++++-
> >  arch/x86/kvm/vmx/tdx.c     | 84 ++++++++++++++++++++++++++++++++++++++
> >  arch/x86/kvm/vmx/x86_ops.h |  6 +++
> >  4 files changed, 109 insertions(+), 1 deletion(-)
> >  create mode 100644 arch/x86/kvm/vmx/tdx.c
> > 
> > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > index 274df24b647f..5b85ef84b2e9 100644
> > --- a/arch/x86/kvm/Makefile
> > +++ b/arch/x86/kvm/Makefile
> > @@ -24,6 +24,7 @@ kvm-intel-y		+= vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
> >  
> >  kvm-intel-$(CONFIG_X86_SGX_KVM)	+= vmx/sgx.o
> >  kvm-intel-$(CONFIG_KVM_HYPERV)	+= vmx/hyperv.o vmx/hyperv_evmcs.o
> > +kvm-intel-$(CONFIG_INTEL_TDX_HOST)	+= vmx/tdx.o
> >  
> >  kvm-amd-y		+= svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \
> >  			   svm/sev.o
> > diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> > index 18cecf12c7c8..18aef6e23aab 100644
> > --- a/arch/x86/kvm/vmx/main.c
> > +++ b/arch/x86/kvm/vmx/main.c
> > @@ -6,6 +6,22 @@
> >  #include "nested.h"
> >  #include "pmu.h"
> >  
> > +static bool enable_tdx __ro_after_init;
> > +module_param_named(tdx, enable_tdx, bool, 0444);
> > +
> > +static __init int vt_hardware_setup(void)
> > +{
> > +	int ret;
> > +
> > +	ret = vmx_hardware_setup();
> > +	if (ret)
> > +		return ret;
> > +
> > +	enable_tdx = enable_tdx && !tdx_hardware_setup(&vt_x86_ops);
> > +
> > +	return 0;
> > +}
> > +
> >  #define VMX_REQUIRED_APICV_INHIBITS				\
> >  	(BIT(APICV_INHIBIT_REASON_DISABLE)|			\
> >  	 BIT(APICV_INHIBIT_REASON_ABSENT) |			\
> > @@ -22,6 +38,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
> >  
> >  	.hardware_unsetup = vmx_hardware_unsetup,
> >  
> > +	/* TDX cpu enablement is done by tdx_hardware_setup(). */
> 
> What's the point of this comment?  I don't understand it either.

Will delete the comment.


> >  	.hardware_enable = vmx_hardware_enable,
> >  	.hardware_disable = vmx_hardware_disable,
> 
> Shouldn't you also implement vt_hardware_enable(), which also does
> tdx_cpu_enable()? 
> 
> Because I don't see vmx_hardware_enable() is changed to call tdx_cpu_enable() to
> make CPU hotplug work with TDX.

hardware_enable() doesn't help with cpu hotplug support. See below.


> >  	.has_emulated_msr = vmx_has_emulated_msr,
> > @@ -161,7 +178,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
> >  };
> >  
> >  struct kvm_x86_init_ops vt_init_ops __initdata = {
> > -	.hardware_setup = vmx_hardware_setup,
> > +	.hardware_setup = vt_hardware_setup,
> >  	.handle_intel_pt_intr = NULL,
> >  
> >  	.runtime_ops = &vt_x86_ops,
> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > new file mode 100644
> > index 000000000000..43c504fb4fed
> > --- /dev/null
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -0,0 +1,84 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +#include <linux/cpu.h>
> > +
> > +#include <asm/tdx.h>
> > +
> > +#include "capabilities.h"
> > +#include "x86_ops.h"
> > +#include "x86.h"
> > +
> > +#undef pr_fmt
> > +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> > +
> > +static int __init tdx_module_setup(void)
> > +{
> > +	int ret;
> > +
> > +	ret = tdx_enable();
> > +	if (ret) {
> > +		pr_info("Failed to initialize TDX module.\n");
> 
> As I commented before, tdx_enable() itself will print a similar message when it
> fails, so no need to print again.
> 
> > +		return ret;
> > +	}
> > +
> > +	return 0;
> > +}
> 
> That being said, I don't think tdx_module_setup() is necessary.  Just call
> tdx_enable() directly.

Ok, will move this function to the patch that uses it first.


> > +
> > +struct tdx_enabled {
> > +	cpumask_var_t enabled;
> > +	atomic_t err;
> > +};
> 
> struct cpu_tdx_init_ctx {
> 	cpumask_var_t vmx_enabled_cpumask;
> 	atomic_t err;
> };
> 
> ?
> 
> > +
> > +static void __init tdx_on(void *_enable)
> 
> tdx_on() -> cpu_tdx_init(), or cpu_tdx_on()?
> 
> > +{
> > +	struct tdx_enabled *enable = _enable;
> > +	int r;
> > +
> > +	r = vmx_hardware_enable();
> > +	if (!r) {
> > +		cpumask_set_cpu(smp_processor_id(), enable->enabled);
> > +		r = tdx_cpu_enable();
> > +	}
> > +	if (r)
> > +		atomic_set(&enable->err, r);
> > +}
> > +
> > +static void __init vmx_off(void *_enabled)
> 
> cpu_vmx_off() ?

Ok, let's add cpu_ prefix.


> > +{
> > +	cpumask_var_t *enabled = (cpumask_var_t *)_enabled;
> > +
> > +	if (cpumask_test_cpu(smp_processor_id(), *enabled))
> > +		vmx_hardware_disable();
> > +}
> > +
> > +int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
> 
> Why do you need the 'x86_ops' function argument?  I don't see it being used.

Will move it to the patch that uses it first.


> > +{
> > +	struct tdx_enabled enable = {
> > +		.err = ATOMIC_INIT(0),
> > +	};
> > +	int r = 0;
> > +
> > +	if (!enable_ept) {
> > +		pr_warn("Cannot enable TDX with EPT disabled\n");
> > +		return -EINVAL;
> > +	}
> > +
> > +	if (!zalloc_cpumask_var(&enable.enabled, GFP_KERNEL)) {
> > +		r = -ENOMEM;
> > +		goto out;
> > +	}
> > +
> > +	/* tdx_enable() in tdx_module_setup() requires cpus lock. */
> 
> 	/* tdx_enable() must be called with CPU hotplug disabled */
> 
> > +	cpus_read_lock();
> > +	on_each_cpu(tdx_on, &enable, true); /* TDX requires vmxon. */
> 
> I don't think you need this comment _here_.
> 
> If you want to keep it, move it to tdx_on(), where the code does what this
> comment says.

Will move the comment into cpu_tdx_on().


> > +	r = atomic_read(&enable.err);
> > +	if (!r)
> > +		r = tdx_module_setup();
> > +	else
> > +		r = -EIO;
> > +	on_each_cpu(vmx_off, &enable.enabled, true);
> > +	cpus_read_unlock();
> > +	free_cpumask_var(enable.enabled);
> > +
> > +out:
> > +	return r;
> > +}
> 
> Lastly, I think there's one problem here:
> 
> KVM actually only registers CPU hotplug callback in kvm_init(), which happens
> way after tdx_hardware_setup().
> 
> What happens if any CPU goes online *BETWEEN* tdx_hardware_setup() and
> kvm_init()?
> 
> Looks like we have two options:
> 
> 1) move registering CPU hotplug callback before tdx_hardware_setup(), or
> 2) we need to disable CPU hotplug until callbacks have been registered.
> 
> Perhaps the second one is easier, because for the first one we need to make sure
> the kvm_cpu_online() is ready to be called right after tdx_hardware_setup().
> 
> And no one cares if CPU hotplug is disabled during KVM module loading.
> 
> That being said, we can even just disable CPU hotplug during the entire
> vt_init(), if in this way the code change is simple?
> 
> But anyway, to make this patch complete, I think you need to replace
> vmx_hardware_enable() with vt_hardware_enable() and do tdx_cpu_enable() to handle
> TDX vs CPU hotplug in _this_ patch.

Option 2 sounds easier. But hardware_enable() doesn't help because it's
called when the first guest is created. It's risky to change its semantics
because it's an arch-independent callback.

- Disable CPU hotplug during TDX module initialization.
- During hardware_setup(), enable VMX, tdx_cpu_enable(), and disable VMX
  on each online cpu. Don't rely on KVM hooks.
- Add a new arch-independent hook, int kvm_arch_online_cpu(). It's always called
  on cpu onlining. It eventually calls tdx_cpu_enable(). If it fails, refuse
  onlining.
Huang, Kai April 9, 2024, 12:37 a.m. UTC | #9
On 26/02/2024 9:25 pm, Yamahata, Isaku wrote:
> +struct tdx_enabled {
> +	cpumask_var_t enabled;
> +	atomic_t err;
> +};
> +
> +static void __init tdx_on(void *_enable)
> +{
> +	struct tdx_enabled *enable = _enable;
> +	int r;
> +
> +	r = vmx_hardware_enable();
> +	if (!r) {
> +		cpumask_set_cpu(smp_processor_id(), enable->enabled);
> +		r = tdx_cpu_enable();
> +	}
> +	if (r)
> +		atomic_set(&enable->err, r);
> +}
> +
> +static void __init vmx_off(void *_enabled)
> +{
> +	cpumask_var_t *enabled = (cpumask_var_t *)_enabled;
> +
> +	if (cpumask_test_cpu(smp_processor_id(), *enabled))
> +		vmx_hardware_disable();
> +}
> +
> +int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
> +{
> +	struct tdx_enabled enable = {
> +		.err = ATOMIC_INIT(0),
> +	};
> +	int r = 0;
> +
> +	if (!enable_ept) {
> +		pr_warn("Cannot enable TDX with EPT disabled\n");
> +		return -EINVAL;
> +	}
> +
> +	if (!zalloc_cpumask_var(&enable.enabled, GFP_KERNEL)) {
> +		r = -ENOMEM;
> +		goto out;
> +	}
> +
> +	/* tdx_enable() in tdx_module_setup() requires cpus lock. */
> +	cpus_read_lock();
> +	on_each_cpu(tdx_on, &enable, true); /* TDX requires vmxon. */
> +	r = atomic_read(&enable.err);
> +	if (!r)
> +		r = tdx_module_setup();
> +	else
> +		r = -EIO;

I was wondering why we need to convert to -EIO.

Converting the error code to -EIO unconditionally causes the original
error code to be lost.  Although, given tdx_on() is called on all online
cpus in parallel, @enable.err can be imprecise anyway, the
explicit conversion doesn't seem reasonable to do _here_.

I think it would be more reasonable to explicitly set the error code to
-EIO in tdx_on(), where we know _exactly_ what went wrong and can still
possibly do something before losing the error code.

E.g., we could dump the error code to the user, but it looks like both
vmx_hardware_enable() and tdx_cpu_enable() already do so, so we can
safely lose the error code there.

We can perhaps add a comment to point this out before losing the error 
code if that's better:

	/*
	 * Both vmx_hardware_enable() and tdx_cpu_enable() print an error
	 * message when they fail.  Just convert the error code to -EIO:
	 * when multiple cpus fail, @err cannot be used to precisely
	 * record the error code for them anyway.
	 */

> +	on_each_cpu(vmx_off, &enable.enabled, true);
> +	cpus_read_unlock();
> +	free_cpumask_var(enable.enabled);
> +
> +out:
> +	return r;
> +}
Huang, Kai April 10, 2024, 1:12 p.m. UTC | #10
On Fri, 2024-03-22 at 14:23 -0700, Isaku Yamahata wrote:
> > > +	r = atomic_read(&enable.err);
> > > +	if (!r)
> > > +		r = tdx_module_setup();
> > > +	else
> > > +		r = -EIO;
> > > +	on_each_cpu(vmx_off, &enable.enabled, true);
> > > +	cpus_read_unlock();
> > > +	free_cpumask_var(enable.enabled);
> > > +
> > > +out:
> > > +	return r;
> > > +}
> > 
> > Lastly, I think there's one problem here:
> > 
> > KVM actually only registers CPU hotplug callback in kvm_init(), which happens
> > way after tdx_hardware_setup().
> > 
> > What happens if any CPU goes online *BETWEEN* tdx_hardware_setup() and
> > kvm_init()?
> > 
> > Looks like we have two options:
> > 
> > 1) move registering CPU hotplug callback before tdx_hardware_setup(), or
> > 2) we need to disable CPU hotplug until callbacks have been registered.
> > 
> > Perhaps the second one is easier, because for the first one we need to make sure
> > the kvm_cpu_online() is ready to be called right after tdx_hardware_setup().
> > 
> > And no one cares if CPU hotplug is disabled during KVM module loading.
> > 
> > That being said, we can even just disable CPU hotplug during the entire
> > vt_init(), if in this way the code change is simple?
> > 
> > But anyway, to make this patch complete, I think you need to replace
> > vmx_hardware_enable() with vt_hardware_enable() and do tdx_cpu_enable() to handle
> > TDX vs CPU hotplug in _this_ patch.
> 
> Option 2 sounds easier. But hardware_enable() doesn't help because it's
> called when the first guest is created. It's risky to change its semantics
> because it's an arch-independent callback.
> 
> - Disable CPU hotplug during TDX module initialization.

As we talked, it turns out it is problematic to do so, because cpus_read_lock()
is also called by some other internal functions like static_call_update().  If
we take cpus_read_lock() for the entire vt_init() then we will have nested
cpus_read_lock().

> - During hardware_setup(), enable VMX, tdx_cpu_enable(), and disable VMX
>   on each online cpu. Don't rely on KVM hooks.
> - Add a new arch-independent hook, int kvm_arch_online_cpu(). It's always called
>   on cpu onlining. It eventually calls tdx_cpu_enable(). If it fails, refuse
>   onlining.

So the purpose of kvm_arch_online_cpu() is to always do "VMXON +
tdx_cpu_enable() + VMXOFF" _regardless_ of the kvm_usage_count, so that we can
make sure that:

When TDX is enabled by KVM, all online cpus are TDX-capable (have done
tdx_cpu_enable() successfully).

And the code will be like:

	static int kvm_online_cpu(unsigned int cpu)
	{
		int ret;

		mutex_lock(&kvm_lock);
		ret = kvm_arch_online_cpu(cpu);
		if (!ret && kvm_usage_count)
			ret = __hardware_enable_nolock();
		mutex_unlock(&kvm_lock);
		return ret;
	}

This will need another kvm_x86_ops->online_cpu() where we can implement the TDX
specific "VMXON + tdx_cpu_enable() + VMXOFF":

	int kvm_arch_online_cpu(unsigned int cpu)
	{
		return static_call(kvm_x86_online_cpu)(cpu);
	}

Somehow I don't quite like this because: 1) it introduces a new
kvm_x86_ops->online_cpu(); 2) it's a little bit silly to do "VMXON +
tdx_cpu_enable() + VMXOFF" just for TDX and then immediately do VMXON when
there's KVM usage.

And IIUC, it will NOT work if kvm_online_cpu() happens when kvm_usage_count > 0:
VMXON has actually already been done on this cpu, so the "VMXON" before
tdx_cpu_enable() will fail.  Probably this can be addressed somehow, but it
still doesn't seem nice.

So the above "option 1" doesn't seem right to me.

After thinking again, I think we have been too nervous about missing CPU
hotplug events "between tdx_enable() and kvm_init(), and when there's no KVM
usage".

Instead, I think it's acceptable that we don't do tdx_cpu_enable() for a new
CPU when it is hotplugged in the above two cases.

We just need to guarantee that all online cpus are TDX-capable "when there's
real KVM usage, i.e., there's a real VM running".

So "option 2":

I believe we just need to do tdx_cpu_enable() in vt_hardware_enable() 
after vmx_hardware_enable():

1) When the first VM is created, KVM will try to do tdx_cpu_enable() for
those CPUs that became online after tdx_enable(), and if any
tdx_cpu_enable() fails, the VM will not be created.  Otherwise, all
online cpus are TDX-capable.

2) When there's a real VM running, and a new CPU successfully becomes
online, it must be TDX-capable.

Failure of tdx_cpu_enable() in 2) is obviously fine.

The consequence of failing tdx_cpu_enable() in 1) is that, besides the
VM not being created, there might be some online CPUs that are not
TDX-capable while TDX is marked as enabled.

It's fine from KVM's perspective, because literally no VM is running.

From the host kernel's perspective, the only tricky thing is the #MC handler.
It uses a SEAMCALL to read the faulty page's status to determine
whether it is a TDX private page.  If a #MC happens on one of those
non-TDX-capable cpus, the SEAMCALL will fail.  But that is also
fine as we don't need a precise result anyway.

Option 3:

We want still to make sure our goal: 

When TDX is enabled by KVM, all online cpus are TDX-capable.

For that, we can register an *additional* TDX specific CPU hotplug callback
right after tdx_enable() to handle any CPU hotplug "between tdx_enable() and
kvm_init(), and when there's no KVM usage".

Specifically, we can use dynamically allocated CPU hotplug state to avoid having
to hard-code another KVM-TDX-specific CPU hotplug callback state:

	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_DYN, "kvm/cpu/tdx:online",        
                                      tdx_online_cpu, NULL);

In tdx_online_cpu(), we do tdx_cpu_enable() if enable_tdx is true.
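
E.g., roughly (sketch; it glosses over the VMX-operation requirement
discussed below):

	static int tdx_online_cpu(unsigned int cpu)
	{
		unsigned long flags;
		int r;

		if (!enable_tdx)
			return 0;

		/* tdx_cpu_enable() requires IRQs off, and the cpu must
		 * already be in VMX operation -- the problem below. */
		local_irq_save(flags);
		r = tdx_cpu_enable();
		local_irq_restore(flags);

		return r;
	}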

The problem is that if tdx_online_cpu() runs when there's already KVM usage,
kvm_online_cpu() has already done VMXON.  So tdx_online_cpu() would need to grab
the @kvm_lock mutex and check @kvm_usage_count to determine whether to do VMXON
before tdx_cpu_enable().

However, both the @kvm_lock mutex and @kvm_usage_count are in kvm.ko, and it's
not nice to export such low-level things to kvm-intel.ko for TDX.

Option 4:

To avoid exporting @kvm_lock and @kvm_usage_count, we can still register the
TDX-specific CPU hotplug callback, but choose to do "unconditional
tdx_cpu_enable()" w/o doing VMXON in tdx_online_cpu().  If that fails due to
VMXON hasn't been done, let it fail.  This basically means:

When TDX is enabled, KVM can only online a CPU when there's a running VM.

To summarize:

I think "option 2" should be the best solution for now.  It's easy to implement
and yet has no real issue.

Any comments?
Sean Christopherson April 10, 2024, 3:29 p.m. UTC | #11
On Wed, Apr 10, 2024, Kai Huang wrote:
> On Fri, 2024-03-22 at 14:23 -0700, Isaku Yamahata wrote:
> > > > +	r = atomic_read(&enable.err);
> > > > +	if (!r)
> > > > +		r = tdx_module_setup();
> > > > +	else
> > > > +		r = -EIO;
> > > > +	on_each_cpu(vmx_off, &enable.enabled, true);
> > > > +	cpus_read_unlock();
> > > > +	free_cpumask_var(enable.enabled);
> > > > +
> > > > +out:
> > > > +	return r;
> > > > +}
> > > 
> > > Lastly, I think there's one problem here:
> > > 
> > > KVM actually only registers CPU hotplug callback in kvm_init(), which happens
> > > way after tdx_hardware_setup().
> > > 
> > > What happens if any CPU goes online *BETWEEN* tdx_hardware_setup() and
> > > kvm_init()?
> > > 
> > > Looks like we have two options:
> > > 
> > > 1) move registering CPU hotplug callback before tdx_hardware_setup(), or
> > > 2) we need to disable CPU hotplug until callbacks have been registered.

This is all so dumb (not TDX, the current state of KVM).  All of the hardware
enabling crud is pointlessly complex, inherited from misguided, decade-old paranoia
that led to the decision to enable VMX if and only if VMs are running.  Enabling
VMX doesn't make the system less secure, and the insane dances we are doing to
do VMXON on-demand make everything *more* fragile.

And all of this complexity really was driven by VMX; enabling virtualization for
every other vendor, including AMD/SVM, is completely uninteresting.  Forcing other
architectures/vendors to take on yet more complexity doesn't make any sense.

Barely tested, and other architectures would need to be converted, but I don't
see any obvious reasons why we can't simply enable virtualization when the module
is loaded.

The diffstat pretty much says it all.

---
 Documentation/virt/kvm/locking.rst |   4 -
 arch/x86/include/asm/kvm_host.h    |   3 +
 arch/x86/kvm/svm/svm.c             |   5 +-
 arch/x86/kvm/vmx/vmx.c             |  18 ++-
 arch/x86/kvm/x86.c                 |  22 ++--
 include/linux/kvm_host.h           |   2 +
 virt/kvm/kvm_main.c                | 181 +++++++----------------------
 7 files changed, 67 insertions(+), 168 deletions(-)

diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst
index 02880d5552d5..0d6eff13fd46 100644
--- a/Documentation/virt/kvm/locking.rst
+++ b/Documentation/virt/kvm/locking.rst
@@ -227,10 +227,6 @@ time it will be set using the Dirty tracking mechanism described above.
 :Type:		mutex
 :Arch:		any
 :Protects:	- vm_list
-		- kvm_usage_count
-		- hardware virtualization enable/disable
-:Comment:	KVM also disables CPU hotplug via cpus_read_lock() during
-		enable/disable.
 
 ``kvm->mn_invalidate_lock``
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 73740d698ebe..7422239987d8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -36,6 +36,7 @@
 #include <asm/kvm_page_track.h>
 #include <asm/kvm_vcpu_regs.h>
 #include <asm/hyperv-tlfs.h>
+#include <asm/reboot.h>
 
 #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
 
@@ -1605,6 +1606,8 @@ struct kvm_x86_ops {
 
 	int (*hardware_enable)(void);
 	void (*hardware_disable)(void);
+	cpu_emergency_virt_cb *emergency_disable;
+
 	void (*hardware_unsetup)(void);
 	bool (*has_emulated_msr)(struct kvm *kvm, u32 index);
 	void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 9aaf83c8d57d..7e118284934c 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4917,6 +4917,7 @@ static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu)
 static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.name = KBUILD_MODNAME,
 
+	.emergency_disable = svm_emergency_disable,
 	.check_processor_compatibility = svm_check_processor_compat,
 
 	.hardware_unsetup = svm_hardware_unsetup,
@@ -5348,8 +5349,6 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = {
 static void __svm_exit(void)
 {
 	kvm_x86_vendor_exit();
-
-	cpu_emergency_unregister_virt_callback(svm_emergency_disable);
 }
 
 static int __init svm_init(void)
@@ -5365,8 +5364,6 @@ static int __init svm_init(void)
 	if (r)
 		return r;
 
-	cpu_emergency_register_virt_callback(svm_emergency_disable);
-
 	/*
 	 * Common KVM initialization _must_ come last, after this, /dev/kvm is
 	 * exposed to userspace!
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d18dcb1e11a6..0dbe74da7ee3 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -8320,6 +8320,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 
 	.hardware_enable = vmx_hardware_enable,
 	.hardware_disable = vmx_hardware_disable,
+	.emergency_disable = vmx_emergency_disable,
+
 	.has_emulated_msr = vmx_has_emulated_msr,
 
 	.vm_size = sizeof(struct kvm_vmx),
@@ -8733,8 +8735,6 @@ static void __vmx_exit(void)
 {
 	allow_smaller_maxphyaddr = false;
 
-	cpu_emergency_unregister_virt_callback(vmx_emergency_disable);
-
 	vmx_cleanup_l1d_flush();
 }
 
@@ -8760,6 +8760,12 @@ static int __init vmx_init(void)
 	 */
 	hv_init_evmcs();
 
+	for_each_possible_cpu(cpu) {
+		INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+
+		pi_init_cpu(cpu);
+	}
+
 	r = kvm_x86_vendor_init(&vmx_init_ops);
 	if (r)
 		return r;
@@ -8775,14 +8781,6 @@ static int __init vmx_init(void)
 	if (r)
 		goto err_l1d_flush;
 
-	for_each_possible_cpu(cpu) {
-		INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
-
-		pi_init_cpu(cpu);
-	}
-
-	cpu_emergency_register_virt_callback(vmx_emergency_disable);
-
 	vmx_check_vmcs12_offsets();
 
 	/*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 26288ca05364..41d3e4e32e20 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9687,15 +9687,10 @@ static int kvm_x86_check_processor_compatibility(void)
 	return static_call(kvm_x86_check_processor_compatibility)();
 }
 
-static void kvm_x86_check_cpu_compat(void *ret)
-{
-	*(int *)ret = kvm_x86_check_processor_compatibility();
-}
-
 int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 {
 	u64 host_pat;
-	int r, cpu;
+	int r;
 
 	guard(mutex)(&vendor_module_lock);
 
@@ -9771,11 +9766,11 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 
 	kvm_ops_update(ops);
 
-	for_each_online_cpu(cpu) {
-		smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &r, 1);
-		if (r < 0)
-			goto out_unwind_ops;
-	}
+	cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable);
+
+	r = kvm_enable_virtualization();
+	if (r)
+		goto out_unwind_ops;
 
 	/*
 	 * Point of no return!  DO NOT add error paths below this point unless
@@ -9818,6 +9813,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 	return 0;
 
 out_unwind_ops:
+	cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable);
 	kvm_x86_ops.hardware_enable = NULL;
 	static_call(kvm_x86_hardware_unsetup)();
 out_mmu_exit:
@@ -9858,6 +9854,10 @@ void kvm_x86_vendor_exit(void)
 	static_key_deferred_flush(&kvm_xen_enabled);
 	WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
 #endif
+
+	kvm_disable_virtualization();
+	cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable);
+
 	mutex_lock(&vendor_module_lock);
 	kvm_x86_ops.hardware_enable = NULL;
 	mutex_unlock(&vendor_module_lock);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 48f31dcd318a..92da2eee7448 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1518,6 +1518,8 @@ static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {}
 #endif
 
 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
+int kvm_enable_virtualization(void);
+void kvm_disable_virtualization(void);
 int kvm_arch_hardware_enable(void);
 void kvm_arch_hardware_disable(void);
 #endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f345dc15854f..326e3225c052 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -139,8 +139,6 @@ static int kvm_no_compat_open(struct inode *inode, struct file *file)
 #define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
 			.open		= kvm_no_compat_open
 #endif
-static int hardware_enable_all(void);
-static void hardware_disable_all(void);
 
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 
@@ -1261,10 +1259,6 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 	if (r)
 		goto out_err_no_arch_destroy_vm;
 
-	r = hardware_enable_all();
-	if (r)
-		goto out_err_no_disable;
-
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
 	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
 #endif
@@ -1304,8 +1298,6 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 		mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
 #endif
 out_err_no_mmu_notifier:
-	hardware_disable_all();
-out_err_no_disable:
 	kvm_arch_destroy_vm(kvm);
 out_err_no_arch_destroy_vm:
 	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
@@ -1393,7 +1385,6 @@ static void kvm_destroy_vm(struct kvm *kvm)
 #endif
 	kvm_arch_free_vm(kvm);
 	preempt_notifier_dec();
-	hardware_disable_all();
 	mmdrop(mm);
 }
 
@@ -5536,9 +5527,8 @@ __visible bool kvm_rebooting;
 EXPORT_SYMBOL_GPL(kvm_rebooting);
 
 static DEFINE_PER_CPU(bool, hardware_enabled);
-static int kvm_usage_count;
 
-static int __hardware_enable_nolock(void)
+static int __kvm_enable_virtualization(void)
 {
 	if (__this_cpu_read(hardware_enabled))
 		return 0;
@@ -5553,34 +5543,18 @@ static int __hardware_enable_nolock(void)
 	return 0;
 }
 
-static void hardware_enable_nolock(void *failed)
-{
-	if (__hardware_enable_nolock())
-		atomic_inc(failed);
-}
-
 static int kvm_online_cpu(unsigned int cpu)
 {
-	int ret = 0;
-
 	/*
 	 * Abort the CPU online process if hardware virtualization cannot
 	 * be enabled. Otherwise running VMs would encounter unrecoverable
 	 * errors when scheduled to this CPU.
 	 */
-	mutex_lock(&kvm_lock);
-	if (kvm_usage_count)
-		ret = __hardware_enable_nolock();
-	mutex_unlock(&kvm_lock);
-	return ret;
+	return __kvm_enable_virtualization();
 }
 
-static void hardware_disable_nolock(void *junk)
+static void __kvm_disable_virtualization(void *ign)
 {
-	/*
-	 * Note, hardware_disable_all_nolock() tells all online CPUs to disable
-	 * hardware, not just CPUs that successfully enabled hardware!
-	 */
 	if (!__this_cpu_read(hardware_enabled))
 		return;
 
@@ -5591,78 +5565,10 @@ static void hardware_disable_nolock(void *junk)
 
 static int kvm_offline_cpu(unsigned int cpu)
 {
-	mutex_lock(&kvm_lock);
-	if (kvm_usage_count)
-		hardware_disable_nolock(NULL);
-	mutex_unlock(&kvm_lock);
+	__kvm_disable_virtualization(NULL);
 	return 0;
 }
 
-static void hardware_disable_all_nolock(void)
-{
-	BUG_ON(!kvm_usage_count);
-
-	kvm_usage_count--;
-	if (!kvm_usage_count)
-		on_each_cpu(hardware_disable_nolock, NULL, 1);
-}
-
-static void hardware_disable_all(void)
-{
-	cpus_read_lock();
-	mutex_lock(&kvm_lock);
-	hardware_disable_all_nolock();
-	mutex_unlock(&kvm_lock);
-	cpus_read_unlock();
-}
-
-static int hardware_enable_all(void)
-{
-	atomic_t failed = ATOMIC_INIT(0);
-	int r;
-
-	/*
-	 * Do not enable hardware virtualization if the system is going down.
-	 * If userspace initiated a forced reboot, e.g. reboot -f, then it's
-	 * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
-	 * after kvm_reboot() is called.  Note, this relies on system_state
-	 * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
-	 * hook instead of registering a dedicated reboot notifier (the latter
-	 * runs before system_state is updated).
-	 */
-	if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
-	    system_state == SYSTEM_RESTART)
-		return -EBUSY;
-
-	/*
-	 * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
-	 * is called, and so on_each_cpu() between them includes the CPU that
-	 * is being onlined.  As a result, hardware_enable_nolock() may get
-	 * invoked before kvm_online_cpu(), which also enables hardware if the
-	 * usage count is non-zero.  Disable CPU hotplug to avoid attempting to
-	 * enable hardware multiple times.
-	 */
-	cpus_read_lock();
-	mutex_lock(&kvm_lock);
-
-	r = 0;
-
-	kvm_usage_count++;
-	if (kvm_usage_count == 1) {
-		on_each_cpu(hardware_enable_nolock, &failed, 1);
-
-		if (atomic_read(&failed)) {
-			hardware_disable_all_nolock();
-			r = -EBUSY;
-		}
-	}
-
-	mutex_unlock(&kvm_lock);
-	cpus_read_unlock();
-
-	return r;
-}
-
 static void kvm_shutdown(void)
 {
 	/*
@@ -5678,34 +5584,22 @@ static void kvm_shutdown(void)
 	 */
 	pr_info("kvm: exiting hardware virtualization\n");
 	kvm_rebooting = true;
-	on_each_cpu(hardware_disable_nolock, NULL, 1);
+	on_each_cpu(__kvm_disable_virtualization, NULL, 1);
 }
 
 static int kvm_suspend(void)
 {
-	/*
-	 * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
-	 * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count
-	 * is stable.  Assert that kvm_lock is not held to ensure the system
-	 * isn't suspended while KVM is enabling hardware.  Hardware enabling
-	 * can be preempted, but the task cannot be frozen until it has dropped
-	 * all locks (userspace tasks are frozen via a fake signal).
-	 */
-	lockdep_assert_not_held(&kvm_lock);
 	lockdep_assert_irqs_disabled();
 
-	if (kvm_usage_count)
-		hardware_disable_nolock(NULL);
+	__kvm_disable_virtualization(NULL);
 	return 0;
 }
 
 static void kvm_resume(void)
 {
-	lockdep_assert_not_held(&kvm_lock);
 	lockdep_assert_irqs_disabled();
 
-	if (kvm_usage_count)
-		WARN_ON_ONCE(__hardware_enable_nolock());
+	WARN_ON_ONCE(__kvm_enable_virtualization());
 }
 
 static struct syscore_ops kvm_syscore_ops = {
@@ -5713,16 +5607,45 @@ static struct syscore_ops kvm_syscore_ops = {
 	.resume = kvm_resume,
 	.shutdown = kvm_shutdown,
 };
-#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
-static int hardware_enable_all(void)
+
+int kvm_enable_virtualization(void)
 {
+	int r;
+
+	r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
+			      kvm_online_cpu, kvm_offline_cpu);
+	if (r)
+		return r;
+
+	register_syscore_ops(&kvm_syscore_ops);
+
+	/*
+	 * Manually undo virtualization enabling if the system is going down.
+	 * If userspace initiated a forced reboot, e.g. reboot -f, then it's
+	 * possible for an in-flight module load to enable virtualization
+	 * after syscore_shutdown() is called, i.e. without kvm_shutdown()
+	 * being invoked.  Note, this relies on system_state being set _before_
+	 * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked
+	 * or this CPU observes the impending shutdown.  Which is why KVM uses
+	 * a syscore ops hook instead of registering a dedicated reboot
+	 * notifier (the latter runs before system_state is updated).
+	 */
+	if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
+	    system_state == SYSTEM_RESTART) {
+		unregister_syscore_ops(&kvm_syscore_ops);
+		cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
+		return -EBUSY;
+	}
+
 	return 0;
 }
 
-static void hardware_disable_all(void)
+void kvm_disable_virtualization(void)
 {
-
+	unregister_syscore_ops(&kvm_syscore_ops);
+	cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
 }
+
 #endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
 
 static void kvm_iodevice_destructor(struct kvm_io_device *dev)
@@ -6418,15 +6341,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 	int r;
 	int cpu;
 
-#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
-	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
-				      kvm_online_cpu, kvm_offline_cpu);
-	if (r)
-		return r;
-
-	register_syscore_ops(&kvm_syscore_ops);
-#endif
-
 	/* A kmem cache lets us meet the alignment requirements of fx_save. */
 	if (!vcpu_align)
 		vcpu_align = __alignof__(struct kvm_vcpu);
@@ -6437,10 +6351,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 					   offsetofend(struct kvm_vcpu, stats_id)
 					   - offsetof(struct kvm_vcpu, arch),
 					   NULL);
-	if (!kvm_vcpu_cache) {
-		r = -ENOMEM;
-		goto err_vcpu_cache;
-	}
+	if (!kvm_vcpu_cache)
+		return -ENOMEM;
 
 	for_each_possible_cpu(cpu) {
 		if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
@@ -6497,11 +6409,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 	for_each_possible_cpu(cpu)
 		free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
 	kmem_cache_destroy(kvm_vcpu_cache);
-err_vcpu_cache:
-#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
-	unregister_syscore_ops(&kvm_syscore_ops);
-	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
-#endif
 	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_init);
@@ -6523,10 +6430,6 @@ void kvm_exit(void)
 	kmem_cache_destroy(kvm_vcpu_cache);
 	kvm_vfio_ops_exit();
 	kvm_async_pf_deinit();
-#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
-	unregister_syscore_ops(&kvm_syscore_ops);
-	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
-#endif
 	kvm_irqfd_exit();
 }
 EXPORT_SYMBOL_GPL(kvm_exit);

base-commit: f10f3621ad80f008c218dbbc13a05c893766a7d2
--
Huang, Kai April 10, 2024, 11:15 p.m. UTC | #12
On 11/04/2024 3:29 am, Sean Christopherson wrote:
> On Wed, Apr 10, 2024, Kai Huang wrote:
>> On Fri, 2024-03-22 at 14:23 -0700, Isaku Yamahata wrote:
>>>>> +	r = atomic_read(&enable.err);
>>>>> +	if (!r)
>>>>> +		r = tdx_module_setup();
>>>>> +	else
>>>>> +		r = -EIO;
>>>>> +	on_each_cpu(vmx_off, &enable.enabled, true);
>>>>> +	cpus_read_unlock();
>>>>> +	free_cpumask_var(enable.enabled);
>>>>> +
>>>>> +out:
>>>>> +	return r;
>>>>> +}
>>>>
>>>> At last, I think there's one problem here:
>>>>
>>>> KVM actually only registers CPU hotplug callback in kvm_init(), which happens
>>>> way after tdx_hardware_setup().
>>>>
>>>> What happens if any CPU goes online *BETWEEN* tdx_hardware_setup() and
>>>> kvm_init()?
>>>>
>>>> Looks we have two options:
>>>>
>>>> 1) move registering CPU hotplug callback before tdx_hardware_setup(), or
>>>> 2) we need to disable CPU hotplug until callbacks have been registered.
> 
> This is all so dumb (not TDX, the current state of KVM).  All of the hardware
> enabling crud is pointlessly complex, inherited from misguided, decade-old paranoia
> that led to the decision to enable VMX if and only if VMs are running.  Enabling
> VMX doesn't make the system less secure, and the insane dances we are doing to
> do VMXON on-demand makes everything *more* fragile.
> 
> And all of this complexity really was driven by VMX, enabling virtualization for
> every other vendor, including AMD/SVM, is completely uninteresting.  Forcing other
> architectures/vendors to take on yet more complexity doesn't make any sense.

Ah, I actually preferred this solution, but I was trying to follow your 
suggestion here:

https://lore.kernel.org/lkml/ZW6FRBnOwYV-UCkY@google.com/

from which I interpreted you didn't like always having VMX enabled when 
KVM is present. :-)

> 
> Barely tested, and other architectures would need to be converted, but I don't
> see any obvious reasons why we can't simply enable virtualization when the module
> is loaded.
> 
> The diffstat pretty much says it all.

Thanks a lot for the code!

I can certainly follow up with this and generate a reviewable patchset 
if I can confirm with you that this is what you want?
Sean Christopherson April 11, 2024, 2:03 p.m. UTC | #13
On Thu, Apr 11, 2024, Kai Huang wrote:
> On 11/04/2024 3:29 am, Sean Christopherson wrote:
> > On Wed, Apr 10, 2024, Kai Huang wrote:
> > > > > What happens if any CPU goes online *BETWEEN* tdx_hardware_setup() and
> > > > > kvm_init()?
> > > > > 
> > > > > Looks we have two options:
> > > > > 
> > > > > 1) move registering CPU hotplug callback before tdx_hardware_setup(), or
> > > > > 2) we need to disable CPU hotplug until callbacks have been registered.
> > 
> > This is all so dumb (not TDX, the current state of KVM).  All of the hardware
> > enabling crud is pointlessly complex, inherited from misguided, decade-old paranoia
> > that led to the decision to enable VMX if and only if VMs are running.  Enabling
> > VMX doesn't make the system less secure, and the insane dances we are doing to
> > do VMXON on-demand makes everything *more* fragile.
> > 
> > And all of this complexity really was driven by VMX, enabling virtualization for
> > every other vendor, including AMD/SVM, is completely uninteresting.  Forcing other
> > architectures/vendors to take on yet more complexity doesn't make any sense.
> 
> Ah, I actually preferred this solution, but I was trying to follow your
> suggestion here:
> 
> https://lore.kernel.org/lkml/ZW6FRBnOwYV-UCkY@google.com/
> 
> from which I interpreted you didn't like always having VMX enabled when KVM
> is present. :-)

I had a feeling I said something along those lines in the past.  

> > Barely tested, and other architectures would need to be converted, but I don't
> > see any obvious reasons why we can't simply enable virtualization when the module
> > is loaded.
> > 
> > The diffstat pretty much says it all.
> 
> Thanks a lot for the code!
> 
> I can certainly follow up with this and generate a reviewable patchset if I
> can confirm with you that this is what you want?

Yes, I think it's the right direction.  I still have minor concerns about VMX
being enabled while kvm.ko is loaded, which means that VMXON will _always_ be
enabled if KVM is built-in.  But after seeing the complexity that is needed to
safely initialize TDX, and after seeing just how much complexity KVM already
has because it enables VMX on-demand (I hadn't actually tried removing that code
before), I think the cost of that complexity far outweighs the risk of "always"
being post-VMXON.

Within reason, I recommend getting feedback from others before you spend _too_
much time on this.  It's entirely possible I'm missing/forgetting some other angle.
Huang, Kai April 11, 2024, 10:58 p.m. UTC | #14
On 12/04/2024 2:03 am, Sean Christopherson wrote:
> On Thu, Apr 11, 2024, Kai Huang wrote:
>> On 11/04/2024 3:29 am, Sean Christopherson wrote:
>>> On Wed, Apr 10, 2024, Kai Huang wrote:
>>>>>> What happens if any CPU goes online *BETWEEN* tdx_hardware_setup() and
>>>>>> kvm_init()?
>>>>>>
>>>>>> Looks we have two options:
>>>>>>
>>>>>> 1) move registering CPU hotplug callback before tdx_hardware_setup(), or
>>>>>> 2) we need to disable CPU hotplug until callbacks have been registered.
>>>
>>> This is all so dumb (not TDX, the current state of KVM).  All of the hardware
>>> enabling crud is pointlessly complex, inherited from misguided, decade-old paranoia
>>> that led to the decision to enable VMX if and only if VMs are running.  Enabling
>>> VMX doesn't make the system less secure, and the insane dances we are doing to
>>> do VMXON on-demand makes everything *more* fragile.
>>>
>>> And all of this complexity really was driven by VMX, enabling virtualization for
>>> every other vendor, including AMD/SVM, is completely uninteresting.  Forcing other
>>> architectures/vendors to take on yet more complexity doesn't make any sense.
>>
>> Ah, I actually preferred this solution, but I was trying to follow your
>> suggestion here:
>>
>> https://lore.kernel.org/lkml/ZW6FRBnOwYV-UCkY@google.com/
>>
>> from which I interpreted you didn't like always having VMX enabled when KVM
>> is present. :-)
> 
> I had a feeling I said something along those lines in the past.
> 
>>> Barely tested, and other architectures would need to be converted, but I don't
>>> see any obvious reasons why we can't simply enable virtualization when the module
>>> is loaded.
>>>
>>> The diffstat pretty much says it all.
>>
>> Thanks a lot for the code!
>>
>> I can certainly follow up with this and generate a reviewable patchset if I
>> can confirm with you that this is what you want?
> 
> Yes, I think it's the right direction.  I still have minor concerns about VMX
> being enabled while kvm.ko is loaded, which means that VMXON will _always_ be
> enabled if KVM is built-in.  But after seeing the complexity that is needed to
> safely initialize TDX, and after seeing just how much complexity KVM already
> has because it enables VMX on-demand (I hadn't actually tried removing that code
> before), I think the cost of that complexity far outweighs the risk of "always"
> being post-VMXON.

Does always leaving VMXON enabled cause any actual damage, given we have 
emergency virtualization shutdown?

> 
> Within reason, I recommend getting feedback from others before you spend _too_
> much time on this.  It's entirely possible I'm missing/forgetting some other angle.

Sure.  Could you suggest who we should try to get feedback from?

Perhaps you can just help to Cc them?

Thanks for your time.
Sean Christopherson April 16, 2024, 8:58 p.m. UTC | #15
On Fri, Apr 12, 2024, Kai Huang wrote:
> On 12/04/2024 2:03 am, Sean Christopherson wrote:
> > On Thu, Apr 11, 2024, Kai Huang wrote:
> > > I can certainly follow up with this and generate a reviewable patchset if I
> > > can confirm with you that this is what you want?
> > 
> > Yes, I think it's the right direction.  I still have minor concerns about VMX
> > being enabled while kvm.ko is loaded, which means that VMXON will _always_ be
> > enabled if KVM is built-in.  But after seeing the complexity that is needed to
> > safely initialize TDX, and after seeing just how much complexity KVM already
> > has because it enables VMX on-demand (I hadn't actually tried removing that code
> > before), I think the cost of that complexity far outweighs the risk of "always"
> > being post-VMXON.
> 
> Does always leaving VMXON enabled cause any actual damage, given we have emergency
> virtualization shutdown?

Being post-VMXON increases the risk of kexec() into the kdump kernel failing.
The tradeoffs that we're trying to balance are: is the risk of kexec() failing
due to the complexity of the emergency VMX code higher than the risk of us breaking
things in general due to taking on a ton of complexity to juggle VMXON for TDX?

After seeing the latest round of TDX code, my opinion is that being post-VMXON
is less risky overall, in no small part because we need that to work anyways for
hosts that are actively running VMs.
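
FWIW, the emergency path being weighed here boils down to a per-CPU callback
invoked on the way into kexec/kdump.  A simplified sketch of the VMX flavor
(loosely based on vmx_emergency_disable(), details elided):

	static void vmx_emergency_disable(void)
	{
		int cpu = raw_smp_processor_id();
		struct loaded_vmcs *v;

		/*
		 * VMCLEAR active VMCSes so their state is coherent in memory
		 * for the kdump kernel, then force VMXOFF so kexec can jump
		 * into the new kernel.
		 */
		list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
				    loaded_vmcss_on_cpu_link)
			vmcs_clear(v->vmcs);

		kvm_cpu_vmxoff();
	}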

> > Within reason, I recommend getting feedback from others before you spend _too_
> > much time on this.  It's entirely possible I'm missing/forgetting some other angle.
> 
> Sure.  Could you suggest who we should try to get feedback from?
> 
> Perhaps you can just help to Cc them?

I didn't have anyone in particular in mind, I just really want *someone* to weigh
in as a sanity check.
Huang, Kai April 17, 2024, 1:20 p.m. UTC | #16
On Tue, 2024-04-16 at 13:58 -0700, Sean Christopherson wrote:
> On Fri, Apr 12, 2024, Kai Huang wrote:
> > On 12/04/2024 2:03 am, Sean Christopherson wrote:
> > > On Thu, Apr 11, 2024, Kai Huang wrote:
> > > > I can certainly follow up with this and generate a reviewable patchset if I
> > > > can confirm with you that this is what you want?
> > > 
> > > Yes, I think it's the right direction.  I still have minor concerns about VMX
> > > being enabled while kvm.ko is loaded, which means that VMXON will _always_ be
> > > enabled if KVM is built-in.  But after seeing the complexity that is needed to
> > > safely initialize TDX, and after seeing just how much complexity KVM already
> > > has because it enables VMX on-demand (I hadn't actually tried removing that code
> > > before), I think the cost of that complexity far outweighs the risk of "always"
> > > being post-VMXON.
> > 
> > Does always leaving VMXON enabled cause any actual damage, given we have emergency
> > virtualization shutdown?
> 
> Being post-VMXON increases the risk of kexec() into the kdump kernel failing.
> The tradeoffs that we're trying to balance are: is the risk of kexec() failing
> due to the complexity of the emergency VMX code higher than the risk of us breaking
> things in general due to taking on a ton of complexity to juggle VMXON for TDX?
> 
> After seeing the latest round of TDX code, my opinion is that being post-VMXON
> is less risky overall, in no small part because we need that to work anyways for
> hosts that are actively running VMs.
> 
> > 

How about we only keep VMX always on when TDX is enabled?

In short, we can do this way:

 - Do VMXON + unconditional tdx_cpu_enable() in vt_hardware_enable()

 - And in vt_hardware_setup():
   
   cpus_read_lock();
   hardware_enable_all_nolock();  (this doesn't exist yet)
   ret = tdx_enable();
   if (ret)
	hardware_disable_all_nolock();
   cpus_read_unlock();

 - And in vt_hardware_unsetup():

   if (TDX is enabled) {
	cpus_read_lock();
	hardware_disable_all_nolock();
	cpus_read_unlock();
   }

Note, to make this work we also need to move registering/unregistering of
kvm_online_cpu()/kvm_offline_cpu() from kvm_init() to
hardware_enable_all_nolock() and hardware_disable_all_nolock()
respectively, to cover any CPU becoming online after tdx_enable() (well,
more precisely, after hardware_enable_all_nolock()).  

This is reasonable anyway even w/o TDX, because we only need to handle
CPU hotplug _after_ hardware has been enabled on all online CPUs.

Calling hardware_enable_all_nolock() w/o holding kvm_lock mutex is also
fine because at this stage it's not possible for userspace to create a VM
yet.

Btw, kvm_arch_hardware_enable() does things like the TSC backwards
adjustment, uret_msrs, etc. but they are safe to be called during module
load/unload AFAICT.  We can put a comment there as a reminder if needed.

If I am not missing anything, below diff to kvm.ko shows my idea:

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 16e07a2eee19..ed8b2f34af01 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2318,4 +2318,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
  */
 #define KVM_EXIT_HYPERCALL_MBZ         GENMASK_ULL(31, 1)

+int hardware_enable_all_nolock(void);
+void hardware_disable_all_nolock(void);
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fb49c2a60200..3d2ff7dd0150 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -5601,14 +5601,23 @@ static int kvm_offline_cpu(unsigned int cpu)
        return 0;
 }

-static void hardware_disable_all_nolock(void)
+static void __hardware_disable_all_nolock(void)
+{
+#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
+       cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
+#endif
+       on_each_cpu(hardware_disable_nolock, NULL, 1);
+}
+
+void hardware_disable_all_nolock(void)
 {
        BUG_ON(!kvm_usage_count);

        kvm_usage_count--;
        if (!kvm_usage_count)
-               on_each_cpu(hardware_disable_nolock, NULL, 1);
+               __hardware_disable_all_nolock();
 }
+EXPORT_SYMBOL_GPL(hardware_disable_all_nolock);

 static void hardware_disable_all(void)
 {
@@ -5619,11 +5628,27 @@ static void hardware_disable_all(void)
        cpus_read_unlock();
 }

-static int hardware_enable_all(void)
+static int __hardware_enable_all_nolock(void)
 {
        atomic_t failed = ATOMIC_INIT(0);
        int r;

+       on_each_cpu(hardware_enable_nolock, &failed, 1);
+
+       r = atomic_read(&failed);
+       if (r)
+               return -EBUSY;
+
+#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
+       r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
+                                     kvm_online_cpu, kvm_offline_cpu);
+#endif
+
+       return r;
+}
+
+int hardware_enable_all_nolock(void)
+{
        /*
         * Do not enable hardware virtualization if the system is going down.
         * If userspace initiated a forced reboot, e.g. reboot -f, then it's
@@ -5637,6 +5662,24 @@ static int hardware_enable_all(void)
            system_state == SYSTEM_RESTART)
                return -EBUSY;

+       kvm_usage_count++;
+       if (kvm_usage_count == 1) {
+               int r = __hardware_enable_all_nolock();
+
+               if (r) {
+                       hardware_disable_all_nolock();
+                       return r;
+               }
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(hardware_enable_all_nolock);
+
+static int hardware_enable_all(void)
+{
+       int r;
+
        /*
         * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
         * is called, and so on_each_cpu() between them includes the CPU that
@@ -5648,17 +5691,7 @@ static int hardware_enable_all(void)
        cpus_read_lock();
        mutex_lock(&kvm_lock);

-       r = 0;
-
-       kvm_usage_count++;
-       if (kvm_usage_count == 1) {
-               on_each_cpu(hardware_enable_nolock, &failed, 1);
-
-               if (atomic_read(&failed)) {
-                       hardware_disable_all_nolock();
-                       r = -EBUSY;
-               }
-       }
+       r = hardware_enable_all_nolock();

        mutex_unlock(&kvm_lock);
        cpus_read_unlock();
@@ -6422,11 +6455,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
        int cpu;

 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
-       r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
-                                     kvm_online_cpu, kvm_offline_cpu);
-       if (r)
-               return r;
-
        register_syscore_ops(&kvm_syscore_ops);
 #endif

@@ -6528,7 +6556,6 @@ void kvm_exit(void)
        kvm_async_pf_deinit();
 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
        unregister_syscore_ops(&kvm_syscore_ops);
-       cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
 #endif
        kvm_irqfd_exit();
 }
Sean Christopherson April 17, 2024, 2:40 p.m. UTC | #17
On Wed, Apr 17, 2024, Kai Huang wrote:
> On Tue, 2024-04-16 at 13:58 -0700, Sean Christopherson wrote:
> > On Fri, Apr 12, 2024, Kai Huang wrote:
> > > On 12/04/2024 2:03 am, Sean Christopherson wrote:
> > > > On Thu, Apr 11, 2024, Kai Huang wrote:
> > > > > I can certainly follow up with this and generate a reviewable patchset if I
> > > > > can confirm with you that this is what you want?
> > > > 
> > > > Yes, I think it's the right direction.  I still have minor concerns about VMX
> > > > being enabled while kvm.ko is loaded, which means that VMXON will _always_ be
> > > > enabled if KVM is built-in.  But after seeing the complexity that is needed to
> > > > safely initialize TDX, and after seeing just how much complexity KVM already
> > > > has because it enables VMX on-demand (I hadn't actually tried removing that code
> > > > before), I think the cost of that complexity far outweighs the risk of "always"
> > > > being post-VMXON.
> > > 
> > > Does always leaving VMXON enabled cause any actual damage, given we have emergency
> > > virtualization shutdown?
> > 
> > Being post-VMXON increases the risk of kexec() into the kdump kernel failing.
> > The tradeoffs that we're trying to balance are: is the risk of kexec() failing
> > due to the complexity of the emergency VMX code higher than the risk of us breaking
> > things in general due to taking on a ton of complexity to juggle VMXON for TDX?
> > 
> > After seeing the latest round of TDX code, my opinion is that being post-VMXON
> > is less risky overall, in no small part because we need that to work anyways for
> > hosts that are actively running VMs.
> 
> How about we only keep VMX always on when TDX is enabled?

Paolo also suggested forcing VMXON only if TDX is enabled, mostly because
kvm-intel.ko and kvm-amd.ko may be auto-loaded based on MODULE_DEVICE_TABLE(),
which in turn causes problems for out-of-tree hypervisors that want control over
VMX and SVM.

I'm not opposed to the idea, it's the complexity and messiness I dislike.  E.g.
the TDX code shouldn't have to deal with CPU hotplug locks, core KVM shouldn't
need to expose nolock helpers, etc.  And if we're going to make non-trivial
changes to the core KVM hardware enabling code anyways...

What about this?  Same basic idea as before, but instead of unconditionally doing
hardware enabling during module initialization, let TDX do hardware enabling in
a late_hardware_setup(), and then have KVM x86 ensure virtualization is enabled
when creating VMs.

This way, architectures that aren't saddled with out-of-tree hypervisors can do
the dead simple thing of enabling hardware during their initialization sequence,
and the TDX code is much more sane, e.g. invoke kvm_x86_enable_virtualization()
during late_hardware_setup(), and kvm_x86_disable_virtualization() during module
exit (presumably).
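
E.g. the TDX side could end up looking something like this (a hypothetical
sketch, vt_late_hardware_setup() and the exact flow are illustrative, not
part of the diff below):

	static int vt_late_hardware_setup(void)
	{
		int r;

		if (!enable_tdx)
			return 0;

		/* Force VMXON on all CPUs so tdx_enable()'s SEAMCALLs can run. */
		r = kvm_x86_enable_virtualization();
		if (r)
			return r;

		r = tdx_enable();
		if (r)
			kvm_x86_disable_virtualization();
		return r;
	}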

---
 Documentation/virt/kvm/locking.rst |   4 -
 arch/x86/include/asm/kvm_host.h    |   3 +
 arch/x86/kvm/svm/svm.c             |   5 +-
 arch/x86/kvm/vmx/vmx.c             |  18 ++-
 arch/x86/kvm/x86.c                 |  59 +++++++---
 arch/x86/kvm/x86.h                 |   2 +
 include/linux/kvm_host.h           |   2 +
 virt/kvm/kvm_main.c                | 181 +++++++----------------------
 8 files changed, 104 insertions(+), 170 deletions(-)

diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst
index 02880d5552d5..0d6eff13fd46 100644
--- a/Documentation/virt/kvm/locking.rst
+++ b/Documentation/virt/kvm/locking.rst
@@ -227,10 +227,6 @@ time it will be set using the Dirty tracking mechanism described above.
 :Type:		mutex
 :Arch:		any
 :Protects:	- vm_list
-		- kvm_usage_count
-		- hardware virtualization enable/disable
-:Comment:	KVM also disables CPU hotplug via cpus_read_lock() during
-		enable/disable.
 
 ``kvm->mn_invalidate_lock``
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 73740d698ebe..7422239987d8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -36,6 +36,7 @@
 #include <asm/kvm_page_track.h>
 #include <asm/kvm_vcpu_regs.h>
 #include <asm/hyperv-tlfs.h>
+#include <asm/reboot.h>
 
 #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
 
@@ -1605,6 +1606,8 @@ struct kvm_x86_ops {
 
 	int (*hardware_enable)(void);
 	void (*hardware_disable)(void);
+	cpu_emergency_virt_cb *emergency_disable;
+
 	void (*hardware_unsetup)(void);
 	bool (*has_emulated_msr)(struct kvm *kvm, u32 index);
 	void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 9aaf83c8d57d..7e118284934c 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4917,6 +4917,7 @@ static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu)
 static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.name = KBUILD_MODNAME,
 
+	.emergency_disable = svm_emergency_disable,
 	.check_processor_compatibility = svm_check_processor_compat,
 
 	.hardware_unsetup = svm_hardware_unsetup,
@@ -5348,8 +5349,6 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = {
 static void __svm_exit(void)
 {
 	kvm_x86_vendor_exit();
-
-	cpu_emergency_unregister_virt_callback(svm_emergency_disable);
 }
 
 static int __init svm_init(void)
@@ -5365,8 +5364,6 @@ static int __init svm_init(void)
 	if (r)
 		return r;
 
-	cpu_emergency_register_virt_callback(svm_emergency_disable);
-
 	/*
 	 * Common KVM initialization _must_ come last, after this, /dev/kvm is
 	 * exposed to userspace!
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d18dcb1e11a6..0dbe74da7ee3 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -8320,6 +8320,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 
 	.hardware_enable = vmx_hardware_enable,
 	.hardware_disable = vmx_hardware_disable,
+	.emergency_disable = vmx_emergency_disable,
+
 	.has_emulated_msr = vmx_has_emulated_msr,
 
 	.vm_size = sizeof(struct kvm_vmx),
@@ -8733,8 +8735,6 @@ static void __vmx_exit(void)
 {
 	allow_smaller_maxphyaddr = false;
 
-	cpu_emergency_unregister_virt_callback(vmx_emergency_disable);
-
 	vmx_cleanup_l1d_flush();
 }
 
@@ -8760,6 +8760,12 @@ static int __init vmx_init(void)
 	 */
 	hv_init_evmcs();
 
+	for_each_possible_cpu(cpu) {
+		INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+
+		pi_init_cpu(cpu);
+	}
+
 	r = kvm_x86_vendor_init(&vmx_init_ops);
 	if (r)
 		return r;
@@ -8775,14 +8781,6 @@ static int __init vmx_init(void)
 	if (r)
 		goto err_l1d_flush;
 
-	for_each_possible_cpu(cpu) {
-		INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
-
-		pi_init_cpu(cpu);
-	}
-
-	cpu_emergency_register_virt_callback(vmx_emergency_disable);
-
 	vmx_check_vmcs12_offsets();
 
 	/*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 26288ca05364..fdf6e05000c1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -134,6 +134,7 @@ static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
 
 static DEFINE_MUTEX(vendor_module_lock);
 struct kvm_x86_ops kvm_x86_ops __read_mostly;
+static int kvm_usage_count;
 
 #define KVM_X86_OP(func)					     \
 	DEFINE_STATIC_CALL_NULL(kvm_x86_##func,			     \
@@ -9687,15 +9688,10 @@ static int kvm_x86_check_processor_compatibility(void)
 	return static_call(kvm_x86_check_processor_compatibility)();
 }
 
-static void kvm_x86_check_cpu_compat(void *ret)
-{
-	*(int *)ret = kvm_x86_check_processor_compatibility();
-}
-
 int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 {
 	u64 host_pat;
-	int r, cpu;
+	int r;
 
 	guard(mutex)(&vendor_module_lock);
 
@@ -9771,11 +9767,11 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 
 	kvm_ops_update(ops);
 
-	for_each_online_cpu(cpu) {
-		smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &r, 1);
-		if (r < 0)
-			goto out_unwind_ops;
-	}
+	cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable);
+
+	r = ops->late_hardware_setup();
+	if (r)
+		goto out_unwind_ops;
 
 	/*
 	 * Point of no return!  DO NOT add error paths below this point unless
@@ -9818,6 +9814,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 	return 0;
 
 out_unwind_ops:
+	cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable);
 	kvm_x86_ops.hardware_enable = NULL;
 	static_call(kvm_x86_hardware_unsetup)();
 out_mmu_exit:
@@ -9858,6 +9855,10 @@ void kvm_x86_vendor_exit(void)
 	static_key_deferred_flush(&kvm_xen_enabled);
 	WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
 #endif
+
+	kvm_disable_virtualization();
+	cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable);
+
 	mutex_lock(&vendor_module_lock);
 	kvm_x86_ops.hardware_enable = NULL;
 	mutex_unlock(&vendor_module_lock);
@@ -12522,6 +12523,33 @@ void kvm_arch_free_vm(struct kvm *kvm)
 	__kvm_arch_free_vm(kvm);
 }
 
+int kvm_x86_enable_virtualization(void)
+{
+	int r;
+
+	guard(mutex)(&vendor_module_lock);
+
+	if (kvm_usage_count++)
+		return 0;
+
+	r = kvm_enable_virtualization();
+	if (r)
+		--kvm_usage_count;
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvm_x86_enable_virtualization);
+
+void kvm_x86_disable_virtualization(void)
+{
+	guard(mutex)(&vendor_module_lock);
+
+	if (--kvm_usage_count)
+		return;
+
+	kvm_disable_virtualization();
+}
+EXPORT_SYMBOL_GPL(kvm_x86_disable_virtualization);
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
@@ -12533,9 +12561,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	kvm->arch.vm_type = type;
 
+	ret = kvm_x86_enable_virtualization();
+	if (ret)
+		return ret;
+
 	ret = kvm_page_track_init(kvm);
 	if (ret)
-		goto out;
+		goto out_disable_virtualization;
 
 	kvm_mmu_init_vm(kvm);
 
@@ -12582,7 +12614,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 out_uninit_mmu:
 	kvm_mmu_uninit_vm(kvm);
 	kvm_page_track_cleanup(kvm);
-out:
+out_disable_virtualization:
+	kvm_x86_disable_virtualization();
 	return ret;
 }
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index a8b71803777b..427c5d102525 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -32,6 +32,8 @@ struct kvm_caps {
 };
 
 void kvm_spurious_fault(void);
+int kvm_x86_enable_virtualization(void);
+void kvm_x86_disable_virtualization(void);
 
 #define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check)		\
 ({									\
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 48f31dcd318a..92da2eee7448 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1518,6 +1518,8 @@ static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {}
 #endif
 
 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
+int kvm_enable_virtualization(void);
+void kvm_disable_virtualization(void);
 int kvm_arch_hardware_enable(void);
 void kvm_arch_hardware_disable(void);
 #endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f345dc15854f..326e3225c052 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -139,8 +139,6 @@ static int kvm_no_compat_open(struct inode *inode, struct file *file)
 #define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
 			.open		= kvm_no_compat_open
 #endif
-static int hardware_enable_all(void);
-static void hardware_disable_all(void);
 
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 
@@ -1261,10 +1259,6 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 	if (r)
 		goto out_err_no_arch_destroy_vm;
 
-	r = hardware_enable_all();
-	if (r)
-		goto out_err_no_disable;
-
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
 	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
 #endif
@@ -1304,8 +1298,6 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 		mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
 #endif
 out_err_no_mmu_notifier:
-	hardware_disable_all();
-out_err_no_disable:
 	kvm_arch_destroy_vm(kvm);
 out_err_no_arch_destroy_vm:
 	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
@@ -1393,7 +1385,6 @@ static void kvm_destroy_vm(struct kvm *kvm)
 #endif
 	kvm_arch_free_vm(kvm);
 	preempt_notifier_dec();
-	hardware_disable_all();
 	mmdrop(mm);
 }
 
@@ -5536,9 +5527,8 @@ __visible bool kvm_rebooting;
 EXPORT_SYMBOL_GPL(kvm_rebooting);
 
 static DEFINE_PER_CPU(bool, hardware_enabled);
-static int kvm_usage_count;
 
-static int __hardware_enable_nolock(void)
+static int __kvm_enable_virtualization(void)
 {
 	if (__this_cpu_read(hardware_enabled))
 		return 0;
@@ -5553,34 +5543,18 @@ static int __hardware_enable_nolock(void)
 	return 0;
 }
 
-static void hardware_enable_nolock(void *failed)
-{
-	if (__hardware_enable_nolock())
-		atomic_inc(failed);
-}
-
 static int kvm_online_cpu(unsigned int cpu)
 {
-	int ret = 0;
-
 	/*
 	 * Abort the CPU online process if hardware virtualization cannot
 	 * be enabled. Otherwise running VMs would encounter unrecoverable
 	 * errors when scheduled to this CPU.
 	 */
-	mutex_lock(&kvm_lock);
-	if (kvm_usage_count)
-		ret = __hardware_enable_nolock();
-	mutex_unlock(&kvm_lock);
-	return ret;
+	return __kvm_enable_virtualization();
 }
 
-static void hardware_disable_nolock(void *junk)
+static void __kvm_disable_virtualization(void *ign)
 {
-	/*
-	 * Note, hardware_disable_all_nolock() tells all online CPUs to disable
-	 * hardware, not just CPUs that successfully enabled hardware!
-	 */
 	if (!__this_cpu_read(hardware_enabled))
 		return;
 
@@ -5591,78 +5565,10 @@ static void hardware_disable_nolock(void *junk)
 
 static int kvm_offline_cpu(unsigned int cpu)
 {
-	mutex_lock(&kvm_lock);
-	if (kvm_usage_count)
-		hardware_disable_nolock(NULL);
-	mutex_unlock(&kvm_lock);
+	__kvm_disable_virtualization(NULL);
 	return 0;
 }
 
-static void hardware_disable_all_nolock(void)
-{
-	BUG_ON(!kvm_usage_count);
-
-	kvm_usage_count--;
-	if (!kvm_usage_count)
-		on_each_cpu(hardware_disable_nolock, NULL, 1);
-}
-
-static void hardware_disable_all(void)
-{
-	cpus_read_lock();
-	mutex_lock(&kvm_lock);
-	hardware_disable_all_nolock();
-	mutex_unlock(&kvm_lock);
-	cpus_read_unlock();
-}
-
-static int hardware_enable_all(void)
-{
-	atomic_t failed = ATOMIC_INIT(0);
-	int r;
-
-	/*
-	 * Do not enable hardware virtualization if the system is going down.
-	 * If userspace initiated a forced reboot, e.g. reboot -f, then it's
-	 * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
-	 * after kvm_reboot() is called.  Note, this relies on system_state
-	 * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
-	 * hook instead of registering a dedicated reboot notifier (the latter
-	 * runs before system_state is updated).
-	 */
-	if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
-	    system_state == SYSTEM_RESTART)
-		return -EBUSY;
-
-	/*
-	 * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
-	 * is called, and so on_each_cpu() between them includes the CPU that
-	 * is being onlined.  As a result, hardware_enable_nolock() may get
-	 * invoked before kvm_online_cpu(), which also enables hardware if the
-	 * usage count is non-zero.  Disable CPU hotplug to avoid attempting to
-	 * enable hardware multiple times.
-	 */
-	cpus_read_lock();
-	mutex_lock(&kvm_lock);
-
-	r = 0;
-
-	kvm_usage_count++;
-	if (kvm_usage_count == 1) {
-		on_each_cpu(hardware_enable_nolock, &failed, 1);
-
-		if (atomic_read(&failed)) {
-			hardware_disable_all_nolock();
-			r = -EBUSY;
-		}
-	}
-
-	mutex_unlock(&kvm_lock);
-	cpus_read_unlock();
-
-	return r;
-}
-
 static void kvm_shutdown(void)
 {
 	/*
@@ -5678,34 +5584,22 @@ static void kvm_shutdown(void)
 	 */
 	pr_info("kvm: exiting hardware virtualization\n");
 	kvm_rebooting = true;
-	on_each_cpu(hardware_disable_nolock, NULL, 1);
+	on_each_cpu(__kvm_disable_virtualization, NULL, 1);
 }
 
 static int kvm_suspend(void)
 {
-	/*
-	 * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
-	 * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count
-	 * is stable.  Assert that kvm_lock is not held to ensure the system
-	 * isn't suspended while KVM is enabling hardware.  Hardware enabling
-	 * can be preempted, but the task cannot be frozen until it has dropped
-	 * all locks (userspace tasks are frozen via a fake signal).
-	 */
-	lockdep_assert_not_held(&kvm_lock);
 	lockdep_assert_irqs_disabled();
 
-	if (kvm_usage_count)
-		hardware_disable_nolock(NULL);
+	__kvm_disable_virtualization(NULL);
 	return 0;
 }
 
 static void kvm_resume(void)
 {
-	lockdep_assert_not_held(&kvm_lock);
 	lockdep_assert_irqs_disabled();
 
-	if (kvm_usage_count)
-		WARN_ON_ONCE(__hardware_enable_nolock());
+	WARN_ON_ONCE(__kvm_enable_virtualization());
 }
 
 static struct syscore_ops kvm_syscore_ops = {
@@ -5713,16 +5607,45 @@ static struct syscore_ops kvm_syscore_ops = {
 	.resume = kvm_resume,
 	.shutdown = kvm_shutdown,
 };
-#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
-static int hardware_enable_all(void)
+
+int kvm_enable_virtualization(void)
 {
+	int r;
+
+	r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
+			      kvm_online_cpu, kvm_offline_cpu);
+	if (r)
+		return r;
+
+	register_syscore_ops(&kvm_syscore_ops);
+
+	/*
+	 * Manually undo virtualization enabling if the system is going down.
+	 * If userspace initiated a forced reboot, e.g. reboot -f, then it's
+	 * possible for an in-flight module load to enable virtualization
+	 * after syscore_shutdown() is called, i.e. without kvm_shutdown()
+	 * being invoked.  Note, this relies on system_state being set _before_
+	 * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked
+	 * or this CPU observes the impending shutdown.  Which is why KVM uses
+	 * a syscore ops hook instead of registering a dedicated reboot
+	 * notifier (the latter runs before system_state is updated).
+	 */
+	if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
+	    system_state == SYSTEM_RESTART) {
+		unregister_syscore_ops(&kvm_syscore_ops);
+		cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
+		return -EBUSY;
+	}
+
 	return 0;
 }
 
-static void hardware_disable_all(void)
+void kvm_disable_virtualization(void)
 {
-
+	unregister_syscore_ops(&kvm_syscore_ops);
+	cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
 }
+
 #endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
 
 static void kvm_iodevice_destructor(struct kvm_io_device *dev)
@@ -6418,15 +6341,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 	int r;
 	int cpu;
 
-#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
-	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
-				      kvm_online_cpu, kvm_offline_cpu);
-	if (r)
-		return r;
-
-	register_syscore_ops(&kvm_syscore_ops);
-#endif
-
 	/* A kmem cache lets us meet the alignment requirements of fx_save. */
 	if (!vcpu_align)
 		vcpu_align = __alignof__(struct kvm_vcpu);
@@ -6437,10 +6351,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 					   offsetofend(struct kvm_vcpu, stats_id)
 					   - offsetof(struct kvm_vcpu, arch),
 					   NULL);
-	if (!kvm_vcpu_cache) {
-		r = -ENOMEM;
-		goto err_vcpu_cache;
-	}
+	if (!kvm_vcpu_cache)
+		return -ENOMEM;
 
 	for_each_possible_cpu(cpu) {
 		if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
@@ -6497,11 +6409,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 	for_each_possible_cpu(cpu)
 		free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
 	kmem_cache_destroy(kvm_vcpu_cache);
-err_vcpu_cache:
-#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
-	unregister_syscore_ops(&kvm_syscore_ops);
-	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
-#endif
 	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_init);
@@ -6523,10 +6430,6 @@ void kvm_exit(void)
 	kmem_cache_destroy(kvm_vcpu_cache);
 	kvm_vfio_ops_exit();
 	kvm_async_pf_deinit();
-#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
-	unregister_syscore_ops(&kvm_syscore_ops);
-	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
-#endif
 	kvm_irqfd_exit();
 }
 EXPORT_SYMBOL_GPL(kvm_exit);

base-commit: 2d181d84af38146748042a6974c577fc46c3f1c3
--
Huang, Kai April 17, 2024, 11:09 p.m. UTC | #18
On 18/04/2024 2:40 am, Sean Christopherson wrote:
> On Wed, Apr 17, 2024, Kai Huang wrote:
>> On Tue, 2024-04-16 at 13:58 -0700, Sean Christopherson wrote:
>>> On Fri, Apr 12, 2024, Kai Huang wrote:
>>>> On 12/04/2024 2:03 am, Sean Christopherson wrote:
>>>>> On Thu, Apr 11, 2024, Kai Huang wrote:
>>>>>> I can certainly follow up with this and generate a reviewable patchset if I
>>>>>> can confirm with you that this is what you want?
>>>>>
>>>>> Yes, I think it's the right direction.  I still have minor concerns about VMX
>>>>> being enabled while kvm.ko is loaded, which means that VMXON will _always_ be
>>>>> enabled if KVM is built-in.  But after seeing the complexity that is needed to
>>>>> safely initialize TDX, and after seeing just how much complexity KVM already
>>>>> has because it enables VMX on-demand (I hadn't actually tried removing that code
>>>>> before), I think the cost of that complexity far outweighs the risk of "always"
>>>>> being post-VMXON.
>>>>
>>>> Does always leaving VMXON enabled cause any actual damage, given we have emergency
>>>> virtualization shutdown?
>>>
>>> Being post-VMXON increases the risk of kexec() into the kdump kernel failing.
>>> The tradeoffs that we're trying to balance are: is the risk of kexec() failing
>>> due to the complexity of the emergency VMX code higher than the risk of us breaking
>>> things in general due to taking on a ton of complexity to juggle VMXON for TDX?
>>>
>>> After seeing the latest round of TDX code, my opinion is that being post-VMXON
>>> is less risky overall, in no small part because we need that to work anyways for
>>> hosts that are actively running VMs.
>>
>> How about we only keep VMX always on when TDX is enabled?
> 
> Paolo also suggested forcing VMXON only if TDX is enabled, mostly because
> kvm-intel.ko and kvm-amd.ko may be auto-loaded based on MODULE_DEVICE_TABLE(),
> which in turn causes problems for out-of-tree hypervisors that want control over
> VMX and SVM.
> 
> I'm not opposed to the idea, it's the complexity and messiness I dislike.  E.g.
> the TDX code shouldn't have to deal with CPU hotplug locks, core KVM shouldn't
> need to expose nolock helpers, etc.  And if we're going to make non-trivial
> changes to the core KVM hardware enabling code anyways...
> 
> What about this?  Same basic idea as before, but instead of unconditionally doing
> hardware enabling during module initialization, let TDX do hardware enabling in
> a late_hardware_setup(), and then have KVM x86 ensure virtualization is enabled
> when creating VMs.
> 
> This way, architectures that aren't saddled with out-of-tree hypervisors can do
> the dead simple thing of enabling hardware during their initialization sequence,
> and the TDX code is much more sane, e.g. invoke kvm_x86_enable_virtualization()
> during late_hardware_setup(), and kvm_x86_disable_virtualization() during module
> exit (presumably).

Fine to me, given I am not familiar with other ARCHs, assuming always
enabling virtualization when KVM is present is fine for them. :-)

Two questions below:

> +int kvm_x86_enable_virtualization(void)
> +{
> +	int r;
> +
> +	guard(mutex)(&vendor_module_lock);

It's a little bit odd to take the vendor_module_lock mutex.

It is called by kvm_arch_init_vm(), so more reasonably we should still 
use kvm_lock?

Also, if we invoke kvm_x86_enable_virtualization() from 
kvm_x86_ops->late_hardware_setup(), then IIUC we will deadlock here 
because kvm_x86_vendor_init() already takes the vendor_module_lock?
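
I.e. (a sketch of the call chain, assuming late_hardware_setup() is invoked
from kvm_x86_vendor_init() as in your diff):

  kvm_x86_vendor_init()
    guard(mutex)(&vendor_module_lock);		<- lock taken
    ops->late_hardware_setup()
      kvm_x86_enable_virtualization()
        guard(mutex)(&vendor_module_lock);	<- taken again -> deadlock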

> +
> +	if (kvm_usage_count++)
> +		return 0;
> +
> +	r = kvm_enable_virtualization();
> +	if (r)
> +		--kvm_usage_count;
> +
> +	return r;
> +}
> +EXPORT_SYMBOL_GPL(kvm_x86_enable_virtualization);
> +

[...]

> +int kvm_enable_virtualization(void)
>   {
> +	int r;
> +
> +	r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
> +			      kvm_online_cpu, kvm_offline_cpu);
> +	if (r)
> +		return r;
> +
> +	register_syscore_ops(&kvm_syscore_ops);
> +
> +	/*
> +	 * Manually undo virtualization enabling if the system is going down.
> +	 * If userspace initiated a forced reboot, e.g. reboot -f, then it's
> +	 * possible for an in-flight module load to enable virtualization
> +	 * after syscore_shutdown() is called, i.e. without kvm_shutdown()
> +	 * being invoked.  Note, this relies on system_state being set _before_
> +	 * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked
> +	 * or this CPU observes the impending shutdown.  Which is why KVM uses
> +	 * a syscore ops hook instead of registering a dedicated reboot
> +	 * notifier (the latter runs before system_state is updated).
> +	 */
> +	if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
> +	    system_state == SYSTEM_RESTART) {
> +		unregister_syscore_ops(&kvm_syscore_ops);
> +		cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
> +		return -EBUSY;
> +	}
> +

Aren't we also supposed to do:

	on_each_cpu(__kvm_enable_virtualization, NULL, 1);

here?

>   	return 0;
>   }
>
Sean Christopherson April 17, 2024, 11:35 p.m. UTC | #19
On Thu, Apr 18, 2024, Kai Huang wrote:
> On 18/04/2024 2:40 am, Sean Christopherson wrote:
> > This way, architectures that aren't saddled with out-of-tree hypervisors can do
> > the dead simple thing of enabling hardware during their initialization sequence,
> > and the TDX code is much more sane, e.g. invoke kvm_x86_enable_virtualization()
> > during late_hardware_setup(), and kvm_x86_disable_virtualization() during module
> > exit (presumably).
> 
> Fine to me, given I am not familiar with other ARCHs, assuming always enabling
> virtualization when KVM is present is fine for them. :-)
> 
> Two questions below:
> 
> > +int kvm_x86_enable_virtualization(void)
> > +{
> > +	int r;
> > +
> > +	guard(mutex)(&vendor_module_lock);
> 
> It's a little bit odd to take the vendor_module_lock mutex.
> 
> It is called by kvm_arch_init_vm(), so more reasonably we should still use
> kvm_lock?

I think this should take an x86-specific lock, since it's guarding x86-specific
data.  And vendor_module_lock fits the bill perfectly.  Well, except for the
name, and I definitely have no objection to renaming it.

> Also, if we invoke kvm_x86_enable_virtualization() from
> kvm_x86_ops->late_hardware_setup(), then IIUC we will deadlock here because
> kvm_x86_vendor_init() already takes the vendor_module_lock?

Ah, yeah.  Oh, duh.  I think the reason I didn't initially suggest late_hardware_setup()
is that I was assuming/hoping TDX setup could be done after kvm_x86_vendor_init().
E.g. in vt_init() or whatever it gets called:

	r = kvm_x86_vendor_init(...);
	if (r)
		return r;

	if (enable_tdx) {
		r = tdx_blah_blah_blah();
		if (r)
			goto vendor_exit;
	}

	return 0;

vendor_exit:
	kvm_x86_vendor_exit();
	return r;

> > +	if (kvm_usage_count++)
> > +		return 0;
> > +
> > +	r = kvm_enable_virtualization();
> > +	if (r)
> > +		--kvm_usage_count;
> > +
> > +	return r;
> > +}
> > +EXPORT_SYMBOL_GPL(kvm_x86_enable_virtualization);
> > +
> 
> [...]
> 
> > +int kvm_enable_virtualization(void)
> >   {
> > +	int r;
> > +
> > +	r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
> > +			      kvm_online_cpu, kvm_offline_cpu);
> > +	if (r)
> > +		return r;
> > +
> > +	register_syscore_ops(&kvm_syscore_ops);
> > +
> > +	/*
> > +	 * Manually undo virtualization enabling if the system is going down.
> > +	 * If userspace initiated a forced reboot, e.g. reboot -f, then it's
> > +	 * possible for an in-flight module load to enable virtualization
> > +	 * after syscore_shutdown() is called, i.e. without kvm_shutdown()
> > +	 * being invoked.  Note, this relies on system_state being set _before_
> > +	 * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked
> > +	 * or this CPU observes the impending shutdown.  Which is why KVM uses
> > +	 * a syscore ops hook instead of registering a dedicated reboot
> > +	 * notifier (the latter runs before system_state is updated).
> > +	 */
> > +	if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
> > +	    system_state == SYSTEM_RESTART) {
> > +		unregister_syscore_ops(&kvm_syscore_ops);
> > +		cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
> > +		return -EBUSY;
> > +	}
> > +
> 
> Aren't we also supposed to do:
> 
> 	on_each_cpu(__kvm_enable_virtualization, NULL, 1);
> 
> here?

No, cpuhp_setup_state() invokes the callback, kvm_online_cpu(), on each CPU.
I.e. KVM has been doing things the hard way by using cpuhp_setup_state_nocalls().
That's part of the complexity I would like to get rid of.
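
I.e. the change is roughly from (current):

	cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
				  kvm_online_cpu, kvm_offline_cpu);
	...
	/* later, on first VM creation */
	on_each_cpu(hardware_enable_nolock, &failed, 1);

to (proposed), where registration itself runs kvm_online_cpu() on every
online CPU:

	cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
			  kvm_online_cpu, kvm_offline_cpu);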
Huang, Kai April 18, 2024, 12:47 a.m. UTC | #20
On 18/04/2024 11:35 am, Sean Christopherson wrote:
> On Thu, Apr 18, 2024, Kai Huang wrote:
>> On 18/04/2024 2:40 am, Sean Christopherson wrote:
>>> This way, architectures that aren't saddled with out-of-tree hypervisors can do
>>> the dead simple thing of enabling hardware during their initialization sequence,
>>> and the TDX code is much more sane, e.g. invoke kvm_x86_enable_virtualization()
>>> during late_hardware_setup(), and kvm_x86_disable_virtualization() during module
>>> exit (presumably).
>>
>> Fine to me, given I am not familiar with other ARCHs, assuming always enabling
>> virtualization when KVM is present is fine for them. :-)
>>
>> Two questions below:
>>
>>> +int kvm_x86_enable_virtualization(void)
>>> +{
>>> +	int r;
>>> +
>>> +	guard(mutex)(&vendor_module_lock);
>>
>> It's a little bit odd to take the vendor_module_lock mutex.
>>
>> It is called by kvm_arch_init_vm(), so more reasonably we should still use
>> kvm_lock?
> 
> I think this should take an x86-specific lock, since it's guarding x86-specific
> data.  

OK.  This makes sense.

> And vendor_module_lock fits the bill perfectly.  Well, except for the
> name, and I definitely have no objection to renaming it.

No opinion on renaming.  Personally I wouldn't bother to rename.  We can 
add a comment in kvm_x86_enable_virtualization() to explain.  Perhaps in 
the future we just want to change to always enable virtualization for 
x86 too.

> 
>> Also, if we invoke kvm_x86_enable_virtualization() from
>> kvm_x86_ops->late_hardware_setup(), then IIUC we will deadlock here because
>> kvm_x86_vendor_init() already takes the vendor_module_lock?
> 
> Ah, yeah.  Oh, duh.  I think the reason I didn't initially suggest late_hardware_setup()
> is that I was assuming/hoping TDX setup could be done after kvm_x86_vendor_init().
> E.g. in vt_init() or whatever it gets called:
> 
> 	r = kvm_x86_vendor_init(...);
> 	if (r)
> 		return r;
> 
> 	if (enable_tdx) {
> 		r = tdx_blah_blah_blah();
> 		if (r)
> 			goto vendor_exit;
> 	}


I assume the reason you introduced the late_hardware_setup() is purely 
because you want to do:

   cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable);

after

   kvm_ops_update()?

Anyway, we can also do 'enable_tdx' outside of kvm_x86_vendor_init() as 
above, given it cannot be done in hardware_setup() anyway.

If we do 'enable_tdx' in late_hardware_setup(), we will need a 
kvm_x86_enable_virtualization_nolock(), but that's also not a problem to me.

So which way do you prefer?

Btw, with kvm_x86_enable_virtualization(), it seems the compatibility 
check is lost, which I assume is OK?

Btw2, currently tdx_enable() requires cpus_read_lock() to be called 
prior.  If we do unconditional tdx_cpu_enable() in vt_hardware_enable(), 
then with your proposal IIUC there's no such requirement anymore, 
because no task will be scheduled to the new CPU before it reaches 
CPUHP_AP_ACTIVE.  But now calling cpus_read_lock()/unlock() around 
tdx_enable() also acceptable to me.

[...]

>>
>>> +int kvm_enable_virtualization(void)
>>>    {
>>> +	int r;
>>> +
>>> +	r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
>>> +			      kvm_online_cpu, kvm_offline_cpu);
>>> +	if (r)
>>> +		return r;
>>> +
>>> +	register_syscore_ops(&kvm_syscore_ops);
>>> +
>>> +	/*
>>> +	 * Manually undo virtualization enabling if the system is going down.
>>> +	 * If userspace initiated a forced reboot, e.g. reboot -f, then it's
>>> +	 * possible for an in-flight module load to enable virtualization
>>> +	 * after syscore_shutdown() is called, i.e. without kvm_shutdown()
>>> +	 * being invoked.  Note, this relies on system_state being set _before_
>>> +	 * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked
>>> +	 * or this CPU observes the impending shutdown.  Which is why KVM uses
>>> +	 * a syscore ops hook instead of registering a dedicated reboot
>>> +	 * notifier (the latter runs before system_state is updated).
>>> +	 */
>>> +	if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
>>> +	    system_state == SYSTEM_RESTART) {
>>> +		unregister_syscore_ops(&kvm_syscore_ops);
>>> +		cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
>>> +		return -EBUSY;
>>> +	}
>>> +
>>
>> Aren't we also supposed to do:
>>
>> 	on_each_cpu(__kvm_enable_virtualization, NULL, 1);
>>
>> here?
> 
> No, cpuhp_setup_state() invokes the callback, kvm_online_cpu(), on each CPU.
> I.e. KVM has been doing things the hard way by using cpuhp_setup_state_nocalls().
> That's part of the complexity I would like to get rid of.

Ah, right :-)

Btw, why couldn't we do the 'system_state' check at the very beginning 
of this function?
Sean Christopherson April 18, 2024, 2:30 p.m. UTC | #21
On Thu, Apr 18, 2024, Kai Huang wrote:
> On 18/04/2024 11:35 am, Sean Christopherson wrote:
> > Ah, yeah.  Oh, duh.  I think the reason I didn't initially suggest late_hardware_setup()
> > is that I was assuming/hoping TDX setup could be done after kvm_x86_vendor_exit().
> > E.g. in vt_init() or whatever it gets called:
> > 
> > 	r = kvm_x86_vendor_init(...);
> > 	if (r)
> > 		return r;
> > 
> > 	if (enable_tdx) {
> > 		r = tdx_blah_blah_blah();
> > 		if (r)
> > 			goto vendor_exit;
> > 	}
> 
> 
> I assume the reason you introduced the late_hardware_setup() is purely
> because you want to do:
> 
>   cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_enable);
> 
> after
> 
>   kvm_ops_update()?

No, kvm_ops_update() needs to come before kvm_x86_enable_virtualization(), as the
static_call() to hardware_enable() needs to be patched in.

Oh, and my adjusted patch is broken, the code to do the compat checks should NOT
be removed; it could be removed if KVM unconditionally enabled VMX during setup,
but it needs to stay in the !TDX case.

-       for_each_online_cpu(cpu) {
-               smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &r, 1);
-               if (r < 0)
-                       goto out_unwind_ops;
-       }

Which is another reason to defer kvm_x86_enable_virtualization(), though to be
honest not a particularly compelling reason on its own.

> Anyway, we can also do 'enable_tdx' outside of kvm_x86_vendor_init() as
> above, given it cannot be done in hardware_setup() anyway.
> 
> If we do 'enable_tdx' in late_hardware_setup(), we will need a
> kvm_x86_enable_virtualization_nolock(), but that's also not a problem to me.
> 
> So which way do you prefer?
> 
> Btw, with kvm_x86_virtualization_enable(), it seems the compatibility check
> is lost, which I assume is OK?

Heh, and I obviously wasn't reading ahead :-)

> Btw2, currently tdx_enable() requires that cpus_read_lock() be called first.
> If we do unconditional tdx_cpu_enable() in vt_hardware_enable(), then with
> your proposal IIUC there's no such requirement anymore, because no task will
> be scheduled to the new CPU before it reaches CPUHP_AP_ACTIVE.

Correct.

> But calling cpus_read_lock()/unlock() around tdx_enable() is also acceptable
> to me.

No, that will deadlock as cpuhp_setup_state() does cpus_read_lock().

> > > > +int kvm_enable_virtualization(void)
> > > >    {
> > > > +	int r;
> > > > +
> > > > +	r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
> > > > +			      kvm_online_cpu, kvm_offline_cpu);
> > > > +	if (r)
> > > > +		return r;
> > > > +
> > > > +	register_syscore_ops(&kvm_syscore_ops);
> > > > +
> > > > +	/*
> > > > +	 * Manually undo virtualization enabling if the system is going down.
> > > > +	 * If userspace initiated a forced reboot, e.g. reboot -f, then it's
> > > > +	 * possible for an in-flight module load to enable virtualization
> > > > +	 * after syscore_shutdown() is called, i.e. without kvm_shutdown()
> > > > +	 * being invoked.  Note, this relies on system_state being set _before_
> > > > +	 * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked
> > > > +	 * or this CPU observes the impending shutdown.  Which is why KVM uses
> > > > +	 * a syscore ops hook instead of registering a dedicated reboot
> > > > +	 * notifier (the latter runs before system_state is updated).
> > > > +	 */
> > > > +	if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
> > > > +	    system_state == SYSTEM_RESTART) {
> > > > +		unregister_syscore_ops(&kvm_syscore_ops);
> > > > +		cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
> > > > +		return -EBUSY;
> > > > +	}
> > > > +
> > > 
> > > Aren't we also supposed to do:
> > > 
> > > 	on_each_cpu(__kvm_enable_virtualization, NULL, 1);
> > > 
> > > here?
> > 
> > No, cpuhp_setup_state() invokes the callback, kvm_online_cpu(), on each CPU.
> > I.e. KVM has been doing things the hard way by using cpuhp_setup_state_nocalls().
> > That's part of the complexity I would like to get rid of.
> 
> Ah, right :-)
> 
> Btw, why couldn't we do the 'system_state' check at the very beginning of
> this function?

We could, but we'd still need to check after, and adding a small bit of extra
complexity just to try to catch a very rare situation isn't worth it.

To prevent races, system_state needs to be checked after register_syscore_ops(),
because only once kvm_syscore_ops is registered is KVM guaranteed to get notified
of a shutdown.

And because the kvm_syscore_ops hooks disable virtualization, they should be called
after cpuhp_setup_state().  That's not strictly required, as the per-CPU
hardware_enabled flag will prevent true problems if the system enters shutdown
state before KVM reaches cpuhp_setup_state().

Hmm, but the same edge case exists in the above flow.  If the system enters
shutdown _just_ after register_syscore_ops(), KVM would see that in system_state
and do cpuhp_remove_state(), i.e. invoke kvm_offline_cpu() and thus do a double
disable (which again is benign because of hardware_enabled).

Ah, but registering syscore ops before doing cpuhp_setup_state() has another race,
and one that could be fatal.  If the system does suspend+resume before the cpuhp
hooks are registered, kvm_resume() would enable virtualization.  And then if
cpuhp_setup_state() failed, virtualization would be left enabled.

So cpuhp_setup_state() *must* come before register_syscore_ops(), and
register_syscore_ops() *must* come before the system_state check.
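
For the record, the ordering constraints mapped back onto the function
quoted above (an annotated sketch only, not a new proposal):

	int kvm_enable_virtualization(void)
	{
		int r;

		/*
		 * 1) cpuhp_setup_state() first: if it fails, nothing has been
		 * registered yet, so a suspend+resume can never leave
		 * virtualization enabled with no cpuhp hooks to undo it.
		 */
		r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
				      kvm_online_cpu, kvm_offline_cpu);
		if (r)
			return r;

		/*
		 * 2) register_syscore_ops() second: from this point on, KVM
		 * is guaranteed to be notified of a shutdown.
		 */
		register_syscore_ops(&kvm_syscore_ops);

		/*
		 * 3) Check system_state last: a shutdown that raced in before
		 * the syscore ops were registered is caught here and unwound;
		 * a double disable via kvm_offline_cpu() is benign thanks to
		 * the per-CPU hardware_enabled flag.
		 */
		if (system_state == SYSTEM_HALT ||
		    system_state == SYSTEM_POWER_OFF ||
		    system_state == SYSTEM_RESTART) {
			unregister_syscore_ops(&kvm_syscore_ops);
			cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
			return -EBUSY;
		}

		return 0;
	}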
Huang, Kai April 18, 2024, 11:09 p.m. UTC | #22
On 19/04/2024 2:30 am, Sean Christopherson wrote:
> On Thu, Apr 18, 2024, Kai Huang wrote:
>> On 18/04/2024 11:35 am, Sean Christopherson wrote:
>>> Ah, yeah.  Oh, duh.  I think the reason I didn't initially suggest late_hardware_setup()
>>> is that I was assuming/hoping TDX setup could be done after kvm_x86_vendor_init().
>>> E.g. in vt_init() or whatever it gets called:
>>>
>>> 	r = kvm_x86_vendor_init(...);
>>> 	if (r)
>>> 		return r;
>>>
>>> 	if (enable_tdx) {
>>> 		r = tdx_blah_blah_blah();
>>> 		if (r)
>>> 			goto vendor_exit;
>>> 	}
>>
>>
>> I assume the reason you introduced the late_hardware_setup() is purely
>> because you want to do:
>>
>>    cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_enable);
>>
>> after
>>
>>    kvm_ops_update()?
> 
> No, kvm_ops_update() needs to come before kvm_x86_enable_virtualization(), as the
> static_call() to hardware_enable() needs to be patched in.

Right.  I was saying that the reason you introduced
late_hardware_setup() was that we need to do
kvm_x86_virtualization_enabled() and the above
cpu_emergency_register_virt_callback() after kvm_ops_update().

> 
> Oh, and my adjusted patch is broken, the code to do the compat checks should NOT
> be removed; it could be removed if KVM unconditionally enabled VMX during setup,
> but it needs to stay in the !TDX case.

Right.

> 
> -       for_each_online_cpu(cpu) {
> -               smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &r, 1);
> -               if (r < 0)
> -                       goto out_unwind_ops;
> -       }
> 
> Which is another reason to defer kvm_x86_enable_virtualization(), though to be
> honest not a particularly compelling reason on its own.
> 
>> Anyway, we can also do 'enable_tdx' outside of kvm_x86_vendor_init() as
>> above, given it cannot be done in hardware_setup() anyway.
>>
>> If we do 'enable_tdx' in late_hardware_setup(), we will need a
>> kvm_x86_enable_virtualization_nolock(), but that's also not a problem to me.
>>
>> So which way do you prefer?
>>
>> Btw, with kvm_x86_virtualization_enable(), it seems the compatibility check
>> is lost, which I assume is OK?
> 
> Heh, and I obviously wasn't reading ahead :-)
> 
>> Btw2, currently tdx_enable() requires that cpus_read_lock() be called first.
>> If we do unconditional tdx_cpu_enable() in vt_hardware_enable(), then with
>> your proposal IIUC there's no such requirement anymore, because no task will
>> be scheduled to the new CPU before it reaches CPUHP_AP_ACTIVE.
> 
> Correct.
> 
>> But calling cpus_read_lock()/unlock() around tdx_enable() is also acceptable
>> to me.
> 
> No, that will deadlock as cpuhp_setup_state() does cpus_read_lock().

Right, but it takes cpus_read_lock()/unlock() internally.  I was talking 
about:

	if (enable_tdx) {
		kvm_x86_virtualization_enable();

		/*
		 * Unfortunately currently tdx_enable() internally has
		 * lockdep_assert_cpus_held().
		 */
		cpus_read_lock();
		tdx_enable();
		cpus_read_unlock();
	}
	
> 
>>>>> +int kvm_enable_virtualization(void)
>>>>>     {
>>>>> +	int r;
>>>>> +
>>>>> +	r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
>>>>> +			      kvm_online_cpu, kvm_offline_cpu);
>>>>> +	if (r)
>>>>> +		return r;
>>>>> +
>>>>> +	register_syscore_ops(&kvm_syscore_ops);
>>>>> +
>>>>> +	/*
>>>>> +	 * Manually undo virtualization enabling if the system is going down.
>>>>> +	 * If userspace initiated a forced reboot, e.g. reboot -f, then it's
>>>>> +	 * possible for an in-flight module load to enable virtualization
>>>>> +	 * after syscore_shutdown() is called, i.e. without kvm_shutdown()
>>>>> +	 * being invoked.  Note, this relies on system_state being set _before_
>>>>> +	 * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked
>>>>> +	 * or this CPU observes the impending shutdown.  Which is why KVM uses
>>>>> +	 * a syscore ops hook instead of registering a dedicated reboot
>>>>> +	 * notifier (the latter runs before system_state is updated).
>>>>> +	 */
>>>>> +	if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
>>>>> +	    system_state == SYSTEM_RESTART) {
>>>>> +		unregister_syscore_ops(&kvm_syscore_ops);
>>>>> +		cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
>>>>> +		return -EBUSY;
>>>>> +	}
>>>>> +
>>>>
>>>> Aren't we also supposed to do:
>>>>
>>>> 	on_each_cpu(__kvm_enable_virtualization, NULL, 1);
>>>>
>>>> here?
>>>
>>> No, cpuhp_setup_state() invokes the callback, kvm_online_cpu(), on each CPU.
>>> I.e. KVM has been doing things the hard way by using cpuhp_setup_state_nocalls().
>>> That's part of the complexity I would like to get rid of.
>>
>> Ah, right :-)
>>
>> Btw, why couldn't we do the 'system_state' check at the very beginning of
>> this function?
> 
> We could, but we'd still need to check after, and adding a small bit of extra
> complexity just to try to catch a very rare situation isn't worth it.
> 
> To prevent races, system_state needs to be checked after register_syscore_ops(),
> because only once kvm_syscore_ops is registered is KVM guaranteed to get notified
> of a shutdown.
>
> And because the kvm_syscore_ops hooks disable virtualization, they should be called
> after cpuhp_setup_state().  That's not strictly required, as the per-CPU
> hardware_enabled flag will prevent true problems if the system enters shutdown
> state before KVM reaches cpuhp_setup_state().
> 
> Hmm, but the same edge case exists in the above flow.  If the system enters
> shutdown _just_ after register_syscore_ops(), KVM would see that in system_state
> and do cpuhp_remove_state(), i.e. invoke kvm_offline_cpu() and thus do a double
> disable (which again is benign because of hardware_enabled).
> 
> Ah, but registering syscore ops before doing cpuhp_setup_state() has another race,
> and one that could be fatal.  If the system does suspend+resume before the cpuhp
> hooks are registered, kvm_resume() would enable virtualization.  And then if
> cpuhp_setup_state() failed, virtualization would be left enabled.
> 
> So cpuhp_setup_state() *must* come before register_syscore_ops(), and
> register_syscore_ops() *must* come before the system_state check.

OK.  I guess I have to double check here to completely understand the 
races.  :-)

So I think we have consensus to go with the approach shown in your
second diff -- that is, to always enable virtualization during module
loading for all ARCHs other than x86, and for x86 to always enable
virtualization during module loading only for TDX.

Then how about "do kvm_x86_virtualization_enable()  within 
late_hardware_setup() in kvm_x86_vendor_init()"  vs "do 
kvm_x86_virtualization_enable() in TDX-specific code after 
kvm_x86_vendor_init()"?

Which do you prefer?
Sean Christopherson April 19, 2024, 5:23 p.m. UTC | #23
On Fri, Apr 19, 2024, Kai Huang wrote:
> On 19/04/2024 2:30 am, Sean Christopherson wrote:
> > No, that will deadlock as cpuhp_setup_state() does cpus_read_lock().
> 
> Right, but it takes cpus_read_lock()/unlock() internally.  I was talking
> about:
> 
> 	if (enable_tdx) {
> 		kvm_x86_virtualization_enable();
> 
> 		/*
> 		 * Unfortunately currently tdx_enable() internally has
> 		 * lockdep_assert_cpus_held().
> 		 */
> 		cpus_read_lock();
> 		tdx_enable();
> 		cpus_read_unlock();
> 	}

Ah.  Just have tdx_enable() do cpus_read_lock(), I suspect/assume the current
implementation was purely done in anticipation of KVM "needing" to do tdx_enable()
while holding cpu_hotplug_lock.

And tdx_enable() should also do its best to verify that the caller is post-VMXON:

	if (WARN_ON_ONCE(!(__read_cr4() & X86_CR4_VMXE)))
		return -EINVAL;

> > > Btw, why couldn't we do the 'system_state' check at the very beginning of
> > > this function?
> > 
> > We could, but we'd still need to check after, and adding a small bit of extra
> > complexity just to try to catch a very rare situation isn't worth it.
> > 
> > To prevent races, system_state needs to be checked after register_syscore_ops(),
> > because only once kvm_syscore_ops is registered is KVM guaranteed to get notified
> > of a shutdown.
> >
> > And because the kvm_syscore_ops hooks disable virtualization, they should be called
> > after cpuhp_setup_state().  That's not strictly required, as the per-CPU
> > hardware_enabled flag will prevent true problems if the system enters shutdown
> > state before KVM reaches cpuhp_setup_state().
> > 
> > Hmm, but the same edge case exists in the above flow.  If the system enters
> > shutdown _just_ after register_syscore_ops(), KVM would see that in system_state
> > and do cpuhp_remove_state(), i.e. invoke kvm_offline_cpu() and thus do a double
> > disable (which again is benign because of hardware_enabled).
> > 
> > Ah, but registering syscore ops before doing cpuhp_setup_state() has another race,
> > and one that could be fatal.  If the system does suspend+resume before the cpuhp
> > hooks are registered, kvm_resume() would enable virtualization.  And then if
> > cpuhp_setup_state() failed, virtualization would be left enabled.
> > 
> > So cpuhp_setup_state() *must* come before register_syscore_ops(), and
> > register_syscore_ops() *must* come before the system_state check.
> 
> OK.  I guess I have to double check here to completely understand the races.
> :-)
> 
> So I think we have consensus to go with the approach shown in your
> second diff -- that is, to always enable virtualization during module loading
> for all ARCHs other than x86, and for x86 to always enable
> virtualization during module loading only for TDX.

Assuming the other arch maintainers are ok with that approach.  If waiting until
a VM is created is desirable for other architectures, then we'll need to figure
out a plan b.  E.g. KVM arm64 doesn't support being built as a module, so enabling
hardware during initialization would mean virtualization is enabled for any kernel
that is built with CONFIG_KVM=y.

Actually, duh.  There's absolutely no reason to force other architectures to
choose when to enable virtualization.  As evidenced by the massaging to have x86
keep enabling virtualization on-demand for !TDX, the cleanups don't come from
enabling virtualization during module load, they come from registering cpuhp and
syscore ops when virtualization is enabled.

I.e. we can keep kvm_usage_count in common code, and just do exactly what I
proposed for kvm_x86_enable_virtualization().

I have patches to do this, and initial testing suggests they aren't wildly
broken.  I'll post them soon-ish, assuming nothing pops up in testing.  They are
clean enough that they can land in advance of TDX, e.g. in kvm-coco-queue even
before other architectures verify I didn't break them.

> Then how about "do kvm_x86_virtualization_enable()  within
> late_hardware_setup() in kvm_x86_vendor_init()"  vs "do
> kvm_x86_virtualization_enable() in TDX-specific code after
> kvm_x86_vendor_init()"?
> 
> Which do you prefer?

The latter, assuming it doesn't make the TDX code more complex than it needs to
be.  The fewer kvm_x86_ops hooks, the better.
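
To illustrate the latter, a rough sketch of what the TDX-specific path
could look like (tdx_hardware_setup() and the
kvm_x86_{enable,disable}_virtualization() helpers are placeholder names,
not settled API):

	static int __init vt_init(void)
	{
		int r;

		r = kvm_x86_vendor_init(&vt_init_ops);
		if (r)
			return r;

		if (enable_tdx) {
			/* Virtualization must be on before any SEAMCALL. */
			r = kvm_x86_enable_virtualization();
			if (r)
				goto err_vendor_exit;

			r = tdx_hardware_setup();
			if (r)
				goto err_disable_virt;
		}

		return 0;

	err_disable_virt:
		kvm_x86_disable_virtualization();
	err_vendor_exit:
		kvm_x86_vendor_exit();
		return r;
	}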
Huang, Kai April 22, 2024, 12:46 p.m. UTC | #24
On Fri, 2024-04-19 at 10:23 -0700, Sean Christopherson wrote:
> On Fri, Apr 19, 2024, Kai Huang wrote:
> > On 19/04/2024 2:30 am, Sean Christopherson wrote:
> > > No, that will deadlock as cpuhp_setup_state() does cpus_read_lock().
> > 
> > Right, but it takes cpus_read_lock()/unlock() internally.  I was talking
> > about:
> > 
> > 	if (enable_tdx) {
> > 		kvm_x86_virtualization_enable();
> > 
> > 		/*
> > 		 * Unfortunately currently tdx_enable() internally has
> > 		 * lockdep_assert_cpus_held().
> > 		 */
> > 		cpus_read_lock();
> > 		tdx_enable();
> > 		cpus_read_unlock();
> > 	}
> 
> Ah.  Just have tdx_enable() do cpus_read_lock(), I suspect/assume the current
> implementation was purely done in anticipation of KVM "needing" to do tdx_enable()
> while holding cpu_hotplug_lock.

Yeah.  It was implemented based on the assumption that KVM would do the
below sequence in hardware_setup():

	cpus_read_lock();
	on_each_cpu(vmxon_and_tdx_cpu_enable, NULL, 1);
	tdx_enable();
	on_each_cpu(vmxoff, NULL, 1);
	cpus_read_unlock();

> 
> And tdx_enable() should also do its best to verify that the caller is post-VMXON:
> 
> 	if (WARN_ON_ONCE(!(__read_cr4() & X86_CR4_VMXE)))
> 		return -EINVAL;

This won't be helpful, or at least isn't sufficient.

tdx_enable() can make SEAMCALLs on all online CPUs, so checking "the caller is
post-VMXON" isn't enough.  It needs "checking all online CPUs are in post-
VMXON with tdx_cpu_enable() having been done".

I didn't add such a check because it's not mandatory, i.e., the later
SEAMCALL will catch such a violation.

Btw, I noticed there's another problem, that is currently tdx_cpu_enable()
actually requires IRQs to be disabled.  Again, it was implemented based on
the assumption that it would be invoked via both on_each_cpu() and
kvm_online_cpu().

It was also implemented with the consideration that it could be called by
multiple in-kernel TDX users in parallel via both SMP call and in normal
context, so it was implemented to simply request the caller to make sure
it is called with IRQs disabled so it can be IRQ safe (it uses a percpu
variable to track whether TDH.SYS.LP.INIT has been done for the local cpu,
similar to the hardware_enabled percpu variable).
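
In other words, the contract looks roughly like this (a sketch mirroring
the description above; tdx_lp_init() stands in for the actual
TDH.SYS.LP.INIT SEAMCALL):

	static DEFINE_PER_CPU(bool, tdx_lp_initialized);

	int tdx_cpu_enable(void)
	{
		int ret;

		/* The percpu read-check-write below must be IRQ safe. */
		lockdep_assert_irqs_disabled();

		if (__this_cpu_read(tdx_lp_initialized))
			return 0;

		ret = tdx_lp_init();	/* per-cpu TDH.SYS.LP.INIT */
		if (ret)
			return ret;

		__this_cpu_write(tdx_lp_initialized, true);
		return 0;
	}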

> 
> > > > Btw, why couldn't we do the 'system_state' check at the very beginning of
> > > > this function?
> > > 
> > > We could, but we'd still need to check after, and adding a small bit of extra
> > > complexity just to try to catch a very rare situation isn't worth it.
> > > 
> > > To prevent races, system_state needs to be checked after register_syscore_ops(),
> > > because only once kvm_syscore_ops is registered is KVM guaranteed to get notified
> > > of a shutdown.
> > >
> > > And because the kvm_syscore_ops hooks disable virtualization, they should be called
> > > after cpuhp_setup_state().  That's not strictly required, as the per-CPU
> > > hardware_enabled flag will prevent true problems if the system enters shutdown
> > > state before KVM reaches cpuhp_setup_state().
> > > 
> > > Hmm, but the same edge case exists in the above flow.  If the system enters
> > > shutdown _just_ after register_syscore_ops(), KVM would see that in system_state
> > > and do cpuhp_remove_state(), i.e. invoke kvm_offline_cpu() and thus do a double
> > > disable (which again is benign because of hardware_enabled).
> > > 
> > > Ah, but registering syscore ops before doing cpuhp_setup_state() has another race,
> > > and one that could be fatal.  If the system does suspend+resume before the cpuhp
> > > hooks are registered, kvm_resume() would enable virtualization.  And then if
> > > cpuhp_setup_state() failed, virtualization would be left enabled.
> > > 
> > > So cpuhp_setup_state() *must* come before register_syscore_ops(), and
> > > register_syscore_ops() *must* come before the system_state check.
> > 
> > OK.  I guess I have to double check here to completely understand the races.
> > :-)
> > 
> > So I think we have consensus to go with the approach shown in your
> > second diff -- that is, to always enable virtualization during module loading
> > for all ARCHs other than x86, and for x86 to always enable
> > virtualization during module loading only for TDX.
> 
> Assuming the other arch maintainers are ok with that approach.  If waiting until
> a VM is created is desirable for other architectures, then we'll need to figure
> out a plan b.  E.g. KVM arm64 doesn't support being built as a module, so enabling
> hardware during initialization would mean virtualization is enabled for any kernel
> that is built with CONFIG_KVM=y.
> 
> Actually, duh.  There's absolutely no reason to force other architectures to
> choose when to enable virtualization.  As evidenced by the massaging to have x86
> keep enabling virtualization on-demand for !TDX, the cleanups don't come from
> > enabling virtualization during module load, they come from registering cpuhp and
> syscore ops when virtualization is enabled.
> 
> I.e. we can keep kvm_usage_count in common code, and just do exactly what I
> proposed for kvm_x86_enable_virtualization().

If so, then it looks like this is basically changing "cpuhp_setup_state_nocalls()
+ on_each_cpu()" to "cpuhp_setup_state()", and moving it along with
register_syscore_ops() to hardware_enable_all()?
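
I.e., roughly this change (a sketch of the two patterns only):

	/* Today: register hotplug hooks without invoking them, then IPI. */
	cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
				  kvm_online_cpu, kvm_offline_cpu);
	on_each_cpu(hardware_enable_nolock, NULL, 1);

	/*
	 * Proposed: one call that also invokes kvm_online_cpu() on every
	 * online CPU, done from hardware_enable_all() together with
	 * register_syscore_ops().
	 */
	cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
			  kvm_online_cpu, kvm_offline_cpu);
	register_syscore_ops(&kvm_syscore_ops);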

> 
> I have patches to do this, and initial testing suggests they aren't wildly
> broken.  I'll post them soon-ish, assuming nothing pops up in testing.  They are
> clean enough that they can land in advance of TDX, e.g. in kvm-coco-queue even
> before other architectures verify I didn't break them.

Good to know.  I'll do more TDX test with them after you send them out.

> 
> > Then how about "do kvm_x86_virtualization_enable()  within
> > late_hardware_setup() in kvm_x86_vendor_init()"  vs "do
> > kvm_x86_virtualization_enable() in TDX-specific code after
> > kvm_x86_vendor_init()"?
> > 
> > Which do you prefer?
> 
> The latter, assuming it doesn't make the TDX code more complex than it needs to
> be.  The fewer kvm_x86_ops hooks, the better.
> 

Agreed.
Sean Christopherson April 22, 2024, 4:54 p.m. UTC | #25
On Mon, Apr 22, 2024, Kai Huang wrote:
> On Fri, 2024-04-19 at 10:23 -0700, Sean Christopherson wrote:
> > On Fri, Apr 19, 2024, Kai Huang wrote:
> > And tdx_enable() should also do its best to verify that the caller is post-VMXON:
> > 
> > 	if (WARN_ON_ONCE(!(__read_cr4() & X86_CR4_VMXE)))
> > 		return -EINVAL;
> 
> This won't be helpful, or at least isn't sufficient.
> 
> tdx_enable() can make SEAMCALLs on all online CPUs, so checking "the caller is
> post-VMXON" isn't enough.  It needs "checking all online CPUs are in post-
> VMXON with tdx_cpu_enable() having been done".

I'm suggesting adding it in the corresponding code that does the actual SEAMCALL.

And the intent isn't to catch every possible problem.  As with many sanity checks,
the intent is to detect the most likely failure mode to make triaging and debugging
issues a bit easier.

> I didn't add such a check because it's not mandatory, i.e., the later
> SEAMCALL will catch such a violation.

Yeah, a sanity check definitely isn't mandatory, but I do think it would be
useful and worthwhile.  The code in question is relatively unique (enables VMX
at module load) and a rare operation, i.e. the cost of sanity checking CR4.VMXE
is meaningless.  If we do end up with a bug where a CPU fails to do VMXON, this
sanity check would give a decent chance of a precise report, whereas #UD on a
SEAMCALL will be less clearcut.

> Btw, I noticed there's another problem, that is currently tdx_cpu_enable()
> actually requires IRQs to be disabled.  Again, it was implemented based on
> the assumption that it would be invoked via both on_each_cpu() and
> kvm_online_cpu().
> 
> It was also implemented with the consideration that it could be called by
> multiple in-kernel TDX users in parallel via both SMP call and in normal
> context, so it was implemented to simply request the caller to make sure
> it is called with IRQs disabled so it can be IRQ safe (it uses a percpu
> variable to track whether TDH.SYS.LP.INIT has been done for the local cpu,
> similar to the hardware_enabled percpu variable).

Is this an actual problem, or is it just something that would need to be
updated in the TDX code to handle the change in direction?

> > Actually, duh.  There's absolutely no reason to force other architectures to
> > choose when to enable virtualization.  As evidenced by the massaging to have x86
> > keep enabling virtualization on-demand for !TDX, the cleanups don't come from
> > enabling virtualization during module load, they come from registering cpuhp and
> > syscore ops when virtualization is enabled.
> > 
> > I.e. we can keep kvm_usage_count in common code, and just do exactly what I
> > proposed for kvm_x86_enable_virtualization().
> 
> If so, then it looks like this is basically changing "cpuhp_setup_state_nocalls()
> + on_each_cpu()" to "cpuhp_setup_state()", and moving it along with
> register_syscore_ops() to hardware_enable_all()?

Yep.
Huang, Kai April 22, 2024, 10:47 p.m. UTC | #26
On Mon, 2024-04-22 at 09:54 -0700, Sean Christopherson wrote:
> On Mon, Apr 22, 2024, Kai Huang wrote:
> > On Fri, 2024-04-19 at 10:23 -0700, Sean Christopherson wrote:
> > > On Fri, Apr 19, 2024, Kai Huang wrote:
> > > And tdx_enable() should also do its best to verify that the caller is post-VMXON:
> > > 
> > > 	if (WARN_ON_ONCE(!(__read_cr4() & X86_CR4_VMXE)))
> > > 		return -EINVAL;
> > 
> > This won't be helpful, or at least isn't sufficient.
> > 
> > tdx_enable() can make SEAMCALLs on all online CPUs, so checking "the caller is
> > post-VMXON" isn't enough.  It needs "checking all online CPUs are in post-
> > VMXON with tdx_cpu_enable() having been done".
> 
> I'm suggesting adding it in the corresponding code that does the actual SEAMCALL.

The thing is, to check you will need to do additional things, like making
sure no scheduling would happen during "check + make SEAMCALL".  It doesn't
seem worth doing to me.

The intent of tdx_enable() was that the caller should make sure no new CPU
comes online (well, this can be relaxed if we move
cpuhp_setup_state() to hardware_enable_all()), and all existing online
CPUs are in post-VMXON with tdx_cpu_enable() having been done.

I think, if we ever need any check, the latter seems to be more
reasonable.

But if we allow new CPUs to come online during tdx_enable() (with your
enhancement to the hardware enabling), then I don't know how to make such a
check at the beginning of tdx_enable(), because doing

	on_each_cpu(check_seamcall_precondition, NULL, 1);

cannot catch any new CPU during tdx_enable().

> 
> And the intent isn't to catch every possible problem.  As with many sanity checks,
> the intent is to detect the most likely failure mode to make triaging and debugging
> issues a bit easier.

The SEAMCALL will literally return a unique error code to indicate the CPU
isn't in post-VMXON, or that tdx_cpu_enable() hasn't been done.  I think the
error code is already clear enough to pinpoint the problem (due to these
pre-SEAMCALL conditions not being met).

> 
> > I didn't add such a check because it's not mandatory, i.e., the later
> > SEAMCALL will catch such a violation.
> 
> Yeah, a sanity check definitely isn't mandatory, but I do think it would be
> useful and worthwhile.  The code in question is relatively unique (enables VMX
> at module load) and a rare operation, i.e. the cost of sanity checking CR4.VMXE
> is meaningless.  If we do end up with a bug where a CPU fails to do VMXON, this
> sanity check would give a decent chance of a precise report, whereas #UD on a
> SEAMCALL will be less clearcut.

If VMXON fails for any CPU then cpuhp_setup_state() will fail, preventing
KVM from being loaded.

And if it fails during kvm_online_cpu(), the new CPU will fail to online.

> 
> > Btw, I noticed there's another problem, that is currently tdx_cpu_enable()
> > actually requires IRQs to be disabled.  Again, it was implemented based on
> > the assumption that it would be invoked via both on_each_cpu() and
> > kvm_online_cpu().
> > 
> > It was also implemented with the consideration that it could be called by
> > multiple in-kernel TDX users in parallel via both SMP call and in normal
> > context, so it was implemented to simply request the caller to make sure
> > it is called with IRQs disabled so it can be IRQ safe (it uses a percpu
> > variable to track whether TDH.SYS.LP.INIT has been done for the local cpu,
> > similar to the hardware_enabled percpu variable).
> 
> Is this an actual problem, or is it just something that would need to be
> updated in the TDX code to handle the change in direction?

For now it isn't, because KVM is the solo user, and in KVM
hardware_enable_all() and kvm_online_cpu() use the kvm_lock mutex to make
hardware_enable_nolock() IPI safe.

I am not sure how TDX/SEAMCALL will be used in TDX Connect.

However I needed to consider KVM as a user, so I decided to just require it
to be called with IRQs disabled so I could know it is IRQ safe.

Back to the current tdx_enable() and tdx_cpu_enable(), my personal
preference is, of course, to keep the existing way, that is:

During module load:

	cpus_read_lock();
	tdx_enable();
	cpus_read_unlock();

and in kvm_online_cpu():

	local_irq_save();
	tdx_cpu_enable();
	local_irq_restore();

But given KVM is the solo user now, I am also fine with changing it if you
believe this is not acceptable.
Sean Christopherson April 23, 2024, 12:08 a.m. UTC | #27
On Mon, Apr 22, 2024, Kai Huang wrote:
> On Mon, 2024-04-22 at 09:54 -0700, Sean Christopherson wrote:
> > On Mon, Apr 22, 2024, Kai Huang wrote:
> > > On Fri, 2024-04-19 at 10:23 -0700, Sean Christopherson wrote:
> > > > On Fri, Apr 19, 2024, Kai Huang wrote:
> > > > And tdx_enable() should also do its best to verify that the caller is post-VMXON:
> > > > 
> > > > 	if (WARN_ON_ONCE(!(__read_cr4() & X86_CR4_VMXE)))
> > > > 		return -EINVAL;
> > > 
> > > This won't be helpful, or at least isn't sufficient.
> > > 
> > > tdx_enable() can make SEAMCALLs on all online CPUs, so checking "the caller is
> > > post-VMXON" isn't enough.  It needs "checking all online CPUs are in post-
> > > VMXON with tdx_cpu_enable() having been done".
> > 
> > I'm suggesting adding it in the corresponding code that does the actual SEAMCALL.
> 
> The thing is, to check you will need to do additional things, like making
> sure no scheduling would happen during "check + make SEAMCALL".  It doesn't
> seem worth doing to me.
> 
> The intent of tdx_enable() was that the caller should make sure no new CPU
> comes online (well, this can be relaxed if we move
> cpuhp_setup_state() to hardware_enable_all()), and all existing online
> CPUs are in post-VMXON with tdx_cpu_enable() having been done.
> 
> I think, if we ever need any check, the latter seems to be more
> reasonable.
> 
> But if we allow new CPUs to come online during tdx_enable() (with your
> enhancement to the hardware enabling), then I don't know how to make such a
> check at the beginning of tdx_enable(), because doing
> 
> 	on_each_cpu(check_seamcall_precondition, NULL, 1);
> 
> cannot catch any new CPU during tdx_enable().

Doh, we're talking past each other, because my initial suggestion was half-baked.

When I initially said "tdx_enable()", I didn't intend to literally mean just the
CPU that calls tdx_enable().  What I was trying to say was, when doing per-CPU
things when enabling TDX, sanity check that the current CPU has CR4.VMXE=1 before
doing a SEAMCALL.

Ah, and now I see where the disconnect is.  I was assuming tdx_enable() also did
TDH_SYS_LP_INIT, but that's in a separate chunk of code that's manually invoked
by KVM.  More below.

> > And the intent isn't to catch every possible problem.  As with many sanity checks,
> > the intent is to detect the most likely failure mode to make triaging and debugging
> > issues a bit easier.
> 
> The SEAMCALL will literally return a unique error code to indicate the CPU
> isn't in post-VMXON, or that tdx_cpu_enable() hasn't been done.  I think the
> error code is already clear enough to pinpoint the problem (due to these
> pre-SEAMCALL conditions not being met).

No, SEAMCALL #UDs if the CPU isn't post-VMXON.  I.e. the CPU doesn't make it to
the TDX Module to provide a unique error code, all KVM will see is a #UD.

> > > Btw, I noticed there's another problem, that is currently tdx_cpu_enable()
> > > actually requires IRQs to be disabled.  Again, it was implemented based on
> > > the assumption that it would be invoked via both on_each_cpu() and
> > > kvm_online_cpu().
> > > 
> > > It was also implemented with the consideration that it could be called by
> > > multiple in-kernel TDX users in parallel via both SMP call and in normal
> > > context, so it was implemented to simply request the caller to make sure
> > > it is called with IRQs disabled so it can be IRQ safe (it uses a percpu
> > > variable to track whether TDH.SYS.LP.INIT has been done for the local cpu,
> > > similar to the hardware_enabled percpu variable).
> > 
> > Is this an actual problem, or is it just something that would need to be
> > updated in the TDX code to handle the change in direction?
> 
> For now it isn't, because KVM is the solo user, and in KVM
> hardware_enable_all() and kvm_online_cpu() use the kvm_lock mutex to make
> hardware_enable_nolock() IPI safe.
> 
> I am not sure how TDX/SEAMCALL will be used in TDX Connect.
> 
> However I needed to consider KVM as a user, so I decided to just require it
> to be called with IRQs disabled so I could know it is IRQ safe.
> 
> Back to the current tdx_enable() and tdx_cpu_enable(), my personal
> preference is, of course, to keep the existing way, that is:
> 
> During module load:
> 
> 	cpus_read_lock();
> 	tdx_enable();
> 	cpus_read_unlock();
> 
> and in kvm_online_cpu():
> 
> 	local_irq_save();
> 	tdx_cpu_enable();
> 	local_irq_restore();
> 
> But given KVM is the solo user now, I am also fine with changing it if you
> believe this is not acceptable.

Looking more closely at the code, tdx_enable() needs to be called under
cpu_hotplug_lock to prevent *unplug*, i.e. to prevent the last CPU on a package
from being offlined.  I.e. that part's not optional.

And the root of the problem/confusion is that the APIs provided by the core kernel
are weird, which is really just a polite way of saying they are awful :-)

There is no reason to rely on the caller to take cpu_hotplug_lock, and definitely
no reason to rely on the caller to invoke tdx_cpu_enable() separately from invoking
tdx_enable().  I suspect they got that way because of KVM's unnecessarily complex
code, e.g. if KVM is already doing on_each_cpu() to do VMXON, then it's easy enough
to also do TDH_SYS_LP_INIT, so why do two IPIs?

But just because KVM "owns" VMXON doesn't mean the core kernel code should punt
TDX to KVM too.  If KVM relies on the cpuhp code to ensure all online CPUs are
post-VMXON, then the TDX code shapes up nicely and provides a single API to enable
TDX.  And then my CR4.VMXE=1 sanity check makes a _lot_ more sense.

Relative to some random version of the TDX patches, this is what I'm thinking:

---
 arch/x86/kvm/vmx/tdx.c      | 46 +++----------------
 arch/x86/virt/vmx/tdx/tdx.c | 89 ++++++++++++++++++-------------------
 2 files changed, 49 insertions(+), 86 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index a1d3ae09091c..137d08da43c3 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -3322,38 +3322,8 @@ bool tdx_is_vm_type_supported(unsigned long type)
 	return type == KVM_X86_TDX_VM;
 }
 
-struct tdx_enabled {
-	cpumask_var_t enabled;
-	atomic_t err;
-};
-
-static void __init tdx_on(void *_enable)
-{
-	struct tdx_enabled *enable = _enable;
-	int r;
-
-	r = vmx_hardware_enable();
-	if (!r) {
-		cpumask_set_cpu(smp_processor_id(), enable->enabled);
-		r = tdx_cpu_enable();
-	}
-	if (r)
-		atomic_set(&enable->err, r);
-}
-
-static void __init vmx_off(void *_enabled)
-{
-	cpumask_var_t *enabled = (cpumask_var_t *)_enabled;
-
-	if (cpumask_test_cpu(smp_processor_id(), *enabled))
-		vmx_hardware_disable();
-}
-
 int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
 {
-	struct tdx_enabled enable = {
-		.err = ATOMIC_INIT(0),
-	};
 	int max_pkgs;
 	int r = 0;
 	int i;
@@ -3409,17 +3379,11 @@ int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
 		goto out;
 	}
 
-	/* tdx_enable() in tdx_module_setup() requires cpus lock. */
-	cpus_read_lock();
-	on_each_cpu(tdx_on, &enable, true); /* TDX requires vmxon. */
-	r = atomic_read(&enable.err);
-	if (!r)
-		r = tdx_module_setup();
-	else
-		r = -EIO;
-	on_each_cpu(vmx_off, &enable.enabled, true);
-	cpus_read_unlock();
-	free_cpumask_var(enable.enabled);
+	r = tdx_enable();
+	if (r)
+		goto out;
+
+	r = tdx_module_setup();
 	if (r)
 		goto out;
 
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index d2b8f079a637..19897f736c47 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -139,49 +139,6 @@ static int try_init_module_global(void)
 	return sysinit_ret;
 }
 
-/**
- * tdx_cpu_enable - Enable TDX on local cpu
- *
- * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module
- * global initialization SEAMCALL if not done) on local cpu to make this
- * cpu be ready to run any other SEAMCALLs.
- *
- * Always call this function via IPI function calls.
- *
- * Return 0 on success, otherwise errors.
- */
-int tdx_cpu_enable(void)
-{
-	struct tdx_module_args args = {};
-	int ret;
-
-	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
-		return -ENODEV;
-
-	lockdep_assert_irqs_disabled();
-
-	if (__this_cpu_read(tdx_lp_initialized))
-		return 0;
-
-	/*
-	 * The TDX module global initialization is the very first step
-	 * to enable TDX.  Need to do it first (if hasn't been done)
-	 * before the per-cpu initialization.
-	 */
-	ret = try_init_module_global();
-	if (ret)
-		return ret;
-
-	ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
-	if (ret)
-		return ret;
-
-	__this_cpu_write(tdx_lp_initialized, true);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(tdx_cpu_enable);
-
 /*
  * Add a memory region as a TDX memory block.  The caller must make sure
  * all memory regions are added in address ascending order and don't
@@ -1201,6 +1158,43 @@ static int init_tdx_module(void)
 	goto out_put_tdxmem;
 }
 
+/**
+ * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module
+ * global initialization SEAMCALL if not done) on local cpu to make this
+ * cpu be ready to run any other SEAMCALLs.
+ */
+static void tdx_cpu_enable(void *__err)
+{
+	struct tdx_module_args args = {};
+	atomic_t *err = __err;
+	int ret;
+
+	if (__this_cpu_read(tdx_lp_initialized))
+		return;
+
+	if (WARN_ON_ONCE(!(__read_cr4() & X86_CR4_VMXE)))
+		goto failed;
+
+	/*
+	 * The TDX module global initialization is the very first step
+	 * to enable TDX.  Need to do it first (if hasn't been done)
+	 * before the per-cpu initialization.
+	 */
+	ret = try_init_module_global();
+	if (ret)
+		goto failed;
+
+	ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
+	if (ret)
+		goto failed;
+
+	__this_cpu_write(tdx_lp_initialized, true);
+	return;
+
+failed:
+	atomic_inc(err);
+}
+
 static int __tdx_enable(void)
 {
 	int ret;
@@ -1234,15 +1228,19 @@ static int __tdx_enable(void)
  */
 int tdx_enable(void)
 {
+	atomic_t err = ATOMIC_INIT(0);
 	int ret;
 
 	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
 		return -ENODEV;
 
-	lockdep_assert_cpus_held();
-
+	cpus_read_lock();
 	mutex_lock(&tdx_module_lock);
 
+	on_each_cpu(tdx_cpu_enable, &err, true);
+	if (atomic_read(&err))
+		tdx_module_status = TDX_MODULE_ERROR;
+
 	switch (tdx_module_status) {
 	case TDX_MODULE_UNINITIALIZED:
 		ret = __tdx_enable();
@@ -1258,6 +1256,7 @@ int tdx_enable(void)
 	}
 
 	mutex_unlock(&tdx_module_lock);
+	cpus_read_unlock();
 
 	return ret;
 }

base-commit: fde917bc1af3e1a440ab0cb0d9364f8da25b9e17
--
Huang, Kai April 23, 2024, 1:34 a.m. UTC | #28
> 
> > > And the intent isn't to catch every possible problem.  As with many sanity checks,
> > > the intent is to detect the most likely failure mode to make triaging and debugging
> > > issues a bit easier.
> > 
> > The SEAMCALL will literally return a unique error code to indicate the CPU
> > isn't in post-VMXON, or that tdx_cpu_enable() hasn't been done.  I think the
> > error code is already clear enough to pinpoint the problem (due to these
> > pre-SEAMCALL conditions not being met).
> 
> No, SEAMCALL #UDs if the CPU isn't post-VMXON.  I.e. the CPU doesn't make it to
> the TDX Module to provide a unique error code, all KVM will see is a #UD.

#UD is handled by the SEAMCALL assembly code.  Please see TDX_MODULE_CALL
assembly macro:

.Lseamcall_trap\@:
        /*
         * SEAMCALL caused #GP or #UD.  By reaching here RAX contains
         * the trap number.  Convert the trap number to the TDX error
         * code by setting TDX_SW_ERROR to the high 32-bits of RAX.
         *
         * Note cannot OR TDX_SW_ERROR directly to RAX as OR instruction
         * only accepts 32-bit immediate at most.
         */
        movq $TDX_SW_ERROR, %rdi
        orq  %rdi, %rax

	...
       
	_ASM_EXTABLE_FAULT(.Lseamcall\@, .Lseamcall_trap\@)
.endif  /* \host */

> 
> > > > Btw, I noticed there's another problem, that is currently tdx_cpu_enable()
> > > > actually requires IRQs to be disabled.  Again, it was implemented based on
> > > > the assumption that it would be invoked via both on_each_cpu() and
> > > > kvm_online_cpu().
> > > > 
> > > > It was also implemented with the consideration that it could be called by
> > > > multiple in-kernel TDX users in parallel via both SMP call and in normal
> > > > context, so it was implemented to simply request the caller to make sure
> > > > it is called with IRQs disabled so it can be IRQ safe (it uses a percpu
> > > > variable to track whether TDH.SYS.LP.INIT has been done for the local cpu,
> > > > similar to the hardware_enabled percpu variable).
> > > 
> > > Is this an actual problem, or is it just something that would need to be
> > > updated in the TDX code to handle the change in direction?
> > 
> > For now it isn't, because KVM is the solo user, and in KVM
> > hardware_enable_all() and kvm_online_cpu() use the kvm_lock mutex to make
> > hardware_enable_nolock() IPI safe.
> > 
> > I am not sure how TDX/SEAMCALL will be used in TDX Connect.
> > 
> > However I needed to consider KVM as a user, so I decided to just require it
> > to be called with IRQs disabled so I could know it is IRQ safe.
> > 
> > Back to the current tdx_enable() and tdx_cpu_enable(), my personal
> > preference is, of course, to keep the existing way, that is:
> > 
> > During module load:
> > 
> > 	cpus_read_lock();
> > 	tdx_enable();
> > 	cpus_read_unlock();
> > 
> > and in kvm_online_cpu():
> > 
> > 	local_irq_save();
> > 	tdx_cpu_enable();
> > 	local_irq_restore();
> > 
> > But given KVM is the solo user now, I am also fine with changing it if you
> > believe this is not acceptable.
> 
> Looking more closely at the code, tdx_enable() needs to be called under
> cpu_hotplug_lock to prevent *unplug*, i.e. to prevent the last CPU on a package
> from being offlined.  I.e. that part's not optional.

Yeah.  We can say that.  I almost forgot this :-)

> 
> And the root of the problem/confusion is that the APIs provided by the core kernel
> are weird, which is really just a polite way of saying they are awful :-)

Well, apologize for it :-)

> 
> There is no reason to rely on the caller to take cpu_hotplug_lock, and definitely
> no reason to rely on the caller to invoke tdx_cpu_enable() separately from invoking
> tdx_enable().  I suspect they got that way because of KVM's unnecessarily complex
> code, e.g. if KVM is already doing on_each_cpu() to do VMXON, then it's easy enough
> to also do TDH_SYS_LP_INIT, so why do two IPIs?

The main reason is we relaxed the TDH.SYS.LP.INIT to be called _after_ TDX
module initialization.  

Previously, TDH.SYS.LP.INIT had to be done on *ALL* CPUs that the
platform has (i.e., cpu_present_mask) right after TDH.SYS.INIT and before
any other SEAMCALLs.  This didn't quite work with (kernel software) CPU
hotplug, and it had problems dealing with things like SMT disable
mitigation:

https://lore.kernel.org/lkml/529a22d05e21b9218dc3f29c17ac5a176334cac1.camel@intel.com/T/#mf42fa2d68d6b98edcc2aae11dba3c2487caf3b8f

So the x86 maintainers requested to change this.  The original proposal
was to eliminate the entire TDH.SYS.INIT and TDH.SYS.LP.INIT:

https://lore.kernel.org/lkml/529a22d05e21b9218dc3f29c17ac5a176334cac1.camel@intel.com/T/#m78c0c48078f231e92ea1b87a69bac38564d46469

But somehow it wasn't feasible, and the result was we relaxed to allow
TDH.SYS.LP.INIT to be called after module initialization.

So we need a separate tdx_cpu_enable() for that.

> 
> But just because KVM "owns" VMXON doesn't mean the core kernel code should punt
> TDX to KVM too.  If KVM relies on the cpuhp code to ensure all online CPUs are
> > post-VMXON, then the TDX code shapes up nicely and provides a single API to enable
> TDX.  
> 

We could ask tdx_enable() to invoke tdx_cpu_enable() internally, but as
mentioned above we still need to have tdx_cpu_enable() as a separate API to
allow CPU hotplug to call it.

> And then my CR4.VMXE=1 sanity check makes a _lot_ more sense.

As replied above, the current SEAMCALL assembly returns a unique error code
for that:

	#define TDX_SEAMCALL_UD		(TDX_SW_ERROR | X86_TRAP_UD)
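
So the caller can already tell the #UD case apart by value, e.g.
(a sketch; tdh_sys_lp_init() is a stand-in for any SEAMCALL wrapper):

	u64 err = tdh_sys_lp_init();

	if (err == TDX_SEAMCALL_UD)
		pr_err("SEAMCALL #UD: CPU not post-VMXON\n");
	else if (err)
		pr_err("SEAMCALL failed: 0x%llx\n", err);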
Huang, Kai April 23, 2024, 1:45 a.m. UTC | #29
On Tue, 2024-04-23 at 13:34 +1200, Kai Huang wrote:
> > 
> > > > And the intent isn't to catch every possible problem.  As with many sanity checks,
> > > > the intent is to detect the most likely failure mode to make triaging and debugging
> > > > issues a bit easier.
> > > 
> > > The SEAMCALL will literally return a unique error code to indicate the CPU
> > > isn't in post-VMXON, or that tdx_cpu_enable() hasn't been done.  I think the
> > > error code is already clear enough to pinpoint the problem (due to these
> > > pre-SEAMCALL conditions not being met).
> > 
> > No, SEAMCALL #UDs if the CPU isn't post-VMXON.  I.e. the CPU doesn't make it to
> > the TDX Module to provide a unique error code, all KVM will see is a #UD.
> 
> #UD is handled by the SEAMCALL assembly code.  Please see TDX_MODULE_CALL
> assembly macro:
> 
> .Lseamcall_trap\@:
>         /*
>          * SEAMCALL caused #GP or #UD.  By reaching here RAX contains
>          * the trap number.  Convert the trap number to the TDX error
>          * code by setting TDX_SW_ERROR to the high 32-bits of RAX.
>          *
>          * Note cannot OR TDX_SW_ERROR directly to RAX as OR instruction
>          * only accepts 32-bit immediate at most.
>          */
>         movq $TDX_SW_ERROR, %rdi
>         orq  %rdi, %rax
> 
> 	...
>        
> 	_ASM_EXTABLE_FAULT(.Lseamcall\@, .Lseamcall_trap\@)
> .endif  /* \host */
> 
> > 
> > > > > Btw, I noticed there's another problem, that is currently tdx_cpu_enable()
> > > > > actually requires IRQs to be disabled.  Again, it was implemented based on
> > > > > the assumption that it would be invoked via both on_each_cpu() and
> > > > > kvm_online_cpu().
> > > > > 
> > > > > It was also implemented with the consideration that it could be called by
> > > > > multiple in-kernel TDX users in parallel via both SMP call and in normal
> > > > > context, so it was implemented to simply request the caller to make sure
> > > > > it is called with IRQs disabled so it can be IRQ safe (it uses a percpu
> > > > > variable to track whether TDH.SYS.LP.INIT has been done for the local cpu,
> > > > > similar to the hardware_enabled percpu variable).
> > > > 
> > > > Is this an actual problem, or is it just something that would need to be
> > > > updated in the TDX code to handle the change in direction?
> > > 
> > > For now it isn't, because KVM is the solo user, and in KVM
> > > hardware_enable_all() and kvm_online_cpu() use the kvm_lock mutex to make
> > > hardware_enable_nolock() IPI safe.
> > > 
> > > I am not sure how TDX/SEAMCALL will be used in TDX Connect.
> > > 
> > > However I needed to consider KVM as a user, so I decided to just require it
> > > to be called with IRQs disabled so I could know it is IRQ safe.
> > > 
> > > Back to the current tdx_enable() and tdx_cpu_enable(), my personal
> > > preference is, of course, to keep the existing way, that is:
> > > 
> > > During module load:
> > > 
> > > 	cpus_read_lock();
> > > 	tdx_enable();
> > > 	cpus_read_unlock();
> > > 
> > > and in kvm_online_cpu():
> > > 
> > > 	local_irq_save();
> > > 	tdx_cpu_enable();
> > > 	local_irq_restore();
> > > 
> > > But given KVM is the solo user now, I am also fine with changing it if you
> > > believe this is not acceptable.
> > 
> > Looking more closely at the code, tdx_enable() needs to be called under
> > cpu_hotplug_lock to prevent *unplug*, i.e. to prevent the last CPU on a package
> > from being offlined.  I.e. that part's not optional.
> 
> Yeah.  We can say that.  I almost forgot this :-)
> 
> > 
> > And the root of the problem/confusion is that the APIs provided by the core kernel
> > are weird, which is really just a polite way of saying they are awful :-)
> 
Well, apologies for it :-)
> 
> > 
> > There is no reason to rely on the caller to take cpu_hotplug_lock, and definitely
> > no reason to rely on the caller to invoke tdx_cpu_enable() separately from invoking
> > tdx_enable().  I suspect they got that way because of KVM's unnecessarily complex
> > code, e.g. if KVM is already doing on_each_cpu() to do VMXON, then it's easy enough
> > to also do TDH_SYS_LP_INIT, so why do two IPIs?
> 
> The main reason is we relaxed the TDH.SYS.LP.INIT to be called _after_ TDX
> module initialization.  
> 
> Previously, TDH.SYS.LP.INIT had to be done on *ALL* CPUs that the
> platform has (i.e., cpu_present_mask) right after TDH.SYS.INIT and before
> any other SEAMCALLs.  This didn't quite work with (kernel software) CPU
> hotplug, and it had problems dealing with things like SMT disable
> mitigation:
> 
> https://lore.kernel.org/lkml/529a22d05e21b9218dc3f29c17ac5a176334cac1.camel@intel.com/T/#mf42fa2d68d6b98edcc2aae11dba3c2487caf3b8f
> 
> So the x86 maintainers requested to change this.  The original proposal
> was to eliminate the entire TDH.SYS.INIT and TDH.SYS.LP.INIT:
> 
> https://lore.kernel.org/lkml/529a22d05e21b9218dc3f29c17ac5a176334cac1.camel@intel.com/T/#m78c0c48078f231e92ea1b87a69bac38564d46469
> 
> But somehow it wasn't feasible, and the result was we relaxed to allow
> TDH.SYS.LP.INIT to be called after module initialization.
> 
> So we need a separate tdx_cpu_enable() for that.

Btw, the ideal (or probably the final) plan is to handle tdx_cpu_enable()
in TDX's own CPU hotplug callback in the core-kernel and hide it from all
other in-kernel TDX users.  

Specifically:

1) that callback, e.g., tdx_online_cpu() will be placed _before_ any in-
kernel TDX users like KVM's callback.
2) In tdx_online_cpu(), we do VMXON + tdx_cpu_enable() + VMXOFF, and
return error in case of any error to prevent that cpu from going online.

That makes sure that, if TDX is supported by the platform, we basically
guarantee that all online CPUs are ready to issue SEAMCALLs (of course, the in-
kernel TDX user still needs to do VMXON for it, but that's TDX user's
responsibility).

But that obviously requires moving VMXON to the core-kernel.

Currently, exporting tdx_cpu_enable() as a separate API and requiring KVM to
call it explicitly is a temporary solution.

That being said, we could do tdx_cpu_enable() inside tdx_enable(), but I
don't see that it's a better idea.
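
For reference, a sketch of that ideal plan (vmxon()/vmxoff() are
placeholders for core-kernel VMXON support, which doesn't exist yet):

	/* Ordered before any in-kernel TDX user's hotplug callback. */
	static int tdx_online_cpu(unsigned int cpu)
	{
		unsigned long flags;
		int ret;

		if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
			return 0;

		local_irq_save(flags);
		ret = vmxon();
		if (!ret) {
			ret = tdx_cpu_enable();
			vmxoff();
		}
		local_irq_restore(flags);

		/* A non-zero return prevents the CPU from coming online. */
		return ret;
	}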
Sean Christopherson April 23, 2024, 3:15 p.m. UTC | #30
On Tue, Apr 23, 2024, Kai Huang wrote:
> On Tue, 2024-04-23 at 13:34 +1200, Kai Huang wrote:
> > > 
> > > > > And the intent isn't to catch every possible problem.  As with many sanity checks,
> > > > > the intent is to detect the most likely failure mode to make triaging and debugging
> > > > > issues a bit easier.
> > > > 
> > > > The SEAMCALL will literally return a unique error code to indicate the CPU
> > > > isn't in post-VMXON, or that tdx_cpu_enable() hasn't been done.  I think the
> > > > error code is already clear enough to pinpoint the problem (due to these
> > > > pre-SEAMCALL conditions not being met).
> > > 
> > > No, SEAMCALL #UDs if the CPU isn't post-VMXON.  I.e. the CPU doesn't make it to
> > > the TDX Module to provide a unique error code, all KVM will see is a #UD.
> > 
> > #UD is handled by the SEAMCALL assembly code.  Please see TDX_MODULE_CALL
> > assembly macro:

Right, but that doesn't say why the #UD occurred.  The macro dresses it up in
TDX_SW_ERROR so that KVM only needs a single parser, but at the end of the day
KVM is still only going to see that SEAMCALL hit a #UD.

> > > There is no reason to rely on the caller to take cpu_hotplug_lock, and definitely
> > > no reason to rely on the caller to invoke tdx_cpu_enable() separately from invoking
> > > tdx_enable().  I suspect they got that way because of KVM's unnecessarily complex
> > > code, e.g. if KVM is already doing on_each_cpu() to do VMXON, then it's easy enough
> > > to also do TDH_SYS_LP_INIT, so why do two IPIs?
> > 
> > The main reason is we relaxed the TDH.SYS.LP.INIT to be called _after_ TDX
> > module initialization.  
> > 
> > Previously, TDH.SYS.LP.INIT had to be done on *ALL* CPUs that the
> > platform has (i.e., cpu_present_mask) right after TDH.SYS.INIT and before
> > any other SEAMCALLs.  This didn't quite work with (kernel software) CPU
> > hotplug, and it had problems dealing with things like SMT disable
> > mitigation:
> > 
> > https://lore.kernel.org/lkml/529a22d05e21b9218dc3f29c17ac5a176334cac1.camel@intel.com/T/#mf42fa2d68d6b98edcc2aae11dba3c2487caf3b8f
> > 
> > So the x86 maintainers requested to change this.  The original proposal
> > was to eliminate the entire TDH.SYS.INIT and TDH.SYS.LP.INIT:
> > 
> > https://lore.kernel.org/lkml/529a22d05e21b9218dc3f29c17ac5a176334cac1.camel@intel.com/T/#m78c0c48078f231e92ea1b87a69bac38564d46469
> > 
> > But somehow it wasn't feasible, and the result was we relaxed to allow
> > TDH.SYS.LP.INIT to be called after module initialization.
> > 
> > So we need a separate tdx_cpu_enable() for that.

No, you don't, at least not given the TDX patches I'm looking at.  Allowing
TDH.SYS.LP.INIT after module initialization makes sense because otherwise the
kernel would need to online all possible CPUs before initializing TDX.  But that
doesn't mean that the kernel needs to, or should, punt TDH.SYS.LP.INIT to KVM.

AFAICT, KVM is NOT doing TDH.SYS.LP.INIT when a CPU is onlined, only when KVM
is loaded, which means that tdx_enable() can process all online CPUs just as
easily as KVM.

Presumably that approach relies on something blocking onlining CPUs when TDX is
active.  And if that's not the case, the proposed patches are buggy.

> Btw, the ideal (or probably the final) plan is to handle tdx_cpu_enable()
> in TDX's own CPU hotplug callback in the core-kernel and hide it from all
> other in-kernel TDX users.  
> 
> Specifically:
> 
> 1) that callback, e.g., tdx_online_cpu() will be placed _before_ any in-
> kernel TDX users like KVM's callback.
> 2) In tdx_online_cpu(), we do VMXON + tdx_cpu_enable() + VMXOFF, and
> return error in case of any error to prevent that cpu from going online.
> 
> That makes sure that, if TDX is supported by the platform, we basically
> guarantee that all online CPUs are ready to issue SEAMCALLs (of course, the in-
> kernel TDX user still needs to do VMXON for it, but that's TDX user's
> responsibility).
> 
> But that obviously requires moving VMXON to the core-kernel.

It doesn't strictly have to be core kernel per se, just in code that sits below
KVM, e.g. in a separate module called VAC[*] ;-)

[*] https://lore.kernel.org/all/ZW6FRBnOwYV-UCkY@google.com

> Currently, exporting tdx_cpu_enable() as a separate API and requiring KVM to
> call it explicitly is a temporary solution.
> 
> That being said, we could do tdx_cpu_enable() inside tdx_enable(), but I
> don't see that it's a better idea.

It simplifies the API surface for enabling TDX and eliminates an export.
Huang, Kai April 23, 2024, 10:59 p.m. UTC | #31
On Tue, 2024-04-23 at 08:15 -0700, Sean Christopherson wrote:
> On Tue, Apr 23, 2024, Kai Huang wrote:
> > On Tue, 2024-04-23 at 13:34 +1200, Kai Huang wrote:
> > > > 
> > > > > > And the intent isn't to catch every possible problem.  As with many sanity checks,
> > > > > > the intent is to detect the most likely failure mode to make triaging and debugging
> > > > > > issues a bit easier.
> > > > > 
> > > > > The SEAMCALL will literally return a unique error code to indicate the CPU
> > > > > isn't in post-VMXON, or that tdx_cpu_enable() hasn't been done.  I think the
> > > > > error code is already clear enough to pinpoint the problem (due to these
> > > > > pre-SEAMCALL conditions not being met).
> > > > 
> > > > No, SEAMCALL #UDs if the CPU isn't post-VMXON.  I.e. the CPU doesn't make it to
> > > > the TDX Module to provide a unique error code, all KVM will see is a #UD.
> > > 
> > > #UD is handled by the SEAMCALL assembly code.  Please see TDX_MODULE_CALL
> > > assembly macro:
> 
> Right, but that doesn't say why the #UD occurred.  The macro dresses it up in
> TDX_SW_ERROR so that KVM only needs a single parser, but at the end of the day
> KVM is still only going to see that SEAMCALL hit a #UD.

Right.  But is there any problem here?  I thought the point was we can
just use the error code to tell what went wrong.
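
E.g., a rough sketch (assuming the TDX_SEAMCALL_UD/TDX_SEAMCALL_VMFAILINVALID
encodings from <asm/tdx.h> and the __seamcall()/TDH_SYS_LP_INIT bits from the
host-TDX series):

	struct tdx_module_args args = {};
	u64 err;

	/* TDX_MODULE_CALL dresses #UD/#GP up as software-defined error codes. */
	err = __seamcall(TDH_SYS_LP_INIT, &args);
	if (err == TDX_SEAMCALL_UD)
		pr_err("SEAMCALL #UD, CPU is likely not in post-VMXON\n");
	else if (err == TDX_SEAMCALL_VMFAILINVALID)
		pr_err("TDX module is not loaded\n");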

> 
> > > > There is no reason to rely on the caller to take cpu_hotplug_lock, and definitely
> > > > no reason to rely on the caller to invoke tdx_cpu_enable() separately from invoking
> > > > tdx_enable().  I suspect they got that way because of KVM's unnecessarily complex
> > > > code, e.g. if KVM is already doing on_each_cpu() to do VMXON, then it's easy enough
> > > > to also do TDH_SYS_LP_INIT, so why do two IPIs?
> > > 
> > > The main reason is we relaxed the TDH.SYS.LP.INIT to be called _after_ TDX
> > > module initialization.  
> > > 
> > > Previously, the TDH.SYS.LP.INIT must be done on *ALL* CPUs that the
> > > platform has (i.e., cpu_present_mask) right after TDH.SYS.INIT and before
> > > any other SEAMCALLs.  This didn't quite work with (kernel software) CPU
> > > hotplug, and it had problems dealing with things like SMT disable
> > > mitigation:
> > > 
> > > https://lore.kernel.org/lkml/529a22d05e21b9218dc3f29c17ac5a176334cac1.camel@intel.com/T/#mf42fa2d68d6b98edcc2aae11dba3c2487caf3b8f
> > > 
> > > So the x86 maintainers requested to change this.  The original proposal
> > > was to eliminate the entire TDH.SYS.INIT and TDH.SYS.LP.INIT:
> > > 
> > > https://lore.kernel.org/lkml/529a22d05e21b9218dc3f29c17ac5a176334cac1.camel@intel.com/T/#m78c0c48078f231e92ea1b87a69bac38564d46469
> > > 
> > > But somehow it wasn't feasible, and the result was we relaxed to allow
> > > TDH.SYS.LP.INIT to be called after module initialization.
> > > 
> > > So we need a separate tdx_cpu_enable() for that.
> 
> No, you don't, at least not given the TDX patches I'm looking at.  Allowing
> TDH.SYS.LP.INIT after module initialization makes sense because otherwise the
> kernel would need to online all possible CPUs before initializing TDX.  But that
> doesn't mean that the kernel needs to, or should, punt TDH.SYS.LP.INIT to KVM.
> 
> AFAICT, KVM is NOT doing TDH.SYS.LP.INIT when a CPU is onlined, only when KVM
> is loaded, which means that tdx_enable() can process all online CPUs just as
> easily as KVM.

Hmm.. I assumed kvm_online_cpu() will do VMXON + tdx_cpu_enable().

> 
> Presumably that approach relies on something blocking onlining CPUs when TDX is
> active.  And if that's not the case, the proposed patches are buggy.

The current patch ([PATCH 023/130] KVM: TDX: Initialize the TDX module
when loading the KVM intel kernel module) indeed is buggy, but I don't
quite follow why we need to block onlining CPUs when TDX is active?

There's nothing hard that prevents us from doing so.  KVM just needs to do
VMXON + tdx_cpu_enable() inside kvm_online_cpu().

> 
> > Btw, the ideal (or probably the final) plan is to handle tdx_cpu_enable()
> > in TDX's own CPU hotplug callback in the core-kernel and hide it from all
> > other in-kernel TDX users.  
> > 
> > Specifically:
> > 
> > 1) that callback, e.g., tdx_online_cpu() will be placed _before_ any in-
> > kernel TDX users like KVM's callback.
> > 2) In tdx_online_cpu(), we do VMXON + tdx_cpu_enable() + VMXOFF, and
> > return error in case of any error to prevent that cpu from going online.
> > 
> > That makes sure that, if TDX is supported by the platform, we basically
> > guarantee all online CPUs are ready to issue SEAMCALL (of course, the in-
> > kernel TDX user still needs to do VMXON for it, but that's TDX user's
> > responsibility).
> > 
> > But that obviously needs to move VMXON to the core-kernel.
> 
> It doesn't strictly have to be core kernel per se, just in code that sits below
> KVM, e.g. in a separate module called VAC[*] ;-)
> 
> [*] https://lore.kernel.org/all/ZW6FRBnOwYV-UCkY@google.com

Could you elaborate why vac.ko is necessary?

Being a module naturally we will need to handle module init and exit.  But
TDX cannot be disabled and re-enabled after initialization, so in general
the vac.ko doesn't quite fit for TDX.

And I am not sure what's the fundamental difference between managing TDX
module in a module vs in the core-kernel from KVM's perspective.

> 
> > Currently, export tdx_cpu_enable() as a separate API and require KVM to
> > call it explicitly is a temporary solution.
> > 
> > That being said, we could do tdx_cpu_enable() inside tdx_enable(), but I
> > don't see it's a better idea.
> 
> It simplifies the API surface for enabling TDX and eliminates an export.

I was surprised to see we want to prevent onlining CPUs when TDX is active.
Huang, Kai April 23, 2024, 11:29 p.m. UTC | #32
On Tue, 2024-04-23 at 22:59 +0000, Huang, Kai wrote:
> > Right, but that doesn't say why the #UD occurred.  The macro dresses it up in
> > TDX_SW_ERROR so that KVM only needs a single parser, but at the end of the day
> > KVM is still only going to see that SEAMCALL hit a #UD.
> 
> Right.  But is there any problem here?  I thought the point was we can
> just use the error code to tell what went wrong.

Oh, I guess I was replying too quickly.  From the spec, #UD happens when

	IF not in VMX operation or inSMM or inSEAM or 
			((IA32_EFER.LMA & CS.L) == 0)
 		THEN #UD;

Are you worried that the #UD was caused by cases other than "not in
VMX operation"?

But it's quite obvious the other 3 cases are not possible, correct?
Sean Christopherson April 25, 2024, 4:30 p.m. UTC | #33
On Tue, Apr 23, 2024, Kai Huang wrote:
> On Tue, 2024-04-23 at 22:59 +0000, Huang, Kai wrote:
> > > Right, but that doesn't say why the #UD occurred.  The macro dresses it up in
> > > TDX_SW_ERROR so that KVM only needs a single parser, but at the end of the day
> > > KVM is still only going to see that SEAMCALL hit a #UD.
> > 
> > Right.  But is there any problem here?  I thought the point was we can
> > just use the error code to tell what went wrong.
> 
> Oh, I guess I was replying too quickly.  From the spec, #UD happens when
> 
> 	IF not in VMX operation or inSMM or inSEAM or 
> 			((IA32_EFER.LMA & CS.L) == 0)
>  		THEN #UD;
> 
> Are you worried that the #UD was caused by cases other than "not in
> VMX operation"?

Yes.
 
> But it's quite obvious the other 3 cases are not possible, correct?

The spec I'm looking at also has:

	If IA32_VMX_PROCBASED_CTLS3[5] is 0.

And anecdotally, I know of at least one crash in our production environment where
a VMX instruction hit a seemingly spurious #UD, i.e. it's not impossible for a
ucode bug or hardware defect to cause problems.  That's obviously _extremely_
unlikely, but that's why I emphasized that sanity checking CR4.VMXE is cheap.
Practically speaking it costs nothing, so IMO it's worth adding even if the odds
of it ever being helpful are one-in-a-million.
Sean Christopherson April 25, 2024, 4:35 p.m. UTC | #34
On Tue, Apr 23, 2024, Kai Huang wrote:
> On Tue, 2024-04-23 at 08:15 -0700, Sean Christopherson wrote:
> > Presumably that approach relies on something blocking onlining CPUs when TDX is
> > active.  And if that's not the case, the proposed patches are buggy.
> 
> The current patch ([PATCH 023/130] KVM: TDX: Initialize the TDX module
> when loading the KVM intel kernel module) indeed is buggy, but I don't
> quite follow why we need to block onlining CPU  when TDX is active?

I was saying that based on my reading of the code, either (a) the code is buggy
or (b) something blocks onlining CPUs when TDX is active.  Sounds like the answer
is (a).

> There's no hard things that prevent us to do so.  KVM just need to do
> VMXON + tdx_cpu_enable() inside kvm_online_cpu().
> 
> > 
> > > Btw, the ideal (or probably the final) plan is to handle tdx_cpu_enable()
> > > in TDX's own CPU hotplug callback in the core-kernel and hide it from all
> > > other in-kernel TDX users.  
> > > 
> > > Specifically:
> > > 
> > > 1) that callback, e.g., tdx_online_cpu() will be placed _before_ any in-
> > > kernel TDX users like KVM's callback.
> > > 2) In tdx_online_cpu(), we do VMXON + tdx_cpu_enable() + VMXOFF, and
> > > return error in case of any error to prevent that cpu from going online.
> > > 
> > > That makes sure that, if TDX is supported by the platform, we basically
> > > guarantee all online CPUs are ready to issue SEAMCALL (of course, the in-
> > > kernel TDX user still needs to do VMXON for it, but that's TDX user's
> > > responsibility).
> > > 
> > > But that obviously needs to move VMXON to the core-kernel.
> > 
> > It doesn't strictly have to be core kernel per se, just in code that sits below
> > KVM, e.g. in a separate module called VAC[*] ;-)
> > 
> > [*] https://lore.kernel.org/all/ZW6FRBnOwYV-UCkY@google.com
> 
> Could you elaborate why vac.ko is necessary?
> 
> Being a module naturally we will need to handle module init and exit.  But
> TDX cannot be disabled and re-enabled after initialization, so in general
> the vac.ko doesn't quite fit for TDX.
> 
> And I am not sure what's the fundamental difference between managing TDX
> module in a module vs in the core-kernel from KVM's perspective.

VAC isn't strictly necessary.  What I was saying is that it's not strictly
necessary for the core kernel to handle VMXON either.  I.e. it could be done in
something like VAC, or it could be done in the core kernel.

The important thing is that they're handled by _one_ entity.  What we have today
is probably the worst setup; VMXON is handled by KVM, but TDH.SYS.LP.INIT is
handled by core kernel (sort of).
Huang, Kai April 25, 2024, 9:53 p.m. UTC | #35
On Thu, 2024-04-25 at 09:35 -0700, Sean Christopherson wrote:
> On Tue, Apr 23, 2024, Kai Huang wrote:
> > On Tue, 2024-04-23 at 08:15 -0700, Sean Christopherson wrote:
> > > Presumably that approach relies on something blocking onlining CPUs when TDX is
> > > active.  And if that's not the case, the proposed patches are buggy.
> > 
> > The current patch ([PATCH 023/130] KVM: TDX: Initialize the TDX module
> > when loading the KVM intel kernel module) indeed is buggy, but I don't
> > quite follow why we need to block onlining CPU  when TDX is active?
> 
> I was saying that based on my reading of the code, either (a) the code is buggy
> or (b) something blocks onlining CPUs when TDX is active.  Sounds like the answer
> is (a).

Yeah it's a).

> 
> > There's no hard things that prevent us to do so.  KVM just need to do
> > VMXON + tdx_cpu_enable() inside kvm_online_cpu().
> > 
> > > 
> > > > Btw, the ideal (or probably the final) plan is to handle tdx_cpu_enable()
> > > > in TDX's own CPU hotplug callback in the core-kernel and hide it from all
> > > > other in-kernel TDX users.  
> > > > 
> > > > Specifically:
> > > > 
> > > > 1) that callback, e.g., tdx_online_cpu() will be placed _before_ any in-
> > > > kernel TDX users like KVM's callback.
> > > > 2) In tdx_online_cpu(), we do VMXON + tdx_cpu_enable() + VMXOFF, and
> > > > return error in case of any error to prevent that cpu from going online.
> > > > 
> > > > That makes sure that, if TDX is supported by the platform, we basically
> > > > guarantee all online CPUs are ready to issue SEAMCALL (of course, the in-
> > > > kernel TDX user still needs to do VMXON for it, but that's TDX user's
> > > > responsibility).
> > > > 
> > > > But that obviously needs to move VMXON to the core-kernel.
> > > 
> > > It doesn't strictly have to be core kernel per se, just in code that sits below
> > > KVM, e.g. in a separate module called VAC[*] ;-)
> > > 
> > > [*] https://lore.kernel.org/all/ZW6FRBnOwYV-UCkY@google.com
> > 
> > Could you elaborate why vac.ko is necessary?
> > 
> > Being a module naturally we will need to handle module init and exit.  But
> > TDX cannot be disabled and re-enabled after initialization, so in general
> > the vac.ko doesn't quite fit for TDX.
> > 
> > And I am not sure what's the fundamental difference between managing TDX
> > module in a module vs in the core-kernel from KVM's perspective.
> 
> VAC isn't strictly necessary.  What I was saying is that it's not strictly
> necessary for the core kernel to handle VMXON either.  I.e. it could be done in
> something like VAC, or it could be done in the core kernel.

Right, but so far I cannot see any advantage of using a VAC module,
perhaps I am missing something, though.

> 
> The important thing is that they're handled by _one_ entity.  What we have today
> is probably the worst setup; VMXON is handled by KVM, but TDH.SYS.LP.INIT is
> handled by core kernel (sort of).

I cannot argue against this :-)

But from this point of view, I cannot see a difference between tdx_enable()
and tdx_cpu_enable(), because they are both in the core-kernel while depending
on KVM to handle VMXON.

Or, do you prefer that we move VMXON to the core-kernel at this stage, i.e.,
as preparatory work for KVM TDX?
Sean Christopherson April 25, 2024, 10:07 p.m. UTC | #36
On Thu, Apr 25, 2024, Kai Huang wrote:
> On Thu, 2024-04-25 at 09:35 -0700, Sean Christopherson wrote:
> > On Tue, Apr 23, 2024, Kai Huang wrote:
> > > On Tue, 2024-04-23 at 08:15 -0700, Sean Christopherson wrote:
> > > > Presumably that approach relies on something blocking onlining CPUs when TDX is
> > > > active.  And if that's not the case, the proposed patches are buggy.
> > > 
> > > The current patch ([PATCH 023/130] KVM: TDX: Initialize the TDX module
> > > when loading the KVM intel kernel module) indeed is buggy, but I don't
> > > quite follow why we need to block onlining CPU  when TDX is active?
> > 
> > I was saying that based on my reading of the code, either (a) the code is buggy
> > or (b) something blocks onlining CPUs when TDX is active.  Sounds like the answer
> > is (a).
> 
> Yeah it's a).

...

> > > Being a module naturally we will need to handle module init and exit.  But
> > > TDX cannot be disabled and re-enabled after initialization, so in general
> > > the vac.ko doesn't quite fit for TDX.
> > > 
> > > And I am not sure what's the fundamental difference between managing TDX
> > > module in a module vs in the core-kernel from KVM's perspective.
> > 
> > VAC isn't strictly necessary.  What I was saying is that it's not strictly
> > necessary for the core kernel to handle VMXON either.  I.e. it could be done in
> > something like VAC, or it could be done in the core kernel.
> 
> Right, but so far I cannot see any advantage of using a VAC module,
> perhaps I am missing something, though.

Yeah, there's probably no big advantage.

> > The important thing is that they're handled by _one_ entity.  What we have today
> > is probably the worst setup; VMXON is handled by KVM, but TDH.SYS.LP.INIT is
> > handled by core kernel (sort of).
> 
> I cannot argue against this :-)
> 
> But from this point of view, I cannot see a difference between tdx_enable()
> and tdx_cpu_enable(), because they are both in the core-kernel while depending
> on KVM to handle VMXON.

My comments were made under the assumption that the code was NOT buggy, i.e. if
KVM did NOT need to call tdx_cpu_enable() independent of tdx_enable().

That said, I do think it makes sense to have tdx_enable() call a private/inner version,
e.g. __tdx_cpu_enable(), and then have KVM call a public version.  Alternatively,
the kernel could register yet another cpuhp hook that runs after KVM's, i.e. does
TDH.SYS.LP.INIT after KVM has done VMXON (if TDX has been enabled).
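
E.g. something like this (pure sketch; init_tdx_module() stands in for the real
module initialization, __seamcall()/TDH_SYS_LP_INIT are assumed from the
host-TDX series, and error handling is boiled down to an atomic flag):

	static atomic_t lp_init_err = ATOMIC_INIT(0);

	/* Inner version: TDH.SYS.LP.INIT on this CPU, IRQs disabled. */
	static int __tdx_cpu_enable(void)
	{
		struct tdx_module_args args = {};

		lockdep_assert_irqs_disabled();

		return __seamcall(TDH_SYS_LP_INIT, &args) ? -EIO : 0;
	}

	/* Public version, for KVM's cpuhp callback. */
	int tdx_cpu_enable(void)
	{
		return __tdx_cpu_enable();
	}
	EXPORT_SYMBOL_GPL(tdx_cpu_enable);

	static void do_lp_init(void *unused)
	{
		if (__tdx_cpu_enable())
			atomic_inc(&lp_init_err);
	}

	int tdx_enable(void)
	{
		lockdep_assert_cpus_held();

		/* LP.INIT all online CPUs before initializing the module. */
		atomic_set(&lp_init_err, 0);
		on_each_cpu(do_lp_init, NULL, true);
		if (atomic_read(&lp_init_err))
			return -EIO;

		return init_tdx_module();
	}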

> Or, do you prefer that we move VMXON to the core-kernel at this stage, i.e.,
> as preparatory work for KVM TDX? 

No, the amount of churn/work that would create is high, and TDX is already taking
on enough churn.
Huang, Kai April 25, 2024, 10:34 p.m. UTC | #37
On Thu, 2024-04-25 at 09:30 -0700, Sean Christopherson wrote:
> On Tue, Apr 23, 2024, Kai Huang wrote:
> > On Tue, 2024-04-23 at 22:59 +0000, Huang, Kai wrote:
> > > > Right, but that doesn't say why the #UD occurred.  The macro dresses it up in
> > > > TDX_SW_ERROR so that KVM only needs a single parser, but at the end of the day
> > > > KVM is still only going to see that SEAMCALL hit a #UD.
> > > 
> > > Right.  But is there any problem here?  I thought the point was we can
> > > just use the error code to tell what went wrong.
> > 
> > Oh, I guess I was replying too quickly.  From the spec, #UD happens when
> > 
> > 	IF not in VMX operation or inSMM or inSEAM or 
> > 			((IA32_EFER.LMA & CS.L) == 0)
> >  		THEN #UD;
> > 
> > Are you worried that the #UD was caused by cases other than "not in
> > VMX operation"?
> 
> Yes.
>  
> > But it's quite obvious the other 3 cases are not possible, correct?
> 
> The spec I'm looking at also has:
> 
> 	If IA32_VMX_PROCBASED_CTLS3[5] is 0.

Ah, now I see this too.

It's not in the pseudo code of the SEAMCALL instruction, but in the "64-Bit 
Mode Exceptions" section, which comes after the pseudo code.

And this bit 5 reports the capability to control the "shared
bit" in the 5-level EPT.

> 
> And anecdotally, I know of at least one crash in our production environment where
> a VMX instruction hit a seemingly spurious #UD, i.e. it's not impossible for a
> ucode bug or hardware defect to cause problems.  That's obviously _extremely_
> unlikely, but that's why I emphasized that sanity checking CR4.VMXE is cheap.

Yeah I agree it could happen although very unlikely.

But just to be sure:

I believe the #UD itself doesn't crash the kernel/machine; shouldn't the
crash come from the kernel being unable to handle the #UD in such a case?

If so, I am not sure whether the CR4.VMXE check can make the kernel any
safer, because we can already handle the #UD for the SEAMCALL instruction.

Yeah we can clearly dump a message saying "CPU isn't in VMX operation" and
return failure if we have the check, but if we don't, the worst situation
is we might mistakenly report "CPU isn't in VMX operation" (the current code
just treats #UD as the CPU not being in VMX operation) when the CPU doesn't
support IA32_VMX_PROCBASED_CTLS3[5].

And for the IA32_VMX_PROCBASED_CTLS3[5] we can easily do some pre-check in
KVM code during module loading to rule out this case.

And in practice, I even believe the BIOS cannot turn on TDX if the
IA32_VMX_PROCBASED_CTLS3[5] is not supported.  I can check on this.
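
E.g., roughly (sketch only; MSR_IA32_VMX_PROCBASED_CTLS3 reports the allowed-1
settings of the tertiary exec controls, bit 5 is taken from the spec text
quoted above, and the helper name is made up):

	static bool vt_seamcall_wont_ud(void)
	{
		u64 ctls3;

		/* No tertiary exec controls at all? */
		if (rdmsrl_safe(MSR_IA32_VMX_PROCBASED_CTLS3, &ctls3))
			return false;

		/* Bit 5 clear means SEAMCALL #UDs even in VMX operation. */
		return ctls3 & BIT_ULL(5);
	}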

> Practically speaking it costs nothing, so IMO it's worth adding even if the odds
> of it ever being helpful are one-in-a-million.

I think we will need to do something like the below somewhere in the common
SEAMCALL function:

	unsigned long flags;
	int ret = -EINVAL;

	local_irq_save(flags);

	if (WARN_ON_ONCE(!(__read_cr4() & X86_CR4_VMXE)))
		goto out;

	ret = seamcall();
out:
	local_irq_restore(flags);
	return ret;

to make it IRQ safe.

And the odd thing is that currently the common SEAMCALL functions, a.k.a.,
__seamcall() and seamcall() (the latter is actually a macro), both return
u64, so if we want to have such CR4.VMXE check code in the common code, we
need to invent a new error code for it.
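
E.g., a hypothetical code next to the existing TDX_SEAMCALL_UD/TDX_SEAMCALL_GP
ones (the name and value below are made up):

	/* Made-up error code: the wrapper detected the CPU is not post-VMXON. */
	#define TDX_SW_NO_VMXE	(TDX_SW_ERROR | _UL(0xFFFF0002))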

That being said, although I agree it can make the code a little bit
clearer, I am not sure whether it can make the code any safer -- even w/o
it, the worst case is incorrectly reporting "CPU is not in VMX operation",
but it shouldn't crash the kernel etc.
Sean Christopherson April 25, 2024, 10:43 p.m. UTC | #38
On Thu, Apr 25, 2024, Kai Huang wrote:
> On Thu, 2024-04-25 at 09:30 -0700, Sean Christopherson wrote:
> > On Tue, Apr 23, 2024, Kai Huang wrote:
> > And anecdotally, I know of at least one crash in our production environment where
> > a VMX instruction hit a seemingly spurious #UD, i.e. it's not impossible for a
> > ucode bug or hardware defect to cause problems.  That's obviously _extremely_
> > unlikely, but that's why I emphasized that sanity checking CR4.VMXE is cheap.
> 
> Yeah I agree it could happen although very unlikely.
> 
> But just to be sure:
> 
> I believe the #UD itself doesn't crash the kernel/machine; shouldn't the
> crash come from the kernel being unable to handle the #UD in such a case?

Correct, the #UD is likely not (immediately) fatal.
> 
> If so, I am not sure whether the CR4.VMXE check can make the kernel any
> safer, because we can already handle the #UD for the SEAMCALL instruction.

It's not about making the kernel safer, it's about helping triage/debug issues.

> Yeah we can clearly dump a message saying "CPU isn't in VMX operation" and
> return failure if we have the check, but if we don't, the worst situation
> is we might mistakenly report "CPU isn't in VMX operation" (the current code
> just treats #UD as the CPU not being in VMX operation) when the CPU doesn't
> support IA32_VMX_PROCBASED_CTLS3[5].
> 
> And for the IA32_VMX_PROCBASED_CTLS3[5] we can easily do some pre-check in
> KVM code during module loading to rule out this case.
>
> And in practice, I even believe the BIOS cannot turn on TDX if the
> IA32_VMX_PROCBASED_CTLS3[5] is not supported.  I can check on this.

Eh, I wouldn't worry about that too much.  The only reason I brought up that
check was to call out that we can't *know* with 100% certainty that SEAMCALL
failed due to the CPU not being post-VMXON.

> > Practically speaking it costs nothing, so IMO it's worth adding even if the odds
> > of it ever being helpful are one-in-a-million.
> 
> I think we will need to do something like the below somewhere in the common
> SEAMCALL function:
> 
> 	unsigned long flags;
> 	int ret = -EINVAL;
> 
> 	local_irq_save(flags);
> 
> 	if (WARN_ON_ONCE(!(__read_cr4() & X86_CR4_VMXE)))
> 		goto out;
> 
> 	ret = seamcall();
> out:
> 	local_irq_restore(flags);
> 	return ret;
> 
> to make it IRQ safe.
>
> And the odd thing is that currently the common SEAMCALL functions, a.k.a.,
> __seamcall() and seamcall() (the latter is actually a macro), both return
> u64, so if we want to have such CR4.VMXE check code in the common code, we
> need to invent a new error code for it.

Oh, I wasn't thinking that we'd check CR4.VMXE before *every* SEAMCALL, just
before the TDH.SYS.LP.INIT call, i.e. before the one that is most likely to fail
due to a software bug that results in the CPU not doing VMXON before enabling
TDX.

Again, my intent is to add a simple, cheap, and targeted sanity check to help
deal with potential failures in code that historically has been less than rock
solid, and in a function that has a big fat assumption that the caller has done
VMXON on the CPU.

> That being said, although I agree it can make the code a little bit
> clearer, I am not sure whether it can make the code any safer -- even w/o
> it, the worst case is incorrectly reporting "CPU is not in VMX operation",
> but it shouldn't crash the kernel etc.
Huang, Kai April 26, 2024, 12:21 a.m. UTC | #39
> 
> > > The important thing is that they're handled by _one_ entity.  What we have today
> > > is probably the worst setup; VMXON is handled by KVM, but TDH.SYS.LP.INIT is
> > > handled by core kernel (sort of).
> > 
> > I cannot argue against this :-)
> > 
> > But from this point of view, I cannot see a difference between tdx_enable()
> > and tdx_cpu_enable(), because they are both in the core-kernel while depending
> > on KVM to handle VMXON.
> 
> My comments were made under the assumption that the code was NOT buggy, i.e. if
> KVM did NOT need to call tdx_cpu_enable() independent of tdx_enable().
> 
> That said, I do think it makes sense to have tdx_enable() call a private/inner version,
> e.g. __tdx_cpu_enable(), and then have KVM call a public version.  Alternatively,
> the kernel could register yet another cpuhp hook that runs after KVM's, i.e. does
> TDH.SYS.LP.INIT after KVM has done VMXON (if TDX has been enabled).

We will need to handle tdx_cpu_online() in "some cpuhp callback" anyway,
no matter whether tdx_enable() calls __tdx_cpu_enable() internally or not,
because now tdx_enable() can be done on a subset of cpus that the platform
has.

For the latter (after the "Alternatively" above), by "the kernel" do you
mean the core-kernel but not KVM?

E.g., you mean to register a cpuhp hook _inside_ tdx_enable() after TDX is
initialized successfully?

That would have a problem: when KVM is not present (e.g., KVM is
unloaded after it enables TDX), the cpuhp hook won't work at all.

If we ever want a new TDX-specific cpuhp hook "at this stage", IMHO it's
better to have it done by KVM, i.e., it goes away when KVM is unloaded.

Logically, we have two approaches in terms of how to treat
tdx_cpu_enable():

1) We treat the two cases separately: calling tdx_cpu_enable() for all
online cpus, and calling it when a new CPU tries to go online in some
cpuhp hook.  And we only want to call tdx_cpu_enable() in the cpuhp hook when
tdx_enable() has completed successfully.

That is: 

a) we always call tdx_cpu_enable() (or __tdx_cpu_enable()) inside
tdx_enable() as the first step, or,

b) let the caller (KVM) make sure tdx_cpu_enable() has been done for
all online cpus before calling tdx_enable().

Something like this:

	if (enable_tdx) {
		cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, kvm_online_cpu, 
			...);

		cpus_read_lock();
		on_each_cpu(tdx_cpu_enable, ...); /* or do it inside
						   * tdx_enable() */
		enable_tdx = !tdx_enable();
		if (enable_tdx)
			cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
				tdx_online_cpu, ...);
		cpus_read_unlock();
	}

	static int tdx_online_cpu(unsigned int cpu)
	{
		unsigned long flags;
		int ret;

		if (!enable_tdx)
			return 0;

		local_irq_save(flags);
		ret = tdx_cpu_enable();
		local_irq_restore(flags);

		return ret;
	}

2) We treat tdx_cpu_enable() as a whole by viewing it as the first step to
run any TDX code (SEAMCALL) on any cpu, including the SEAMCALLs involved
in tdx_enable().

That is, we *unconditionally* call tdx_cpu_enable() for all online cpus,
and when a new CPU tries to go online.

This can be handled at once if we do tdx_cpu_enable() inside KVM's cpuhp
hook:

	static int vt_hardware_enable(unsigned int cpu)
	{
		unsigned long flags;
		int ret;

		ret = vmx_hardware_enable();
		if (ret)
			return ret;

		local_irq_save(flags);
		ret = tdx_cpu_enable();
		local_irq_restore(flags);

		/*
		 * -ENODEV means TDX is not supported by the platform
		 * (TDX not enabled by the hardware or module is
		 * not loaded) or the kernel isn't built with TDX.
		 *
		 * Allow the CPU to go online as there's no way the
		 * kernel could use TDX in this case.
		 *
		 * Other error codes mean TDX is available but something
		 * went wrong.  Prevent this CPU from going online so that
		 * TDX may still work on other online CPUs.
		 */
		if (ret == -ENODEV)
			return 0;

		return ret;
	}

So with your change to always enable virtualization when TDX is enabled
during module load, we can simply have:

	if (enable_tdx) {
		cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, kvm_online_cpu,
			...);

		cpus_read_lock();
		enable_tdx = !tdx_enable();
		cpus_read_unlock();
	}

So although the cpus_read_lock() around tdx_enable() is a little bit silly,
the logic is actually simpler IMHO.

(local_irq_save()/restore() around tdx_cpu_enable() is also silly, but that
is a problem common to both solutions above and can be changed
independently).

Also, as I mentioned, the final goal is to have a TDX-specific CPUHP
hook in the core-kernel _BEFORE_ any in-kernel TDX user (KVM) to make sure
all online CPUs are TDX-capable.  

When that happens, I can just move the code in vt_hardware_enable() to
tdx_online_cpu() and do additional VMXOFF inside it, with the assumption
that the in-kernel TDX users should manage VMXON/VMXOFF on their own. 
Then all TDX users can remove the handling of tdx_cpu_enable().
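
I.e., roughly (sketch only; cpu_vmxon() is a made-up core-kernel VMXON helper,
and platform_tdx_enabled() is the existing check for BIOS-enabled TDX):

	static int tdx_online_cpu(unsigned int cpu)
	{
		unsigned long flags;
		int ret;

		/* Nothing to do if the BIOS didn't enable TDX. */
		if (!platform_tdx_enabled())
			return 0;

		local_irq_save(flags);
		ret = cpu_vmxon();	/* made-up helper */
		if (!ret) {
			ret = tdx_cpu_enable();
			cpu_vmxoff();
		}
		local_irq_restore(flags);

		/* A non-zero return prevents the CPU from going online. */
		return ret;
	}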
Chao Gao April 26, 2024, 3:21 a.m. UTC | #40
On Fri, Apr 26, 2024 at 12:21:46AM +0000, Huang, Kai wrote:
>
>> 
>> > > The important thing is that they're handled by _one_ entity.  What we have today
>> > > is probably the worst setup; VMXON is handled by KVM, but TDH.SYS.LP.INIT is
>> > > handled by core kernel (sort of).
>> > 
>> > I cannot argue against this :-)
>> > 
>> > But from this point of view, I cannot see a difference between tdx_enable()
>> > and tdx_cpu_enable(), because they are both in the core-kernel while depending
>> > on KVM to handle VMXON.
>> 
>> My comments were made under the assumption that the code was NOT buggy, i.e. if
>> KVM did NOT need to call tdx_cpu_enable() independent of tdx_enable().
>> 
>> That said, I do think it makes sense to have tdx_enable() call a private/inner version,
>> e.g. __tdx_cpu_enable(), and then have KVM call a public version.  Alternatively,
>> the kernel could register yet another cpuhp hook that runs after KVM's, i.e. does
>> TDH.SYS.LP.INIT after KVM has done VMXON (if TDX has been enabled).
>
>We will need to handle tdx_cpu_online() in "some cpuhp callback" anyway,
>no matter whether tdx_enable() calls __tdx_cpu_enable() internally or not,
>because now tdx_enable() can be done on a subset of cpus that the platform
>has.

Can you confirm this is allowed again?  It seems like this code indicates the
opposite:

https://github.com/intel/tdx-module/blob/tdx_1.5/src/vmm_dispatcher/api_calls/tdh_sys_config.c#L768C1-L775C6

>
>For the latter (after the "Alternatively" above), by "the kernel" do you
>mean the core-kernel but not KVM?
>
>E.g., you mean to register a cpuhp hook _inside_ tdx_enable() after TDX is
>initialized successfully?
>
>That would have a problem: when KVM is not present (e.g., KVM is
>unloaded after it enables TDX), the cpuhp hook won't work at all.

Is "the cpuhp hook doesn't work if KVM is not loaded" a real problem?

The CPU about to go online won't run any TDX code.  So, it should be ok to
skip tdx_cpu_enable().

Don't get me wrong. I don't object to registering the cpuhp hook in KVM.
I just want you to make decisions based on good information.

>
>If we ever want a new TDX-specific cpuhp hook "at this stage", IMHO it's
>better to have it done by KVM, i.e., it goes away when KVM is unloaded.
>
>Logically, we have two approaches in terms of how to treat
>tdx_cpu_enable():
>
>1) We treat the two cases separately: calling tdx_cpu_enable() for all
>online cpus, and calling it when a new CPU tries to go online in some
>cpuhp hook.  And we only want to call tdx_cpu_enable() in the cpuhp hook when
>tdx_enable() has completed successfully.
>
>That is: 
>
>a) we always call tdx_cpu_enable() (or __tdx_cpu_enable()) inside
>tdx_enable() as the first step, or,
>
>b) let the caller (KVM) make sure tdx_cpu_enable() has been done for
>all online cpus before calling tdx_enable().
>
>Something like this:
>
>	if (enable_tdx) {
>		cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, kvm_online_cpu, 
>			...);
>
>		cpus_read_lock();
>		on_each_cpu(tdx_cpu_enable, ...); /* or do it inside
>						   * tdx_enable() */
>		enable_tdx = !tdx_enable();
>		if (enable_tdx)
>			cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
>				tdx_online_cpu, ...);
>		cpus_read_unlock();
>	}
>
>	static int tdx_online_cpu(unsigned int cpu)
>	{
>		unsigned long flags;
>		int ret;
>
>		if (!enable_tdx)
>			return 0;
>
>		local_irq_save(flags);
>		ret = tdx_cpu_enable();
>		local_irq_restore(flags);
>
>		return ret;
>	}
>
>2) We treat tdx_cpu_enable() as a whole by viewing it as the first step to
>run any TDX code (SEAMCALL) on any cpu, including the SEAMCALLs involved
>in tdx_enable().
>
>That is, we *unconditionally* call tdx_cpu_enable() for all online cpus,
>and when a new CPU tries to go online.
>
>This can be handled at once if we do tdx_cpu_enable() inside KVM's cpuhp
>hook:
>
>	static int vt_hardware_enable(unsigned int cpu)
>	{
>		unsigned long flags;
>		int ret;
>
>		ret = vmx_hardware_enable();
>		if (ret)
>			return ret;
>
>		local_irq_save(flags);
>		ret = tdx_cpu_enable();
>		local_irq_restore(flags);
>
>		/*
>		 * -ENODEV means TDX is not supported by the platform
>		 * (TDX not enabled by the hardware or module is
>		 * not loaded) or the kernel isn't built with TDX.
>		 *
>		 * Allow the CPU to go online as there's no way the
>		 * kernel could use TDX in this case.
>		 *
>		 * Other error codes mean TDX is available but something
>		 * went wrong.  Prevent this CPU from going online so that
>		 * TDX may still work on other online CPUs.
>		 */
>		if (ret == -ENODEV)
>			return 0;
>
>		return ret;
>	}
>
>So with your change to always enable virtualization when TDX is enabled
>during module load, we can simply have:
>
>	if (enable_tdx) {
>		cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, kvm_online_cpu,
>			...);
>
>		cpus_read_lock();
>		enable_tdx = !tdx_enable();
>		cpus_read_unlock();
>	}
>
>So although the cpus_read_lock() around tdx_enable() is a little bit silly,
>the logic is actually simpler IMHO.
>
>(local_irq_save()/restore() around tdx_cpu_enable() is also silly, but that
>is a problem common to both solutions above and can be changed
>independently).
>
>Also, as I mentioned, the final goal is to have a TDX-specific CPUHP
>hook in the core-kernel _BEFORE_ any in-kernel TDX user (KVM) to make sure
>all online CPUs are TDX-capable.  
>
>When that happens, I can just move the code in vt_hardware_enable() to
>tdx_online_cpu() and do additional VMXOFF inside it, with the assumption
>that the in-kernel TDX users should manage VMXON/VMXOFF on their own. 
>Then all TDX users can remove the handling of tdx_cpu_enable().
Huang, Kai April 26, 2024, 9:44 a.m. UTC | #41
On Fri, 2024-04-26 at 11:21 +0800, Gao, Chao wrote:
> On Fri, Apr 26, 2024 at 12:21:46AM +0000, Huang, Kai wrote:
> > 
> > > 
> > > > > The important thing is that they're handled by _one_ entity.  What we have today
> > > > > is probably the worst setup; VMXON is handled by KVM, but TDH.SYS.LP.INIT is
> > > > > handled by core kernel (sort of).
> > > > 
> > > > I cannot argue against this :-)
> > > > 
> > > > But from this point of view, I cannot see a difference between tdx_enable()
> > > > and tdx_cpu_enable(), because they are both in the core-kernel while depending
> > > > on KVM to handle VMXON.
> > > 
> > > My comments were made under the assumption that the code was NOT buggy, i.e. if
> > > KVM did NOT need to call tdx_cpu_enable() independent of tdx_enable().
> > > 
> > > That said, I do think it makes sense to have tdx_enable() call a private/inner version,
> > > e.g. __tdx_cpu_enable(), and then have KVM call a public version.  Alternatively,
> > > the kernel could register yet another cpuhp hook that runs after KVM's, i.e. does
> > > TDH.SYS.LP.INIT after KVM has done VMXON (if TDX has been enabled).
> > 
> > We will need to handle tdx_cpu_online() in "some cpuhp callback" anyway,
> > no matter whether tdx_enable() calls __tdx_cpu_enable() internally or not,
> > because now tdx_enable() can be done on a subset of cpus that the platform
> > has.
> 
> Can you confirm this is allowed again?  It seems like this code indicates the
> opposite:
> 
> https://github.com/intel/tdx-module/blob/tdx_1.5/src/vmm_dispatcher/api_calls/tdh_sys_config.c#L768C1-L775C6

This feature requires ucode/P-SEAMLDR and TDX module changes, and cannot be
supported on some *early* generations.  I think they haven't added such
code to the open-source TDX module code yet.

I can ask the TDX module people about their plan if it is a concern.

In reality, this shouldn't be a problem because the current code kinda
works with both cases:

1) If this feature is not supported (i.e., old platform and/or old
module), and if the user tries to enable TDX when there's an offline cpu, then
tdx_enable() will fail when it does TDH.SYS.CONFIG, and we can use the
error code to pinpoint the root cause.

2) Otherwise, it just works.

> 
> > 
> > For the latter (after the "Alternatively" above), by "the kernel" do you
> > mean the core-kernel but not KVM?
> > 
> > E.g., you mean to register a cpuhp hook _inside_ tdx_enable() after TDX is
> > initialized successfully?
> > 
> > That would have a problem: when KVM is not present (e.g., KVM is
> > unloaded after it enables TDX), the cpuhp hook won't work at all.
> 
> Is "the cpuhp hook doesn't work if KVM is not loaded" a real problem?
> 
> The CPU about to go online won't run any TDX code.  So, it should be ok to
> skip tdx_cpu_enable().

It _can_ work if we only consider KVM, because for KVM we can always
guarantee:

1) VMXON + tdx_cpu_enable() have been done for all online cpus before it
calls tdx_enable().
2) VMXON + tdx_cpu_enable() have been done in cpuhp for any new CPU before
it goes online.

Btw, this reminds me why I didn't want to do tdx_cpu_enable() inside
tdx_enable():

tdx_enable() will need to _always_ call tdx_cpu_enable() for all online
cpus regardless of whether the module has been initialized successfully in
the previous calls.

I believe this is kinda silly, i.e., why not just let the caller
do tdx_cpu_enable() for all online cpus before tdx_enable().

However, back to the TDX-specific core-kernel cpuhp hook, in the long
term, I believe the TDX cpuhp hook should be put _BEFORE_ all in-kernel
TDX-users' cpuhp hooks, because logically TDX users should depend on TDX
core-kernel code, but not the opposite.

That is, my long term vision is we can have a simple rule:

The core-kernel TDX code always guarantees online CPUs are TDX-capable.
No TDX user needs to consider tdx_cpu_enable() ever.  They just
need to call tdx_enable() to make TDX work.

So for now, given we depend on KVM for VMXON anyway, I don't see any
reason the core-kernel should register any TDX cpuhp hook.  Having to "skip
tdx_cpu_enable() when VMX isn't enabled" is kinda hacky anyway.
Huang, Kai April 29, 2024, 11:41 a.m. UTC | #42
On Thu, 2024-04-25 at 15:43 -0700, Sean Christopherson wrote:
> On Thu, Apr 25, 2024, Kai Huang wrote:
> > On Thu, 2024-04-25 at 09:30 -0700, Sean Christopherson wrote:
> > > On Tue, Apr 23, 2024, Kai Huang wrote:
> > > And anecdotally, I know of at least one crash in our production environment where
> > > a VMX instruction hit a seemingly spurious #UD, i.e. it's not impossible for a
> > > ucode bug or hardware defect to cause problems.  That's obviously _extremely_
> > > unlikely, but that's why I emphasized that sanity checking CR4.VMXE is cheap.
> > 
> > Yeah I agree it could happen although very unlikely.
> > 
> > But just to be sure:
> > 
> > I believe the #UD itself doesn't crash the kernel/machine; shouldn't the
> > crash come from the kernel being unable to handle the #UD in such a case?
> 
> Correct, the #UD is likely not (immediately) fatal.
> > 
> > If so, I am not sure whether the CR4.VMXE check can make the kernel any
> > safer, because we can already handle the #UD for the SEAMCALL instruction.
> 
> It's not about making the kernel safer, it's about helping triage/debug issues.
> 
> > Yeah we can clearly dump a message saying "CPU isn't in VMX operation" and
> > return failure if we have the check, but if we don't, the worst situation
> > is we might mistakenly report "CPU isn't in VMX operation" (the current code
> > just treats #UD as the CPU not being in VMX operation) when the CPU doesn't
> > support IA32_VMX_PROCBASED_CTLS3[5].
> > 
> > And for the IA32_VMX_PROCBASED_CTLS3[5] we can easily do some pre-check in
> > KVM code during module loading to rule out this case.
> > 
> > And in practice, I even believe the BIOS cannot turn on TDX if the
> > IA32_VMX_PROCBASED_CTLS3[5] is not supported.  I can check on this.
> 
> Eh, I wouldn't worry about that too much.  The only reason I brought up that
> check was to call out that we can't *know* with 100% certainty that SEAMCALL
> failed due to the CPU not being post-VMXON.

OK (though I think we can rule out other cases by adding more checks etc).

> 
> > > Practically speaking it costs nothing, so IMO it's worth adding even if the odds
> > > of it ever being helpful are one-in-a-million.
> > 
> > I think we will need to do something like the below somewhere in the common
> > SEAMCALL function:
> > 
> > 	unsigned long flags;
> > 	int ret = -EINVAL;
> > 
> > 	local_irq_save(flags);
> > 
> > 	if (WARN_ON_ONCE(!(__read_cr4() & X86_CR4_VMXE)))
> > 		goto out;
> > 
> > 	ret = seamcall();
> > out:
> > 	local_irq_restore(flags);
> > 	return ret;
> > 
> > to make it IRQ safe.
> > 
> > And the odd thing is that currently the common SEAMCALL functions, a.k.a.,
> > __seamcall() and seamcall() (the latter is actually a macro), both return
> > u64, so if we want to have such CR4.VMXE check code in the common code, we
> > need to invent a new error code for it.
> 
> Oh, I wasn't thinking that we'd check CR4.VMXE before *every* SEAMCALL, just
> before the TDH.SYS.LP.INIT call, i.e. before the one that is most likely to fail
> due to a software bug that results in the CPU not doing VMXON before enabling
> TDX.
> 
> Again, my intent is to add a simple, cheap, and targeted sanity check to help
> deal with potential failures in code that historically has been less than rock
> solid, and in a function that has a big fat assumption that the caller has done
> VMXON on the CPU.

I see.

(To be fair, personally I don't recall that we ever had any bug due to
"cpu not in post-VMXON before SEAMCALL", but maybe it's just me. :-).)

But if tdx_enable() doesn't call tdx_cpu_enable() internally, then we will
have two functions to handle.

For tdx_enable(), given it's still a good idea to disable CPU hotplug around
it, we can still do some check for all online cpus at the beginning, like:

	on_each_cpu(check_cr4_vmx, &err, 1);

Btw, please also see my last reply to Chao on why I don't like calling
tdx_cpu_enable() inside tdx_enable():

https://lore.kernel.org/lkml/1fd17c931d5c2effcf1105b63deac8e3fb1664bc.camel@intel.com/

That being said, I can try to add additional patch(es) to do the CR4.VMXE check
if you want, but personally I find it hard to come up with a strong
justification to do so.
Sean Christopherson April 29, 2024, 8:06 p.m. UTC | #43
On Mon, Apr 29, 2024, Kai Huang wrote:
> On Thu, 2024-04-25 at 15:43 -0700, Sean Christopherson wrote:
> > > And the odd thing is that currently the common SEAMCALL functions, a.k.a.,
> > > __seamcall() and seamcall() (the latter is actually a macro), both return
> > > u64, so if we want to have such CR4.VMXE check code in the common code, we
> > > need to invent a new error code for it.
> > 
> > Oh, I wasn't thinking that we'd check CR4.VMXE before *every* SEAMCALL, just
> > before the TDH.SYS.LP.INIT call, i.e. before the one that is most likely to fail
> > due to a software bug that results in the CPU not doing VMXON before enabling
> > TDX.
> > 
> > Again, my intent is to add a simple, cheap, and targeted sanity check to help
> > deal with potential failures in code that historically has been less than rock
> > solid, and in a function that has a big fat assumption that the caller has done
> > VMXON on the CPU.
> 
> I see.
> 
> (To be fair, personally I don't recall that we ever had any bug due to
> "cpu not in post-VMXON before SEAMCALL", but maybe it's just me. :-).)
> 
> But if tdx_enable() doesn't call tdx_cpu_enable() internally, then we will
> have two functions to handle.

Why?  I assume there will be exactly one caller of TDH.SYS.LP.INIT.

> For tdx_enable(), given it's still a good idea to disable CPU hotplug around
> it, we can still do some check for all online cpus at the beginning, like:
> 
> 	on_each_cpu(check_cr4_vmx, &err, 1);

If it gets to that point, just omit the check.  I really think you're making much
ado about nothing.  My suggestion is essentially "throw in a CR4.VMXE check before
TDH.SYS.LP.INIT if it's easy".  If it's not easy for some reason, then don't do
it.

> Btw, please also see my last reply to Chao why I don't like calling
> tdx_cpu_enable() inside tdx_enable():
> 
> https://lore.kernel.org/lkml/1fd17c931d5c2effcf1105b63deac8e3fb1664bc.camel@intel.com/
> 
> That being said, I can try to add additional patch(es) to do the CR4.VMXE check
> if you want, but personally I find it hard to come up with a strong
> justification to do so.
>
Huang, Kai April 29, 2024, 11:12 p.m. UTC | #44
On 30/04/2024 8:06 am, Sean Christopherson wrote:
> On Mon, Apr 29, 2024, Kai Huang wrote:
>> On Thu, 2024-04-25 at 15:43 -0700, Sean Christopherson wrote:
>>>> And the odd thing is that currently the common SEAMCALL functions, a.k.a.,
>>>> __seamcall() and seamcall() (the latter is actually a macro), both return
>>>> u64, so if we want to have such CR4.VMXE check code in the common code, we
>>>> need to invent a new error code for it.
>>>
>>> Oh, I wasn't thinking that we'd check CR4.VMXE before *every* SEAMCALL, just
>>> before the TDH.SYS.LP.INIT call, i.e. before the one that is most likely to fail
>>> due to a software bug that results in the CPU not doing VMXON before enabling
>>> TDX.
>>>
>>> Again, my intent is to add a simple, cheap, and targeted sanity check to help
>>> deal with potential failures in code that historically has been less than rock
>>> solid, and in a function that has a big fat assumption that the caller has done
>>> VMXON on the CPU.
>>
>> I see.
>>
>> (To be fair, personally I don't recall that we ever had any bug due to
>> "cpu not in post-VMXON before SEAMCALL", but maybe it's just me. :-).)
>>
>> But if tdx_enable() doesn't call tdx_cpu_enable() internally, then we will
>> have two functions to handle.
> 
> Why?  I assume there will be exactly one caller of TDH.SYS.LP.INIT.

Right, it's only done in tdx_cpu_enable().

I was thinking "the one that is most likely to fail" isn't just 
TDH.SYS.LP.INIT in this case, but also could be any SEAMCALL that is 
firstly run on any online cpu inside tdx_enable().

Or perhaps you were thinking that once tdx_cpu_enable() is called on one cpu,
we can safely assume that cpu must be in post-VMXON, even though we
have two separate functions: tdx_cpu_enable() and tdx_enable().

> 
>> For tdx_enable(), given it's still a good idea to disable CPU hotplug around
>> it, we can still do some check for all online cpus at the beginning, like:
>>
>> 	on_each_cpu(check_cr4_vmx, &err, 1);
> 
> If it gets to that point, just omit the check.  I really think you're making much
> ado about nothing.  

Yeah.

> My suggestion is essentially "throw in a CR4.VMXE check before
> TDH.SYS.LP.INIT if it's easy".  If it's not easy for some reason, then don't do
> it.

I see.  The disconnect between us is that I am not super clear on why we
should treat TDH.SYS.LP.INIT as a special one that deserves a CR4.VMXE
check but not other SEAMCALLs.

Anyway, I don't think it matters a lot at this stage whether we add such a
check, and I don't want to take up any more of your time on this.  :-)
Sean Christopherson April 30, 2024, 4:13 p.m. UTC | #45
On Tue, Apr 30, 2024, Kai Huang wrote:
> On 30/04/2024 8:06 am, Sean Christopherson wrote:
> > My suggestion is essentially "throw in a CR4.VMXE check before
> > TDH.SYS.LP.INIT if it's easy".  If it's not easy for some reason, then don't do
> > it.
> 
> I see.  The disconnect between us is that I am not super clear on why we should
> treat TDH.SYS.LP.INIT as a special one that deserves a CR4.VMXE check but
> not other SEAMCALLs.

Because TDH.SYS.LP.INIT is done on all CPUs via an IPI function call, is a one-
time thing, and is at the intersection of core TDX and KVM module code, e.g. the
core TDX code has an explicit assumption that:

 * This function assumes the caller has: 1) held read lock of CPU hotplug
 * lock to prevent any new cpu from becoming online; 2) done both VMXON
 * and tdx_cpu_enable() on all online cpus.

KVM can obviously screw up and attempt SEAMCALLs without being post-VMXON, but
that's entirely a _KVM_ bug.  And the probability of getting all the way to
something like TDH_MEM_SEPT_ADD without being post-VMXON is comically low, e.g.
KVM and/or the kernel would likely crash long before that point.
Huang, Kai May 1, 2024, 2:56 a.m. UTC | #46
On 1/05/2024 4:13 am, Sean Christopherson wrote:
> On Tue, Apr 30, 2024, Kai Huang wrote:
>> On 30/04/2024 8:06 am, Sean Christopherson wrote:
>>> My suggestion is essentially "throw in a CR4.VMXE check before
>>> TDH.SYS.LP.INIT if it's easy".  If it's not easy for some reason, then don't do
>>> it.
>>
>> I see.  The disconnection between us is I am not super clear why we should
>> treat TDH.SYS.LP.INIT as a special one that deserves a CR4.VMXE check but
>> not other SEAMCALLs.
> 
> Because TDH.SYS.LP.INIT is done on all CPUs via an IPI function call, is a one-
> time thing, and is at the intersection of core TDX and KVM module code, e.g. the
> core TDX code has an explicit assumption that:
> 
>   * This function assumes the caller has: 1) held read lock of CPU hotplug
>   * lock to prevent any new cpu from becoming online; 2) done both VMXON
>   * and tdx_cpu_enable() on all online cpus.

Yeah but from this perspective, both tdx_cpu_enable() and tdx_enable() 
are "a one time thing" and "at the intersection of core TDX and KVM"  :-)

But given that tdx_cpu_enable() must be called in an IRQ-disabled
context, there's no possibility that other thread/code
could mess up VMX enabling after the CR4.VMXE check, so it's
fine to add such a check.

And looking again, in fact the comment of tdx_cpu_enable() doesn't
explicitly call out that it requires the caller to do VMXON first (although
it's kinda implied by the comment of tdx_enable() as you quoted above).

I can add a patch to make this clearer by calling out in the comment of
tdx_cpu_enable() that it requires the caller to do VMXON first, and by adding a
WARN_ON_ONCE(!CR4.VMXE) check inside.  I just don't know whether it is
worth doing at this stage, given it's not mandatory and it
requires review time from maintainers.  I can include such a patch in the next
KVM TDX patchset if you prefer, so we can see how it goes.
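
I.e., something like this at the top of tdx_cpu_enable() (sketch, mirroring
the earlier snippet):

	/* Sanity check: the caller must have done VMXON on this CPU. */
	if (WARN_ON_ONCE(!(__read_cr4() & X86_CR4_VMXE)))
		return -EINVAL;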

> 
> KVM can obviously screw up and attempt SEAMCALLs without being post-VMXON, but
> that's entirely a _KVM_ bug.  And the probability of getting all the way to
> something like TDH_MEM_SEPT_ADD without being post-VMXON is comically low, e.g.
> KVM and/or the kernel would likely crash long before that point.

Yeah, fully agree that SEAMCALLs managed by KVM shouldn't need a CR4.VMXE
check.  I was talking about those involved in tdx_enable(), i.e.,
TDH.SYS.xxx.
Huang, Kai May 7, 2024, 12:40 p.m. UTC | #47
> > 
> > So I think we have consensus to go with the approach shown in your
> > second diff -- that is, always enable virtualization during module loading
> > for all ARCHs other than x86; for x86, always enable virtualization
> > during module loading only when TDX is enabled.
> 
> Assuming the other arch maintainers are ok with that approach.  If waiting until
> a VM is created is desirable for other architectures, then we'll need to figure
> out a plan b.  E.g. KVM arm64 doesn't support being built as a module, so enabling
> hardware during initialization would mean virtualization is enabled for any kernel
> that is built with CONFIG_KVM=y.
> 
> Actually, duh.  There's absolutely no reason to force other architectures to
> choose when to enable virtualization.  As evidenced by the massaging to have x86
> keep enabling virtualization on-demand for !TDX, the cleanups don't come from
> enabling virtualization during module load, they come from registering cpuhp and
> syscore ops when virtualization is enabled.
> 
> I.e. we can keep kvm_usage_count in common code, and just do exactly what I
> proposed for kvm_x86_enable_virtualization().
> 
> I have patches to do this, and initial testing suggests they aren't wildly
> broken.  I'll post them soon-ish, assuming nothing pops up in testing.  They are
> clean enough that they can land in advance of TDX, e.g. in kvm-coco-queue even
> before other architectures verify I didn't break them.
> 

Hi Sean,

Just want to check with you what is your plan on this?

Please feel free to let me know if there's anything that I can help with.
Thanks.
Sean Christopherson May 7, 2024, 4:24 p.m. UTC | #48
On Tue, May 07, 2024, Kai Huang wrote:
> > > So I think we have consensus to go with the approach shown in your
> > > second diff -- that is, always enable virtualization during module loading
> > > for all ARCHs other than x86; for x86, always enable virtualization
> > > during module loading only when TDX is enabled.
> > 
> > Assuming the other arch maintainers are ok with that approach.  If waiting until
> > a VM is created is desirable for other architectures, then we'll need to figure
> > out a plan b.  E.g. KVM arm64 doesn't support being built as a module, so enabling
> > hardware during initialization would mean virtualization is enabled for any kernel
> > that is built with CONFIG_KVM=y.
> > 
> > Actually, duh.  There's absolutely no reason to force other architectures to
> > choose when to enable virtualization.  As evidenced by the massaging to have x86
> > keep enabling virtualization on-demand for !TDX, the cleanups don't come from
> > enabling virtualization during module load, they come from registering cpuhp and
> > syscore ops when virtualization is enabled.
> > 
> > I.e. we can keep kvm_usage_count in common code, and just do exactly what I
> > proposed for kvm_x86_enable_virtualization().
> > 
> > I have patches to do this, and initial testing suggests they aren't wildly
> > broken.  I'll post them soon-ish, assuming nothing pops up in testing.  They are
> > clean enough that they can land in advance of TDX, e.g. in kvm-coco-queue even
> > before other architectures verify I didn't break them.
> > 
> 
> Hi Sean,
> 
> Just want to check with you what is your plan on this?
> 
> Please feel free to let me know if there's anything that I can help with.

Ah shoot, I posted patches[*] but managed to forget to Cc any of the TDX folks.
Sorry :-/

[*] https://lore.kernel.org/all/20240425233951.3344485-1-seanjc@google.com
Huang, Kai May 7, 2024, 9:59 p.m. UTC | #49
On 8/05/2024 4:24 am, Sean Christopherson wrote:
> On Tue, May 07, 2024, Kai Huang wrote:
>>>> So I think we have consensus to go with the approach shown in your
>>>> second diff -- that is, always enable virtualization during module loading
>>>> for all ARCHs other than x86; for x86, always enable virtualization
>>>> during module loading only when TDX is enabled.
>>>
>>> Assuming the other arch maintainers are ok with that approach.  If waiting until
>>> a VM is created is desirable for other architectures, then we'll need to figure
>>> out a plan b.  E.g. KVM arm64 doesn't support being built as a module, so enabling
>>> hardware during initialization would mean virtualization is enabled for any kernel
>>> that is built with CONFIG_KVM=y.
>>>
>>> Actually, duh.  There's absolutely no reason to force other architectures to
>>> choose when to enable virtualization.  As evidenced by the massaging to have x86
>>> keep enabling virtualization on-demand for !TDX, the cleanups don't come from
>>> enabling virtualization during module load, they come from registering cpuhp and
>>> syscore ops when virtualization is enabled.
>>>
>>> I.e. we can keep kvm_usage_count in common code, and just do exactly what I
>>> proposed for kvm_x86_enable_virtualization().
>>>
>>> I have patches to do this, and initial testing suggests they aren't wildly
>>> broken.  I'll post them soon-ish, assuming nothing pops up in testing.  They are
>>> clean enough that they can land in advance of TDX, e.g. in kvm-coco-queue even
>>> before other architectures verify I didn't break them.
>>>
>>
>> Hi Sean,
>>
>> Just want to check with you what is your plan on this?
>>
>> Please feel free to let me know if there's anything that I can help with.
> 
> Ah shoot, I posted patches[*] but managed to forget to Cc any of the TDX folks.
> Sorry :-/
> 
> [*] https://lore.kernel.org/all/20240425233951.3344485-1-seanjc@google.com

Oh I missed that :-( Thanks for the info!
diff mbox series

Patch

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 274df24b647f..5b85ef84b2e9 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -24,6 +24,7 @@  kvm-intel-y		+= vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
 
 kvm-intel-$(CONFIG_X86_SGX_KVM)	+= vmx/sgx.o
 kvm-intel-$(CONFIG_KVM_HYPERV)	+= vmx/hyperv.o vmx/hyperv_evmcs.o
+kvm-intel-$(CONFIG_INTEL_TDX_HOST)	+= vmx/tdx.o
 
 kvm-amd-y		+= svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \
 			   svm/sev.o
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 18cecf12c7c8..18aef6e23aab 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -6,6 +6,22 @@ 
 #include "nested.h"
 #include "pmu.h"
 
+static bool enable_tdx __ro_after_init;
+module_param_named(tdx, enable_tdx, bool, 0444);
+
+static __init int vt_hardware_setup(void)
+{
+	int ret;
+
+	ret = vmx_hardware_setup();
+	if (ret)
+		return ret;
+
+	enable_tdx = enable_tdx && !tdx_hardware_setup(&vt_x86_ops);
+
+	return 0;
+}
+
 #define VMX_REQUIRED_APICV_INHIBITS				\
 	(BIT(APICV_INHIBIT_REASON_DISABLE)|			\
 	 BIT(APICV_INHIBIT_REASON_ABSENT) |			\
@@ -22,6 +38,7 @@  struct kvm_x86_ops vt_x86_ops __initdata = {
 
 	.hardware_unsetup = vmx_hardware_unsetup,
 
+	/* TDX cpu enablement is done by tdx_hardware_setup(). */
 	.hardware_enable = vmx_hardware_enable,
 	.hardware_disable = vmx_hardware_disable,
 	.has_emulated_msr = vmx_has_emulated_msr,
@@ -161,7 +178,7 @@  struct kvm_x86_ops vt_x86_ops __initdata = {
 };
 
 struct kvm_x86_init_ops vt_init_ops __initdata = {
-	.hardware_setup = vmx_hardware_setup,
+	.hardware_setup = vt_hardware_setup,
 	.handle_intel_pt_intr = NULL,
 
 	.runtime_ops = &vt_x86_ops,
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
new file mode 100644
index 000000000000..43c504fb4fed
--- /dev/null
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -0,0 +1,84 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpu.h>
+
+#include <asm/tdx.h>
+
+#include "capabilities.h"
+#include "x86_ops.h"
+#include "x86.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+static int __init tdx_module_setup(void)
+{
+	int ret;
+
+	ret = tdx_enable();
+	if (ret) {
+		pr_info("Failed to initialize TDX module.\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+struct tdx_enabled {
+	cpumask_var_t enabled;
+	atomic_t err;
+};
+
+static void __init tdx_on(void *_enable)
+{
+	struct tdx_enabled *enable = _enable;
+	int r;
+
+	r = vmx_hardware_enable();
+	if (!r) {
+		cpumask_set_cpu(smp_processor_id(), enable->enabled);
+		r = tdx_cpu_enable();
+	}
+	if (r)
+		atomic_set(&enable->err, r);
+}
+
+static void __init vmx_off(void *_enabled)
+{
+	cpumask_var_t *enabled = (cpumask_var_t *)_enabled;
+
+	if (cpumask_test_cpu(smp_processor_id(), *enabled))
+		vmx_hardware_disable();
+}
+
+int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
+{
+	struct tdx_enabled enable = {
+		.err = ATOMIC_INIT(0),
+	};
+	int r = 0;
+
+	if (!enable_ept) {
+		pr_warn("Cannot enable TDX with EPT disabled\n");
+		return -EINVAL;
+	}
+
+	if (!zalloc_cpumask_var(&enable.enabled, GFP_KERNEL)) {
+		r = -ENOMEM;
+		goto out;
+	}
+
+	/* tdx_enable() in tdx_module_setup() requires cpus lock. */
+	cpus_read_lock();
+	on_each_cpu(tdx_on, &enable, true); /* TDX requires vmxon. */
+	r = atomic_read(&enable.err);
+	if (!r)
+		r = tdx_module_setup();
+	else
+		r = -EIO;
+	on_each_cpu(vmx_off, &enable.enabled, true);
+	cpus_read_unlock();
+	free_cpumask_var(enable.enabled);
+
+out:
+	return r;
+}
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index b936388853ab..346289a2a01c 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -135,4 +135,10 @@  void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu);
 #endif
 void vmx_setup_mce(struct kvm_vcpu *vcpu);
 
+#ifdef CONFIG_INTEL_TDX_HOST
+int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops);
+#else
+static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return -EOPNOTSUPP; }
+#endif
+
 #endif /* __KVM_X86_VMX_X86_OPS_H */