
[RFC,v5,073/104] KVM: TDX: track LP tdx vcpu run and teardown vcpus on destroying the guest TD

Message ID 6e096d8509ef40ce3e25c1e132643e772641241b.1646422845.git.isaku.yamahata@intel.com (mailing list archive)
State New, archived
Series KVM TDX basic feature support

Commit Message

Isaku Yamahata March 4, 2022, 7:49 p.m. UTC
From: Isaku Yamahata <isaku.yamahata@intel.com>

When shutting down the machine, (VMX or TDX) vCPUs need to be shut down on
each pCPU.  Do the same for TDX with the TDX SEAMCALL APIs.

Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 arch/x86/kvm/vmx/main.c    | 23 +++++++++++--
 arch/x86/kvm/vmx/tdx.c     | 70 ++++++++++++++++++++++++++++++++++++--
 arch/x86/kvm/vmx/tdx.h     |  2 ++
 arch/x86/kvm/vmx/x86_ops.h |  4 +++
 4 files changed, 95 insertions(+), 4 deletions(-)
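
The per-pCPU shutdown described above is built around tdx_flush_vp_on_cpu(),
which is introduced earlier in the series and is not part of this diff.  As a
rough, illustrative sketch only (the function bodies, the tdh_vp_flush()
argument and the exact checks below are assumptions, not the actual
implementation), such a helper could look like this:

/*
 * Hypothetical sketch, not taken from the series: flush a TD vCPU's state
 * on the CPU it is associated with by sending an IPI there.
 */
static void tdx_flush_vp(void *arg)
{
	struct kvm_vcpu *vcpu = arg;

	/* Runs in IPI (interrupt) context on the target CPU. */
	if (vcpu->cpu != raw_smp_processor_id())
		return;	/* vCPU already migrated away; nothing to flush here. */

	tdh_vp_flush(to_tdx(vcpu)->tdvpr.pa);	/* TDH.VP.FLUSH SEAMCALL */
	tdx_disassociate_vp(vcpu);
}

static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
{
	if (vcpu->cpu == -1)
		return;

	/* Wait for the flush to finish on the CPU the vCPU is associated with. */
	smp_call_function_single(vcpu->cpu, tdx_flush_vp, vcpu, 1);
}

Because the flush runs from an IPI callback, the per-CPU associated_tdvcpus
list added below is only manipulated with interrupts disabled.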

Comments

Erdem Aktas March 23, 2022, 12:54 a.m. UTC | #1
On Fri, Mar 4, 2022 at 11:50 AM <isaku.yamahata@intel.com> wrote:
>
> From: Isaku Yamahata <isaku.yamahata@intel.com>
>
> When shutting down the machine, (VMX or TDX) vCPUs need to be shut down on
> each pCPU.  Do the same for TDX with the TDX SEAMCALL APIs.
>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>  arch/x86/kvm/vmx/main.c    | 23 +++++++++++--
>  arch/x86/kvm/vmx/tdx.c     | 70 ++++++++++++++++++++++++++++++++++++--
>  arch/x86/kvm/vmx/tdx.h     |  2 ++
>  arch/x86/kvm/vmx/x86_ops.h |  4 +++
>  4 files changed, 95 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> index 2cd5ba0e8788..882358ac270b 100644
> --- a/arch/x86/kvm/vmx/main.c
> +++ b/arch/x86/kvm/vmx/main.c
> @@ -13,6 +13,25 @@ static bool vt_is_vm_type_supported(unsigned long type)
>         return type == KVM_X86_DEFAULT_VM || tdx_is_vm_type_supported(type);
>  }
>
> +static int vt_hardware_enable(void)
> +{
> +       int ret;
> +
> +       ret = vmx_hardware_enable();
> +       if (ret)
> +               return ret;
> +
> +       tdx_hardware_enable();
> +       return 0;
> +}
> +
> +static void vt_hardware_disable(void)
> +{
> +       /* Note, TDX *and* VMX need to be disabled if TDX is enabled. */
> +       tdx_hardware_disable();
> +       vmx_hardware_disable();
> +}
> +
>  static __init int vt_hardware_setup(void)
>  {
>         int ret;
> @@ -199,8 +218,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
>
>         .hardware_unsetup = vt_hardware_unsetup,
>
> -       .hardware_enable = vmx_hardware_enable,
> -       .hardware_disable = vmx_hardware_disable,
> +       .hardware_enable = vt_hardware_enable,
> +       .hardware_disable = vt_hardware_disable,
>         .cpu_has_accelerated_tpr = report_flexpriority,
>         .has_emulated_msr = vmx_has_emulated_msr,
>
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index a6b1a8ce888d..690298fb99c7 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -48,6 +48,14 @@ struct tdx_capabilities tdx_caps;
>  static DEFINE_MUTEX(tdx_lock);
>  static struct mutex *tdx_mng_key_config_lock;
>
> +/*
> + * A per-CPU list of TD vCPUs associated with a given CPU.  Used when a CPU
> + * is brought down to invoke TDH_VP_FLUSH on the appropriate TD vCPUs.
> + * Protected by interrupt mask.  This list is manipulated in process context
> + * of vcpu and IPI callback.  See tdx_flush_vp_on_cpu().
> + */
> +static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
> +
>  static u64 hkid_mask __ro_after_init;
>  static u8 hkid_start_pos __ro_after_init;
>
> @@ -87,6 +95,8 @@ static inline bool is_td_finalized(struct kvm_tdx *kvm_tdx)
>
>  static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
>  {
> +       list_del(&to_tdx(vcpu)->cpu_list);
> +
>         /*
>          * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
>          * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
> @@ -97,6 +107,22 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
>         vcpu->cpu = -1;
>  }
>
> +void tdx_hardware_enable(void)
> +{
> +       INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, raw_smp_processor_id()));
> +}
> +
> +void tdx_hardware_disable(void)
> +{
> +       int cpu = raw_smp_processor_id();
> +       struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
> +       struct vcpu_tdx *tdx, *tmp;
> +
> +       /* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
> +       list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list)
> +               tdx_disassociate_vp(&tdx->vcpu);
> +}
> +
>  static void tdx_clear_page(unsigned long page)
>  {
>         const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
> @@ -230,9 +256,11 @@ void tdx_mmu_prezap(struct kvm *kvm)
>         struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
>         cpumask_var_t packages;
>         bool cpumask_allocated;
> +       struct kvm_vcpu *vcpu;
>         u64 err;
>         int ret;
>         int i;
> +       unsigned long j;
>
>         if (!is_hkid_assigned(kvm_tdx))
>                 return;
> @@ -248,6 +276,17 @@ void tdx_mmu_prezap(struct kvm *kvm)
>                 return;
>         }
>
> +       kvm_for_each_vcpu(j, vcpu, kvm)
> +               tdx_flush_vp_on_cpu(vcpu);
> +
> +       mutex_lock(&tdx_lock);
> +       err = tdh_mng_vpflushdone(kvm_tdx->tdr.pa);

Hi Isaku,

I am wondering about the impact of failures in these functions.  Is
there any other function which recovers from failures here?
When I look at the tdx_flush_vp function, it seems like it can fail
due to task migration, so tdx_flush_vp_on_cpu might also fail, and if it
fails, tdh_mng_vpflushdone returns an error.  Since tdx_vm_teardown does
not return any error, how can the VMM free the keyid used by this TD?
Will it be forever in the "used" state?
Also, if tdx_vm_teardown fails, kvm_tdx->hkid is never set to -1, which
will prevent tdx_vcpu_free from freeing and reclaiming the resources
allocated for the vcpu.

-Erdem
> +       mutex_unlock(&tdx_lock);
> +       if (WARN_ON_ONCE(err)) {
> +               pr_tdx_error(TDH_MNG_VPFLUSHDONE, err, NULL);
> +               return;
> +       }
> +
>         cpumask_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
>         for_each_online_cpu(i) {
>                 if (cpumask_allocated &&
> @@ -472,8 +511,22 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
>
>  void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>  {
> -       if (vcpu->cpu != cpu)
> -               tdx_flush_vp_on_cpu(vcpu);
> +       struct vcpu_tdx *tdx = to_tdx(vcpu);
> +
> +       if (vcpu->cpu == cpu)
> +               return;
> +
> +       tdx_flush_vp_on_cpu(vcpu);
> +
> +       local_irq_disable();
> +       /*
> +        * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
> +        * vcpu->cpu is read before tdx->cpu_list.
> +        */
> +       smp_rmb();
> +
> +       list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
> +       local_irq_enable();
>  }
>
>  void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
> @@ -522,6 +575,19 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu)
>                 tdx_reclaim_td_page(&tdx->tdvpx[i]);
>         kfree(tdx->tdvpx);
>         tdx_reclaim_td_page(&tdx->tdvpr);
> +
> +       /*
> +        * kvm_free_vcpus()
> +        *   -> kvm_unload_vcpu_mmu()
> +        *
> +        * does vcpu_load() for every vcpu after they have already been
> +        * disassociated from the per-cpu list by tdx_vm_teardown().  So we
> +        * need to disassociate them again, otherwise the freed vcpu data
> +        * would be accessed when list_{del,add}() is later done on the
> +        * associated_tdvcpus list.
> +        */
> +       tdx_flush_vp_on_cpu(vcpu);
> +       WARN_ON(vcpu->cpu != -1);
>  }
>
>  void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index 8b1cf9c158e3..180360a65545 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -81,6 +81,8 @@ struct vcpu_tdx {
>         struct tdx_td_page tdvpr;
>         struct tdx_td_page *tdvpx;
>
> +       struct list_head cpu_list;
> +
>         union tdx_exit_reason exit_reason;
>
>         bool initialized;
> diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
> index ceafd6e18f4e..aae0f4449ec5 100644
> --- a/arch/x86/kvm/vmx/x86_ops.h
> +++ b/arch/x86/kvm/vmx/x86_ops.h
> @@ -132,6 +132,8 @@ void __init tdx_pre_kvm_init(unsigned int *vcpu_size,
>  bool tdx_is_vm_type_supported(unsigned long type);
>  void __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops);
>  void tdx_hardware_unsetup(void);
> +void tdx_hardware_enable(void);
> +void tdx_hardware_disable(void);
>
>  int tdx_vm_init(struct kvm *kvm);
>  void tdx_mmu_prezap(struct kvm *kvm);
> @@ -156,6 +158,8 @@ static inline void tdx_pre_kvm_init(
>  static inline bool tdx_is_vm_type_supported(unsigned long type) { return false; }
>  static inline void tdx_hardware_setup(struct kvm_x86_ops *x86_ops) {}
>  static inline void tdx_hardware_unsetup(void) {}
> +static inline void tdx_hardware_enable(void) {}
> +static inline void tdx_hardware_disable(void) {}
>
>  static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; }
>  static inline void tdx_mmu_prezap(struct kvm *kvm) {}
> --
> 2.25.1
>
Isaku Yamahata March 23, 2022, 7:08 p.m. UTC | #2
On Tue, Mar 22, 2022 at 05:54:45PM -0700,
Erdem Aktas <erdemaktas@google.com> wrote:

> On Fri, Mar 4, 2022 at 11:50 AM <isaku.yamahata@intel.com> wrote:
> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index a6b1a8ce888d..690298fb99c7 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -48,6 +48,14 @@ struct tdx_capabilities tdx_caps;
> >  static DEFINE_MUTEX(tdx_lock);
> >  static struct mutex *tdx_mng_key_config_lock;
> >
> > +/*
> > + * A per-CPU list of TD vCPUs associated with a given CPU.  Used when a CPU
> > + * is brought down to invoke TDH_VP_FLUSH on the appropriate TD vCPUs.
> > + * Protected by interrupt mask.  This list is manipulated in process context
> > + * of vcpu and IPI callback.  See tdx_flush_vp_on_cpu().
> > + */
> > +static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
> > +
> >  static u64 hkid_mask __ro_after_init;
> >  static u8 hkid_start_pos __ro_after_init;
> >
> > @@ -87,6 +95,8 @@ static inline bool is_td_finalized(struct kvm_tdx *kvm_tdx)
> >
> >  static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
> >  {
> > +       list_del(&to_tdx(vcpu)->cpu_list);
> > +
> >         /*
> >          * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
> >          * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
> > @@ -97,6 +107,22 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
> >         vcpu->cpu = -1;
> >  }
> >
> > +void tdx_hardware_enable(void)
> > +{
> > +       INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, raw_smp_processor_id()));
> > +}
> > +
> > +void tdx_hardware_disable(void)
> > +{
> > +       int cpu = raw_smp_processor_id();
> > +       struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
> > +       struct vcpu_tdx *tdx, *tmp;
> > +
> > +       /* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
> > +       list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list)
> > +               tdx_disassociate_vp(&tdx->vcpu);
> > +}
> > +
> >  static void tdx_clear_page(unsigned long page)
> >  {
> >         const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
> > @@ -230,9 +256,11 @@ void tdx_mmu_prezap(struct kvm *kvm)
> >         struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> >         cpumask_var_t packages;
> >         bool cpumask_allocated;
> > +       struct kvm_vcpu *vcpu;
> >         u64 err;
> >         int ret;
> >         int i;
> > +       unsigned long j;
> >
> >         if (!is_hkid_assigned(kvm_tdx))
> >                 return;
> > @@ -248,6 +276,17 @@ void tdx_mmu_prezap(struct kvm *kvm)
> >                 return;
> >         }
> >
> > +       kvm_for_each_vcpu(j, vcpu, kvm)
> > +               tdx_flush_vp_on_cpu(vcpu);
> > +
> > +       mutex_lock(&tdx_lock);
> > +       err = tdh_mng_vpflushdone(kvm_tdx->tdr.pa);
> 
> Hi Isaku,

Hi.


> I am wondering about the impact of failures in these functions.  Is
> there any other function which recovers from failures here?
> When I look at the tdx_flush_vp function, it seems like it can fail
> due to task migration, so tdx_flush_vp_on_cpu might also fail, and if it
> fails, tdh_mng_vpflushdone returns an error.  Since tdx_vm_teardown does
> not return any error, how can the VMM free the keyid used by this TD?
> Will it be forever in the "used" state?
> Also, if tdx_vm_teardown fails, kvm_tdx->hkid is never set to -1, which
> will prevent tdx_vcpu_free from freeing and reclaiming the resources
> allocated for the vcpu.

mmu_prezap() is called via the release callback of the mmu notifier when the
last mmu reference of this process is dropped.  That happens after all kvm
vcpu fds and the kvm vm fd have been closed, so the vcpus will never run
again, but we still hold the kvm_vcpu structures.  There is no race between
tdh_vp_flush()/tdh_mng_vpflushdone() here and process migration, so
tdh_vp_flush()/tdh_mng_vpflushdone() should succeed.

The cpuid check in tdx_flush_vp() is for vcpu_load(), which may race with
process migration.

Anyway, what if one of those TDX seamcalls fails?  The HKID is leaked and
will never be used again because there is no good way to free and reuse an
HKID safely.  Such a failure would be due to an unknown issue and probably
a bug.

One mitigation is to add a pr_err() when an HKID leak happens.  I'll add
such a message on the next respin.
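
Something like the following, as an untested sketch (the exact wording and
whether to print kvm_tdx->hkid are still to be decided), on top of the
existing error path in tdx_mmu_prezap():

	mutex_lock(&tdx_lock);
	err = tdh_mng_vpflushdone(kvm_tdx->tdr.pa);
	mutex_unlock(&tdx_lock);
	if (WARN_ON_ONCE(err)) {
		pr_tdx_error(TDH_MNG_VPFLUSHDONE, err, NULL);
		/* No safe way to reuse the HKID once teardown fails. */
		pr_err("tdx: TD teardown failed, HKID %d is leaked and unusable\n",
		       kvm_tdx->hkid);
		return;
	}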

thanks,
Erdem Aktas March 23, 2022, 8:17 p.m. UTC | #3
So tdh_vp_flush() should always succeed while the VM is being torn down.
Thanks, Isaku, for the explanation; I think it would be great to add the
error message.

-Erdem

On Wed, Mar 23, 2022 at 12:08 PM Isaku Yamahata
<isaku.yamahata@gmail.com> wrote:
>
> On Tue, Mar 22, 2022 at 05:54:45PM -0700,
> Erdem Aktas <erdemaktas@google.com> wrote:
>
> > On Fri, Mar 4, 2022 at 11:50 AM <isaku.yamahata@intel.com> wrote:
> > > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > > index a6b1a8ce888d..690298fb99c7 100644
> > > --- a/arch/x86/kvm/vmx/tdx.c
> > > +++ b/arch/x86/kvm/vmx/tdx.c
> > > @@ -48,6 +48,14 @@ struct tdx_capabilities tdx_caps;
> > >  static DEFINE_MUTEX(tdx_lock);
> > >  static struct mutex *tdx_mng_key_config_lock;
> > >
> > > +/*
> > > + * A per-CPU list of TD vCPUs associated with a given CPU.  Used when a CPU
> > > + * is brought down to invoke TDH_VP_FLUSH on the appropriate TD vCPUs.
> > > + * Protected by interrupt mask.  This list is manipulated in process context
> > > + * of vcpu and IPI callback.  See tdx_flush_vp_on_cpu().
> > > + */
> > > +static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
> > > +
> > >  static u64 hkid_mask __ro_after_init;
> > >  static u8 hkid_start_pos __ro_after_init;
> > >
> > > @@ -87,6 +95,8 @@ static inline bool is_td_finalized(struct kvm_tdx *kvm_tdx)
> > >
> > >  static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
> > >  {
> > > +       list_del(&to_tdx(vcpu)->cpu_list);
> > > +
> > >         /*
> > >          * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
> > >          * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
> > > @@ -97,6 +107,22 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
> > >         vcpu->cpu = -1;
> > >  }
> > >
> > > +void tdx_hardware_enable(void)
> > > +{
> > > +       INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, raw_smp_processor_id()));
> > > +}
> > > +
> > > +void tdx_hardware_disable(void)
> > > +{
> > > +       int cpu = raw_smp_processor_id();
> > > +       struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
> > > +       struct vcpu_tdx *tdx, *tmp;
> > > +
> > > +       /* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
> > > +       list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list)
> > > +               tdx_disassociate_vp(&tdx->vcpu);
> > > +}
> > > +
> > >  static void tdx_clear_page(unsigned long page)
> > >  {
> > >         const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
> > > @@ -230,9 +256,11 @@ void tdx_mmu_prezap(struct kvm *kvm)
> > >         struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> > >         cpumask_var_t packages;
> > >         bool cpumask_allocated;
> > > +       struct kvm_vcpu *vcpu;
> > >         u64 err;
> > >         int ret;
> > >         int i;
> > > +       unsigned long j;
> > >
> > >         if (!is_hkid_assigned(kvm_tdx))
> > >                 return;
> > > @@ -248,6 +276,17 @@ void tdx_mmu_prezap(struct kvm *kvm)
> > >                 return;
> > >         }
> > >
> > > +       kvm_for_each_vcpu(j, vcpu, kvm)
> > > +               tdx_flush_vp_on_cpu(vcpu);
> > > +
> > > +       mutex_lock(&tdx_lock);
> > > +       err = tdh_mng_vpflushdone(kvm_tdx->tdr.pa);
> >
> > Hi Isaku,
>
> Hi.
>
>
> > I am wondering about the impact of failures in these functions.  Is
> > there any other function which recovers from failures here?
> > When I look at the tdx_flush_vp function, it seems like it can fail
> > due to task migration, so tdx_flush_vp_on_cpu might also fail, and if it
> > fails, tdh_mng_vpflushdone returns an error.  Since tdx_vm_teardown does
> > not return any error, how can the VMM free the keyid used by this TD?
> > Will it be forever in the "used" state?
> > Also, if tdx_vm_teardown fails, kvm_tdx->hkid is never set to -1, which
> > will prevent tdx_vcpu_free from freeing and reclaiming the resources
> > allocated for the vcpu.
>
> mmu_prezap() is called via the release callback of the mmu notifier when the
> last mmu reference of this process is dropped.  That happens after all kvm
> vcpu fds and the kvm vm fd have been closed, so the vcpus will never run
> again, but we still hold the kvm_vcpu structures.  There is no race between
> tdh_vp_flush()/tdh_mng_vpflushdone() here and process migration, so
> tdh_vp_flush()/tdh_mng_vpflushdone() should succeed.
>
> The cpuid check in tdx_flush_vp() is for vcpu_load(), which may race with
> process migration.
>
> Anyway, what if one of those TDX seamcalls fails?  The HKID is leaked and
> will never be used again because there is no good way to free and reuse an
> HKID safely.  Such a failure would be due to an unknown issue and probably
> a bug.
>
> One mitigation is to add a pr_err() when an HKID leak happens.  I'll add
> such a message on the next respin.
>
> thanks,
> --
> Isaku Yamahata <isaku.yamahata@gmail.com>

Patch

diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 2cd5ba0e8788..882358ac270b 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -13,6 +13,25 @@  static bool vt_is_vm_type_supported(unsigned long type)
 	return type == KVM_X86_DEFAULT_VM || tdx_is_vm_type_supported(type);
 }
 
+static int vt_hardware_enable(void)
+{
+	int ret;
+
+	ret = vmx_hardware_enable();
+	if (ret)
+		return ret;
+
+	tdx_hardware_enable();
+	return 0;
+}
+
+static void vt_hardware_disable(void)
+{
+	/* Note, TDX *and* VMX need to be disabled if TDX is enabled. */
+	tdx_hardware_disable();
+	vmx_hardware_disable();
+}
+
 static __init int vt_hardware_setup(void)
 {
 	int ret;
@@ -199,8 +218,8 @@  struct kvm_x86_ops vt_x86_ops __initdata = {
 
 	.hardware_unsetup = vt_hardware_unsetup,
 
-	.hardware_enable = vmx_hardware_enable,
-	.hardware_disable = vmx_hardware_disable,
+	.hardware_enable = vt_hardware_enable,
+	.hardware_disable = vt_hardware_disable,
 	.cpu_has_accelerated_tpr = report_flexpriority,
 	.has_emulated_msr = vmx_has_emulated_msr,
 
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index a6b1a8ce888d..690298fb99c7 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -48,6 +48,14 @@  struct tdx_capabilities tdx_caps;
 static DEFINE_MUTEX(tdx_lock);
 static struct mutex *tdx_mng_key_config_lock;
 
+/*
+ * A per-CPU list of TD vCPUs associated with a given CPU.  Used when a CPU
+ * is brought down to invoke TDH_VP_FLUSH on the appropriate TD vCPUs.
+ * Protected by interrupt mask.  This list is manipulated in process context
+ * of vcpu and IPI callback.  See tdx_flush_vp_on_cpu().
+ */
+static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
+
 static u64 hkid_mask __ro_after_init;
 static u8 hkid_start_pos __ro_after_init;
 
@@ -87,6 +95,8 @@  static inline bool is_td_finalized(struct kvm_tdx *kvm_tdx)
 
 static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
 {
+	list_del(&to_tdx(vcpu)->cpu_list);
+
 	/*
 	 * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
 	 * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
@@ -97,6 +107,22 @@  static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
 	vcpu->cpu = -1;
 }
 
+void tdx_hardware_enable(void)
+{
+	INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, raw_smp_processor_id()));
+}
+
+void tdx_hardware_disable(void)
+{
+	int cpu = raw_smp_processor_id();
+	struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
+	struct vcpu_tdx *tdx, *tmp;
+
+	/* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
+	list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list)
+		tdx_disassociate_vp(&tdx->vcpu);
+}
+
 static void tdx_clear_page(unsigned long page)
 {
 	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
@@ -230,9 +256,11 @@  void tdx_mmu_prezap(struct kvm *kvm)
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
 	cpumask_var_t packages;
 	bool cpumask_allocated;
+	struct kvm_vcpu *vcpu;
 	u64 err;
 	int ret;
 	int i;
+	unsigned long j;
 
 	if (!is_hkid_assigned(kvm_tdx))
 		return;
@@ -248,6 +276,17 @@  void tdx_mmu_prezap(struct kvm *kvm)
 		return;
 	}
 
+	kvm_for_each_vcpu(j, vcpu, kvm)
+		tdx_flush_vp_on_cpu(vcpu);
+
+	mutex_lock(&tdx_lock);
+	err = tdh_mng_vpflushdone(kvm_tdx->tdr.pa);
+	mutex_unlock(&tdx_lock);
+	if (WARN_ON_ONCE(err)) {
+		pr_tdx_error(TDH_MNG_VPFLUSHDONE, err, NULL);
+		return;
+	}
+
 	cpumask_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
 	for_each_online_cpu(i) {
 		if (cpumask_allocated &&
@@ -472,8 +511,22 @@  int tdx_vcpu_create(struct kvm_vcpu *vcpu)
 
 void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-	if (vcpu->cpu != cpu)
-		tdx_flush_vp_on_cpu(vcpu);
+	struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+	if (vcpu->cpu == cpu)
+		return;
+
+	tdx_flush_vp_on_cpu(vcpu);
+
+	local_irq_disable();
+	/*
+	 * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
+	 * vcpu->cpu is read before tdx->cpu_list.
+	 */
+	smp_rmb();
+
+	list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
+	local_irq_enable();
 }
 
 void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
@@ -522,6 +575,19 @@  void tdx_vcpu_free(struct kvm_vcpu *vcpu)
 		tdx_reclaim_td_page(&tdx->tdvpx[i]);
 	kfree(tdx->tdvpx);
 	tdx_reclaim_td_page(&tdx->tdvpr);
+
+	/*
+	 * kvm_free_vcpus()
+	 *   -> kvm_unload_vcpu_mmu()
+	 *
+	 * does vcpu_load() for every vcpu after they have already been
+	 * disassociated from the per-cpu list by tdx_vm_teardown().  So we
+	 * need to disassociate them again, otherwise the freed vcpu data
+	 * would be accessed when list_{del,add}() is later done on the
+	 * associated_tdvcpus list.
+	 */
+	tdx_flush_vp_on_cpu(vcpu);
+	WARN_ON(vcpu->cpu != -1);
 }
 
 void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 8b1cf9c158e3..180360a65545 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -81,6 +81,8 @@  struct vcpu_tdx {
 	struct tdx_td_page tdvpr;
 	struct tdx_td_page *tdvpx;
 
+	struct list_head cpu_list;
+
 	union tdx_exit_reason exit_reason;
 
 	bool initialized;
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index ceafd6e18f4e..aae0f4449ec5 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -132,6 +132,8 @@  void __init tdx_pre_kvm_init(unsigned int *vcpu_size,
 bool tdx_is_vm_type_supported(unsigned long type);
 void __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops);
 void tdx_hardware_unsetup(void);
+void tdx_hardware_enable(void);
+void tdx_hardware_disable(void);
 
 int tdx_vm_init(struct kvm *kvm);
 void tdx_mmu_prezap(struct kvm *kvm);
@@ -156,6 +158,8 @@  static inline void tdx_pre_kvm_init(
 static inline bool tdx_is_vm_type_supported(unsigned long type) { return false; }
 static inline void tdx_hardware_setup(struct kvm_x86_ops *x86_ops) {}
 static inline void tdx_hardware_unsetup(void) {}
+static inline void tdx_hardware_enable(void) {}
+static inline void tdx_hardware_disable(void) {}
 
 static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; }
 static inline void tdx_mmu_prezap(struct kvm *kvm) {}
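
The smp_rmb() in tdx_vcpu_load() above pairs with an smp_wmb() in
tdx_disassociate_vp() that is outside the context lines shown in this diff.
A minimal sketch of the intended pairing (the exact placement of the
smp_wmb() is an assumption based on the comments, not the code from the
series):

	/* Writer side, tdx_disassociate_vp() (smp_wmb() placement assumed): */
	list_del(&to_tdx(vcpu)->cpu_list);
	/*
	 * Make the cpu_list removal visible before vcpu->cpu is set to -1,
	 * so another CPU cannot observe vcpu->cpu == -1 and re-add the vCPU
	 * while it still appears on the old per-CPU list.
	 */
	smp_wmb();
	vcpu->cpu = -1;

	/* Reader side, tdx_vcpu_load() (as in the patch above): */
	local_irq_disable();
	/* Pairs with the smp_wmb() above: vcpu->cpu is read before tdx->cpu_list. */
	smp_rmb();
	list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
	local_irq_enable();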