
[v19,087/130] KVM: TDX: handle vcpu migration over logical processor

Message ID b9fe57ceeaabe650f0aecb21db56ef2b1456dcfe.1708933498.git.isaku.yamahata@intel.com (mailing list archive)
State New, archived
Series [v19,001/130] x86/virt/tdx: Rename _offset to _member for TD_SYSINFO_MAP() macro

Commit Message

Isaku Yamahata Feb. 26, 2024, 8:26 a.m. UTC
From: Isaku Yamahata <isaku.yamahata@intel.com>

For vcpu migration, in the case of VMX, VMCS is flushed on the source pcpu,
and load it on the target pcpu.  There are corresponding TDX SEAMCALL APIs,
call them on vcpu migration.  The logic is mostly same as VMX except the
TDX SEAMCALLs are used.

When shutting down the machine, (VMX or TDX) vcpus needs to be shutdown on
each pcpu.  Do the similar for TDX with TDX SEAMCALL APIs.

Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 arch/x86/kvm/vmx/main.c    |  32 ++++++-
 arch/x86/kvm/vmx/tdx.c     | 190 ++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/vmx/tdx.h     |   2 +
 arch/x86/kvm/vmx/x86_ops.h |   4 +
 4 files changed, 221 insertions(+), 7 deletions(-)

Comments

Binbin Wu April 7, 2024, 9:03 a.m. UTC | #1
On 2/26/2024 4:26 PM, isaku.yamahata@intel.com wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
>
> For vcpu migration, in the case of VMX, VMCS is flushed on the source pcpu,
> and load it on the target pcpu.  There are corresponding TDX SEAMCALL APIs,
> call them on vcpu migration.  The logic is mostly same as VMX except the
> TDX SEAMCALLs are used.
>
> When shutting down the machine,

"the machine" -> "a VM"

>   (VMX or TDX) vcpus needs to be shutdown on
                    ^
                    need
> each pcpu.  Do the similar for TDX with TDX SEAMCALL APIs.
>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>   arch/x86/kvm/vmx/main.c    |  32 ++++++-
>   arch/x86/kvm/vmx/tdx.c     | 190 ++++++++++++++++++++++++++++++++++++-
>   arch/x86/kvm/vmx/tdx.h     |   2 +
>   arch/x86/kvm/vmx/x86_ops.h |   4 +
>   4 files changed, 221 insertions(+), 7 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> index 8275a242ce07..9b336c1a6508 100644
> --- a/arch/x86/kvm/vmx/main.c
> +++ b/arch/x86/kvm/vmx/main.c
> @@ -33,6 +33,14 @@ static int vt_max_vcpus(struct kvm *kvm)
>   static int vt_flush_remote_tlbs(struct kvm *kvm);
>   #endif
>   
> +static void vt_hardware_disable(void)
> +{
> +	/* Note, TDX *and* VMX need to be disabled if TDX is enabled. */
> +	if (enable_tdx)
> +		tdx_hardware_disable();
> +	vmx_hardware_disable();
> +}
> +
>   static __init int vt_hardware_setup(void)
>   {
>   	int ret;
> @@ -201,6 +209,16 @@ static fastpath_t vt_vcpu_run(struct kvm_vcpu *vcpu)
>   	return vmx_vcpu_run(vcpu);
>   }
>   
> +static void vt_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> +{
> +	if (is_td_vcpu(vcpu)) {
> +		tdx_vcpu_load(vcpu, cpu);
> +		return;
> +	}
> +
> +	vmx_vcpu_load(vcpu, cpu);
> +}
> +
>   static void vt_flush_tlb_all(struct kvm_vcpu *vcpu)
>   {
>   	if (is_td_vcpu(vcpu)) {
> @@ -262,6 +280,14 @@ static void vt_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
>   	vmx_load_mmu_pgd(vcpu, root_hpa, pgd_level);
>   }
>   
> +static void vt_sched_in(struct kvm_vcpu *vcpu, int cpu)
> +{
> +	if (is_td_vcpu(vcpu))
> +		return;
> +
> +	vmx_sched_in(vcpu, cpu);
> +}
> +
>   static u8 vt_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
>   {
>   	if (is_td_vcpu(vcpu))
> @@ -335,7 +361,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
>   
>   	/* TDX cpu enablement is done by tdx_hardware_setup(). */
>   	.hardware_enable = vmx_hardware_enable,
> -	.hardware_disable = vmx_hardware_disable,
> +	.hardware_disable = vt_hardware_disable,
>   	.has_emulated_msr = vmx_has_emulated_msr,
>   
>   	.is_vm_type_supported = vt_is_vm_type_supported,
> @@ -353,7 +379,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
>   	.vcpu_reset = vt_vcpu_reset,
>   
>   	.prepare_switch_to_guest = vt_prepare_switch_to_guest,
> -	.vcpu_load = vmx_vcpu_load,
> +	.vcpu_load = vt_vcpu_load,
>   	.vcpu_put = vt_vcpu_put,
>   
>   	.update_exception_bitmap = vmx_update_exception_bitmap,
> @@ -440,7 +466,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
>   
>   	.request_immediate_exit = vmx_request_immediate_exit,
>   
> -	.sched_in = vmx_sched_in,
> +	.sched_in = vt_sched_in,
>   
>   	.cpu_dirty_log_size = PML_ENTITY_NUM,
>   	.update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index ad4d3d4eaf6c..7aa9188f384d 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -106,6 +106,14 @@ static DEFINE_MUTEX(tdx_lock);
>   static struct mutex *tdx_mng_key_config_lock;
>   static atomic_t nr_configured_hkid;
>   
> +/*
> + * A per-CPU list of TD vCPUs associated with a given CPU.  Used when a CPU
> + * is brought down

Not necessarily when a CPU is brought down.
It will also be triggered by the last VM being destroyed.

But on second thought, if it is triggered by the last-VM case, the list
should be empty already.
So I am OK with the description.


>   to invoke TDH_VP_FLUSH on the approapriate TD vCPUS.

"approapriate" -> "appropriate"

> + * Protected by interrupt mask.  This list is manipulated in process context
> + * of vcpu and IPI callback.  See tdx_flush_vp_on_cpu().
> + */
> +static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
> +
>   static __always_inline hpa_t set_hkid_to_hpa(hpa_t pa, u16 hkid)
>   {
>   	return pa | ((hpa_t)hkid << boot_cpu_data.x86_phys_bits);
> @@ -138,6 +146,37 @@ static inline bool is_td_finalized(struct kvm_tdx *kvm_tdx)
>   	return kvm_tdx->finalized;
>   }
>   
> +static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
> +{
> +	lockdep_assert_irqs_disabled();
> +
> +	list_del(&to_tdx(vcpu)->cpu_list);
> +
> +	/*
> +	 * Ensure tdx->cpu_list is updated is before setting vcpu->cpu to -1,
> +	 * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
> +	 * to its list before its deleted from this CPUs list.
> +	 */
> +	smp_wmb();
> +
> +	vcpu->cpu = -1;
> +}
> +
> +static void tdx_disassociate_vp_arg(void *vcpu)
> +{
> +	tdx_disassociate_vp(vcpu);
> +}
> +
> +static void tdx_disassociate_vp_on_cpu(struct kvm_vcpu *vcpu)
> +{
> +	int cpu = vcpu->cpu;
> +
> +	if (unlikely(cpu == -1))
> +		return;
> +
> +	smp_call_function_single(cpu, tdx_disassociate_vp_arg, vcpu, 1);
> +}
> +
>   static void tdx_clear_page(unsigned long page_pa)
>   {
>   	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
> @@ -218,6 +257,87 @@ static void tdx_reclaim_control_page(unsigned long td_page_pa)
>   	free_page((unsigned long)__va(td_page_pa));
>   }
>   
> +struct tdx_flush_vp_arg {
> +	struct kvm_vcpu *vcpu;
> +	u64 err;
> +};
> +
> +static void tdx_flush_vp(void *arg_)

It is more common to use "_arg" instead of "arg_".

> +{
> +	struct tdx_flush_vp_arg *arg = arg_;
> +	struct kvm_vcpu *vcpu = arg->vcpu;
> +	u64 err;
> +
> +	arg->err = 0;
> +	lockdep_assert_irqs_disabled();
> +
> +	/* Task migration can race with CPU offlining. */
> +	if (unlikely(vcpu->cpu != raw_smp_processor_id()))
> +		return;
> +
> +	/*
> +	 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized.  The
> +	 * list tracking still needs to be updated so that it's correct if/when
> +	 * the vCPU does get initialized.
> +	 */
> +	if (is_td_vcpu_created(to_tdx(vcpu))) {
> +		/*
> +		 * No need to retry.  TDX Resources needed for TDH.VP.FLUSH are,
> +		 * TDVPR as exclusive, TDR as shared, and TDCS as shared.  This
> +		 * vp flush function is called when destructing vcpu/TD or vcpu
> +		 * migration.  No other thread uses TDVPR in those cases.
> +		 */
> +		err = tdh_vp_flush(to_tdx(vcpu)->tdvpr_pa);
> +		if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) {
> +			/*
> +			 * This function is called in IPI context. Do not use
> +			 * printk to avoid console semaphore.
> +			 * The caller prints out the error message, instead.
> +			 */
> +			if (err)
> +				arg->err = err;
> +		}
> +	}
> +
> +	tdx_disassociate_vp(vcpu);
> +}
> +
> +static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
> +{
> +	struct tdx_flush_vp_arg arg = {
> +		.vcpu = vcpu,
> +	};
> +	int cpu = vcpu->cpu;
> +
> +	if (unlikely(cpu == -1))
> +		return;
> +
> +	smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
> +	if (WARN_ON_ONCE(arg.err)) {
> +		pr_err("cpu: %d ", cpu);
> +		pr_tdx_error(TDH_VP_FLUSH, arg.err, NULL);
> +	}
> +}
> +
> +void tdx_hardware_disable(void)
> +{
> +	int cpu = raw_smp_processor_id();
> +	struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
> +	struct tdx_flush_vp_arg arg;
> +	struct vcpu_tdx *tdx, *tmp;
> +	unsigned long flags;
> +
> +	lockdep_assert_preemption_disabled();
> +
> +	local_irq_save(flags);
> +	/* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
> +	list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) {
> +		arg.vcpu = &tdx->vcpu;
> +		tdx_flush_vp(&arg);
> +	}
> +	local_irq_restore(flags);
> +}
> +
>   static void tdx_do_tdh_phymem_cache_wb(void *unused)
>   {
>   	u64 err = 0;
> @@ -233,26 +353,31 @@ static void tdx_do_tdh_phymem_cache_wb(void *unused)
>   		pr_tdx_error(TDH_PHYMEM_CACHE_WB, err, NULL);
>   }
>   
> -void tdx_mmu_release_hkid(struct kvm *kvm)
> +static int __tdx_mmu_release_hkid(struct kvm *kvm)
>   {
>   	bool packages_allocated, targets_allocated;
>   	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
>   	cpumask_var_t packages, targets;
> +	struct kvm_vcpu *vcpu;
> +	unsigned long j;
> +	int i, ret = 0;
>   	u64 err;
> -	int i;
>   
>   	if (!is_hkid_assigned(kvm_tdx))
> -		return;
> +		return 0;
>   
>   	if (!is_td_created(kvm_tdx)) {
>   		tdx_hkid_free(kvm_tdx);
> -		return;
> +		return 0;
>   	}
>   
>   	packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
>   	targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
>   	cpus_read_lock();
>   
> +	kvm_for_each_vcpu(j, vcpu, kvm)
> +		tdx_flush_vp_on_cpu(vcpu);
> +
>   	/*
>   	 * We can destroy multiple guest TDs simultaneously.  Prevent
>   	 * tdh_phymem_cache_wb from returning TDX_BUSY by serialization.
> @@ -270,6 +395,19 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
>   	 */
>   	write_lock(&kvm->mmu_lock);
>   
> +	err = tdh_mng_vpflushdone(kvm_tdx->tdr_pa);
> +	if (err == TDX_FLUSHVP_NOT_DONE) {
> +		ret = -EBUSY;
> +		goto out;
> +	}
> +	if (WARN_ON_ONCE(err)) {
> +		pr_tdx_error(TDH_MNG_VPFLUSHDONE, err, NULL);
> +		pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
> +		       kvm_tdx->hkid);
> +		ret = -EIO;
> +		goto out;
> +	}
> +
>   	for_each_online_cpu(i) {
>   		if (packages_allocated &&
>   		    cpumask_test_and_set_cpu(topology_physical_package_id(i),
> @@ -291,14 +429,24 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
>   		pr_tdx_error(TDH_MNG_KEY_FREEID, err, NULL);
>   		pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
>   		       kvm_tdx->hkid);
> +		ret = -EIO;
>   	} else
>   		tdx_hkid_free(kvm_tdx);
>   
> +out:
>   	write_unlock(&kvm->mmu_lock);
>   	mutex_unlock(&tdx_lock);
>   	cpus_read_unlock();
>   	free_cpumask_var(targets);
>   	free_cpumask_var(packages);
> +
> +	return ret;
> +}
> +
> +void tdx_mmu_release_hkid(struct kvm *kvm)
> +{
> +	while (__tdx_mmu_release_hkid(kvm) == -EBUSY)
> +		;
>   }
>   
>   void tdx_vm_free(struct kvm *kvm)
> @@ -455,6 +603,26 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
>   	return 0;
>   }
>   
> +void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> +{
> +	struct vcpu_tdx *tdx = to_tdx(vcpu);
> +
> +	if (vcpu->cpu == cpu)
> +		return;
> +
> +	tdx_flush_vp_on_cpu(vcpu);
> +
> +	local_irq_disable();
> +	/*
> +	 * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
> +	 * vcpu->cpu is read before tdx->cpu_list.
> +	 */
> +	smp_rmb();
> +
> +	list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
> +	local_irq_enable();
> +}
> +
>   void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
>   {
>   	struct vcpu_tdx *tdx = to_tdx(vcpu);
> @@ -495,6 +663,16 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu)
>   	struct vcpu_tdx *tdx = to_tdx(vcpu);
>   	int i;
>   
> +	/*
> +	 * When destroying VM, kvm_unload_vcpu_mmu() calls vcpu_load() for every
> +	 * vcpu after they already disassociated from the per cpu list by
> +	 * tdx_mmu_release_hkid().  So we need to disassociate them again,
> +	 * otherwise the freed vcpu data will be accessed when do
> +	 * list_{del,add}() on associated_tdvcpus list later.
> +	 */
> +	tdx_disassociate_vp_on_cpu(vcpu);
> +	WARN_ON_ONCE(vcpu->cpu != -1);
> +
>   	/*
>   	 * This methods can be called when vcpu allocation/initialization
>   	 * failed. So it's possible that hkid, tdvpx and tdvpr are not assigned
> @@ -2030,6 +2208,10 @@ int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
>   		return -EINVAL;
>   	}
>   
> +	/* tdx_hardware_disable() uses associated_tdvcpus. */
> +	for_each_possible_cpu(i)
> +		INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));
> +
>   	for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
>   		/*
>   		 * Here it checks if MSRs (tdx_uret_msrs) can be saved/restored
> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index 0d8a98feb58e..7f8c78f06508 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -73,6 +73,8 @@ struct vcpu_tdx {
>   	unsigned long *tdvpx_pa;
>   	bool td_vcpu_created;
>   
> +	struct list_head cpu_list;
> +
>   	union tdx_exit_reason exit_reason;
>   
>   	bool initialized;
> diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
> index 9fd997c79c33..5853f29f0af3 100644
> --- a/arch/x86/kvm/vmx/x86_ops.h
> +++ b/arch/x86/kvm/vmx/x86_ops.h
> @@ -137,6 +137,7 @@ void vmx_setup_mce(struct kvm_vcpu *vcpu);
>   #ifdef CONFIG_INTEL_TDX_HOST
>   int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops);
>   void tdx_hardware_unsetup(void);
> +void tdx_hardware_disable(void);
>   bool tdx_is_vm_type_supported(unsigned long type);
>   int tdx_offline_cpu(void);
>   
> @@ -153,6 +154,7 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
>   fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu);
>   void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
>   void tdx_vcpu_put(struct kvm_vcpu *vcpu);
> +void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
>   u8 tdx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
>   
>   int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
> @@ -171,6 +173,7 @@ void tdx_post_memory_mapping(struct kvm_vcpu *vcpu,
>   #else
>   static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return -EOPNOTSUPP; }
>   static inline void tdx_hardware_unsetup(void) {}
> +static inline void tdx_hardware_disable(void) {}
>   static inline bool tdx_is_vm_type_supported(unsigned long type) { return false; }
>   static inline int tdx_offline_cpu(void) { return 0; }
>   
> @@ -190,6 +193,7 @@ static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {}
>   static inline fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu) { return EXIT_FASTPATH_NONE; }
>   static inline void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) {}
>   static inline void tdx_vcpu_put(struct kvm_vcpu *vcpu) {}
> +static inline void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) {}
>   static inline u8 tdx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { return 0; }
>   
>   static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; }
Reinette Chatre April 12, 2024, 4:15 p.m. UTC | #2
Hi Isaku,

On 2/26/2024 12:26 AM, isaku.yamahata@intel.com wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>

...

> @@ -218,6 +257,87 @@ static void tdx_reclaim_control_page(unsigned long td_page_pa)
>  	free_page((unsigned long)__va(td_page_pa));
>  }
>  
> +struct tdx_flush_vp_arg {
> +	struct kvm_vcpu *vcpu;
> +	u64 err;
> +};
> +
> +static void tdx_flush_vp(void *arg_)
> +{
> +	struct tdx_flush_vp_arg *arg = arg_;
> +	struct kvm_vcpu *vcpu = arg->vcpu;
> +	u64 err;
> +
> +	arg->err = 0;
> +	lockdep_assert_irqs_disabled();
> +
> +	/* Task migration can race with CPU offlining. */
> +	if (unlikely(vcpu->cpu != raw_smp_processor_id()))
> +		return;
> +
> +	/*
> +	 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized.  The
> +	 * list tracking still needs to be updated so that it's correct if/when
> +	 * the vCPU does get initialized.
> +	 */
> +	if (is_td_vcpu_created(to_tdx(vcpu))) {
> +		/*
> +		 * No need to retry.  TDX Resources needed for TDH.VP.FLUSH are,
> +		 * TDVPR as exclusive, TDR as shared, and TDCS as shared.  This
> +		 * vp flush function is called when destructing vcpu/TD or vcpu
> +		 * migration.  No other thread uses TDVPR in those cases.
> +		 */

(I have a comment later that refers back to this comment about needing retry.)

...

> @@ -233,26 +353,31 @@ static void tdx_do_tdh_phymem_cache_wb(void *unused)
>  		pr_tdx_error(TDH_PHYMEM_CACHE_WB, err, NULL);
>  }
>  
> -void tdx_mmu_release_hkid(struct kvm *kvm)
> +static int __tdx_mmu_release_hkid(struct kvm *kvm)
>  {
>  	bool packages_allocated, targets_allocated;
>  	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
>  	cpumask_var_t packages, targets;
> +	struct kvm_vcpu *vcpu;
> +	unsigned long j;
> +	int i, ret = 0;
>  	u64 err;
> -	int i;
>  
>  	if (!is_hkid_assigned(kvm_tdx))
> -		return;
> +		return 0;
>  
>  	if (!is_td_created(kvm_tdx)) {
>  		tdx_hkid_free(kvm_tdx);
> -		return;
> +		return 0;
>  	}
>  
>  	packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
>  	targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
>  	cpus_read_lock();
>  
> +	kvm_for_each_vcpu(j, vcpu, kvm)
> +		tdx_flush_vp_on_cpu(vcpu);
> +
>  	/*
>  	 * We can destroy multiple guest TDs simultaneously.  Prevent
>  	 * tdh_phymem_cache_wb from returning TDX_BUSY by serialization.
> @@ -270,6 +395,19 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
>  	 */
>  	write_lock(&kvm->mmu_lock);
>  
> +	err = tdh_mng_vpflushdone(kvm_tdx->tdr_pa);
> +	if (err == TDX_FLUSHVP_NOT_DONE) {
> +		ret = -EBUSY;
> +		goto out;
> +	}
> +	if (WARN_ON_ONCE(err)) {
> +		pr_tdx_error(TDH_MNG_VPFLUSHDONE, err, NULL);
> +		pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
> +		       kvm_tdx->hkid);
> +		ret = -EIO;
> +		goto out;
> +	}
> +
>  	for_each_online_cpu(i) {
>  		if (packages_allocated &&
>  		    cpumask_test_and_set_cpu(topology_physical_package_id(i),
> @@ -291,14 +429,24 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
>  		pr_tdx_error(TDH_MNG_KEY_FREEID, err, NULL);
>  		pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
>  		       kvm_tdx->hkid);
> +		ret = -EIO;
>  	} else
>  		tdx_hkid_free(kvm_tdx);
>  
> +out:
>  	write_unlock(&kvm->mmu_lock);
>  	mutex_unlock(&tdx_lock);
>  	cpus_read_unlock();
>  	free_cpumask_var(targets);
>  	free_cpumask_var(packages);
> +
> +	return ret;
> +}
> +
> +void tdx_mmu_release_hkid(struct kvm *kvm)
> +{
> +	while (__tdx_mmu_release_hkid(kvm) == -EBUSY)
> +		;
>  }

As I understand, __tdx_mmu_release_hkid() returns -EBUSY
after TDH.VP.FLUSH has been sent for every vCPU followed by
TDH.MNG.VPFLUSHDONE, which returns TDX_FLUSHVP_NOT_DONE.

Considering earlier comment that a retry of TDH.VP.FLUSH is not
needed, why is this while() loop here that sends the
TDH.VP.FLUSH again to all vCPUs instead of just a loop within
__tdx_mmu_release_hkid() to _just_ resend TDH.MNG.VPFLUSHDONE?

Could it be possible for a vCPU to appear during this time, thus
be missed in one TDH.VP.FLUSH cycle, to require a new cycle of
TDH.VP.FLUSH?

I note that TDX_FLUSHVP_NOT_DONE is distinct from TDX_OPERAND_BUSY
that can also be returned from TDH.MNG.VPFLUSHDONE and
wonder if a retry may be needed in that case also/instead? It looks like
TDH.MNG.VPFLUSHDONE needs exclusive access to all operands and I
do not know enough yet if this is the case here.

Reinette
Isaku Yamahata April 12, 2024, 9:42 p.m. UTC | #3
On Fri, Apr 12, 2024 at 09:15:29AM -0700,
Reinette Chatre <reinette.chatre@intel.com> wrote:

> Hi Isaku,
> 
> On 2/26/2024 12:26 AM, isaku.yamahata@intel.com wrote:
> > From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> ...
> 
> > @@ -218,6 +257,87 @@ static void tdx_reclaim_control_page(unsigned long td_page_pa)
> >  	free_page((unsigned long)__va(td_page_pa));
> >  }
> >  
> > +struct tdx_flush_vp_arg {
> > +	struct kvm_vcpu *vcpu;
> > +	u64 err;
> > +};
> > +
> > +static void tdx_flush_vp(void *arg_)
> > +{
> > +	struct tdx_flush_vp_arg *arg = arg_;
> > +	struct kvm_vcpu *vcpu = arg->vcpu;
> > +	u64 err;
> > +
> > +	arg->err = 0;
> > +	lockdep_assert_irqs_disabled();
> > +
> > +	/* Task migration can race with CPU offlining. */
> > +	if (unlikely(vcpu->cpu != raw_smp_processor_id()))
> > +		return;
> > +
> > +	/*
> > +	 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized.  The
> > +	 * list tracking still needs to be updated so that it's correct if/when
> > +	 * the vCPU does get initialized.
> > +	 */
> > +	if (is_td_vcpu_created(to_tdx(vcpu))) {
> > +		/*
> > +		 * No need to retry.  TDX Resources needed for TDH.VP.FLUSH are,
> > +		 * TDVPR as exclusive, TDR as shared, and TDCS as shared.  This
> > +		 * vp flush function is called when destructing vcpu/TD or vcpu
> > +		 * migration.  No other thread uses TDVPR in those cases.
> > +		 */
> 
> (I have comment later that refer back to this comment about needing retry.)
> 
> ...
> 
> > @@ -233,26 +353,31 @@ static void tdx_do_tdh_phymem_cache_wb(void *unused)
> >  		pr_tdx_error(TDH_PHYMEM_CACHE_WB, err, NULL);
> >  }
> >  
> > -void tdx_mmu_release_hkid(struct kvm *kvm)
> > +static int __tdx_mmu_release_hkid(struct kvm *kvm)
> >  {
> >  	bool packages_allocated, targets_allocated;
> >  	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> >  	cpumask_var_t packages, targets;
> > +	struct kvm_vcpu *vcpu;
> > +	unsigned long j;
> > +	int i, ret = 0;
> >  	u64 err;
> > -	int i;
> >  
> >  	if (!is_hkid_assigned(kvm_tdx))
> > -		return;
> > +		return 0;
> >  
> >  	if (!is_td_created(kvm_tdx)) {
> >  		tdx_hkid_free(kvm_tdx);
> > -		return;
> > +		return 0;
> >  	}
> >  
> >  	packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
> >  	targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
> >  	cpus_read_lock();
> >  
> > +	kvm_for_each_vcpu(j, vcpu, kvm)
> > +		tdx_flush_vp_on_cpu(vcpu);
> > +
> >  	/*
> >  	 * We can destroy multiple guest TDs simultaneously.  Prevent
> >  	 * tdh_phymem_cache_wb from returning TDX_BUSY by serialization.
> > @@ -270,6 +395,19 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
> >  	 */
> >  	write_lock(&kvm->mmu_lock);
> >  
> > +	err = tdh_mng_vpflushdone(kvm_tdx->tdr_pa);
> > +	if (err == TDX_FLUSHVP_NOT_DONE) {
> > +		ret = -EBUSY;
> > +		goto out;
> > +	}
> > +	if (WARN_ON_ONCE(err)) {
> > +		pr_tdx_error(TDH_MNG_VPFLUSHDONE, err, NULL);
> > +		pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
> > +		       kvm_tdx->hkid);
> > +		ret = -EIO;
> > +		goto out;
> > +	}
> > +
> >  	for_each_online_cpu(i) {
> >  		if (packages_allocated &&
> >  		    cpumask_test_and_set_cpu(topology_physical_package_id(i),
> > @@ -291,14 +429,24 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
> >  		pr_tdx_error(TDH_MNG_KEY_FREEID, err, NULL);
> >  		pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
> >  		       kvm_tdx->hkid);
> > +		ret = -EIO;
> >  	} else
> >  		tdx_hkid_free(kvm_tdx);
> >  
> > +out:
> >  	write_unlock(&kvm->mmu_lock);
> >  	mutex_unlock(&tdx_lock);
> >  	cpus_read_unlock();
> >  	free_cpumask_var(targets);
> >  	free_cpumask_var(packages);
> > +
> > +	return ret;
> > +}
> > +
> > +void tdx_mmu_release_hkid(struct kvm *kvm)
> > +{
> > +	while (__tdx_mmu_release_hkid(kvm) == -EBUSY)
> > +		;
> >  }
> 
> As I understand, __tdx_mmu_release_hkid() returns -EBUSY
> after TDH.VP.FLUSH has been sent for every vCPU followed by
> TDH.MNG.VPFLUSHDONE, which returns TDX_FLUSHVP_NOT_DONE.
> 
> Considering earlier comment that a retry of TDH.VP.FLUSH is not
> needed, why is this while() loop here that sends the
> TDH.VP.FLUSH again to all vCPUs instead of just a loop within
> __tdx_mmu_release_hkid() to _just_ resend TDH.MNG.VPFLUSHDONE?
> 
> Could it be possible for a vCPU to appear during this time, thus
> be missed in one TDH.VP.FLUSH cycle, to require a new cycle of
> TDH.VP.FLUSH?

Yes. There is a race between closing KVM vCPU fd and MMU notifier release hook.
When KVM vCPU fd is closed, vCPU context can be loaded again.  The MMU notifier
release hook eventually calls tdx_mmu_release_hkid().  Another kernel thread
(concretely, a vhost kernel thread) can get a reference count on the mmu and put it
from a timer, so the MMU notifier release hook can be triggered while the vCPU fd is
being closed.

The possible alternative is to complicate the vCPU closing path so that it does not
load the vCPU context, instead of sending an IPI on every retry.


> I note that TDX_FLUSHVP_NOT_DONE is distinct from TDX_OPERAND_BUSY
> that can also be returned from TDH.MNG.VPFLUSHDONE and
> wonder if a retry may be needed in that case also/instead? It looks like
> TDH.MNG.VPFLUSHDONE needs exclusive access to all operands and I
> do not know enough yet if this is the case here.

Because we're destroying the guest and hold mmu_lock, we shouldn't have another
thread racing.  Probably we can simply retry on TDX_OPERAND_BUSY without
worrying about a race.  It would be more robust and easier to understand.
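
E.g., something like this untested sketch on top of __tdx_mmu_release_hkid()
(illustrative only, not the final code):

	do {
		/*
		 * TDX_OPERAND_BUSY means another thread transiently holds a
		 * TDX-module lock on TDR/TDCS; simply retry the SEAMCALL.
		 */
		err = tdh_mng_vpflushdone(kvm_tdx->tdr_pa);
	} while (unlikely(err == TDX_OPERAND_BUSY));
	if (err == TDX_FLUSHVP_NOT_DONE) {
		ret = -EBUSY;
		goto out;
	}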
Sean Christopherson April 12, 2024, 10:46 p.m. UTC | #4
On Fri, Apr 12, 2024, Isaku Yamahata wrote:
> On Fri, Apr 12, 2024 at 09:15:29AM -0700, Reinette Chatre <reinette.chatre@intel.com> wrote:
> > > +void tdx_mmu_release_hkid(struct kvm *kvm)
> > > +{
> > > +	while (__tdx_mmu_release_hkid(kvm) == -EBUSY)
> > > +		;
> > >  }
> > 
> > As I understand, __tdx_mmu_release_hkid() returns -EBUSY
> > after TDH.VP.FLUSH has been sent for every vCPU followed by
> > TDH.MNG.VPFLUSHDONE, which returns TDX_FLUSHVP_NOT_DONE.
> > 
> > Considering earlier comment that a retry of TDH.VP.FLUSH is not
> > needed, why is this while() loop here that sends the
> > TDH.VP.FLUSH again to all vCPUs instead of just a loop within
> > __tdx_mmu_release_hkid() to _just_ resend TDH.MNG.VPFLUSHDONE?
> > 
> > Could it be possible for a vCPU to appear during this time, thus
> > be missed in one TDH.VP.FLUSH cycle, to require a new cycle of
> > TDH.VP.FLUSH?
> 
> Yes. There is a race between closing KVM vCPU fd and MMU notifier release hook.
> When KVM vCPU fd is closed, vCPU context can be loaded again.

But why is _loading_ a vCPU context problematic?  If I'm reading the TDX module
code correctly, TDX_FLUSHVP_NOT_DONE is returned when a vCPU is "associated" with
a pCPU, and association only happens during TDH.VP.ENTER, TDH.MNG.RD, and TDH.MNG.WR,
none of which I see in tdx_vcpu_load().

Assuming there is something problematic lurking under vcpu_load(), I would love,
love, LOVE an excuse to not do vcpu_{load,put}() in kvm_unload_vcpu_mmu(), i.e.
get rid of that thing entirely.

I have definitely looked into kvm_unload_vcpu_mmu() on more than one occasion,
but I can't remember off the top of my head why I have never yanked out the
vcpu_{load,put}().  Maybe I was just scared of breaking something and didn't have
a good reason to risk breakage?
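
For reference (paraphrasing from memory, so double-check the actual code), the
helper is basically just:

	static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
	{
		vcpu_load(vcpu);
		kvm_mmu_unload(vcpu);
		vcpu_put(vcpu);
	}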

> The MMU notifier release hook eventually calls tdx_mmu_release_hkid().  Other
> kernel thread (concretely, vhost krenel thread) can get reference count to
> mmu and put it by timer, the MMU notifier release hook can be triggered
> during closing vCPU fd.
> 
> The possible alternative is to make the vCPU closing path complicated not to
> load vCPU context instead f sending IPI on every retry.
Isaku Yamahata April 13, 2024, 12:40 a.m. UTC | #5
On Fri, Apr 12, 2024 at 03:46:05PM -0700,
Sean Christopherson <seanjc@google.com> wrote:

> On Fri, Apr 12, 2024, Isaku Yamahata wrote:
> > On Fri, Apr 12, 2024 at 09:15:29AM -0700, Reinette Chatre <reinette.chatre@intel.com> wrote:
> > > > +void tdx_mmu_release_hkid(struct kvm *kvm)
> > > > +{
> > > > +	while (__tdx_mmu_release_hkid(kvm) == -EBUSY)
> > > > +		;
> > > >  }
> > > 
> > > As I understand, __tdx_mmu_release_hkid() returns -EBUSY
> > > after TDH.VP.FLUSH has been sent for every vCPU followed by
> > > TDH.MNG.VPFLUSHDONE, which returns TDX_FLUSHVP_NOT_DONE.
> > > 
> > > Considering earlier comment that a retry of TDH.VP.FLUSH is not
> > > needed, why is this while() loop here that sends the
> > > TDH.VP.FLUSH again to all vCPUs instead of just a loop within
> > > __tdx_mmu_release_hkid() to _just_ resend TDH.MNG.VPFLUSHDONE?
> > > 
> > > Could it be possible for a vCPU to appear during this time, thus
> > > be missed in one TDH.VP.FLUSH cycle, to require a new cycle of
> > > TDH.VP.FLUSH?
> > 
> > Yes. There is a race between closing KVM vCPU fd and MMU notifier release hook.
> > When KVM vCPU fd is closed, vCPU context can be loaded again.
> 
> But why is _loading_ a vCPU context problematic?

It's not problematic per se.  It just becomes a bit harder to understand why
tdx_mmu_release_hkid() issues an IPI on each loop.  I think it's reasonable
to make the normal path easy and to complicate/penalize the destruction path.
Probably I should've added a comment on the function.
Sean Christopherson April 15, 2024, 1:49 p.m. UTC | #6
On Fri, Apr 12, 2024, Isaku Yamahata wrote:
> On Fri, Apr 12, 2024 at 03:46:05PM -0700,
> Sean Christopherson <seanjc@google.com> wrote:
> 
> > On Fri, Apr 12, 2024, Isaku Yamahata wrote:
> > > On Fri, Apr 12, 2024 at 09:15:29AM -0700, Reinette Chatre <reinette.chatre@intel.com> wrote:
> > > > > +void tdx_mmu_release_hkid(struct kvm *kvm)
> > > > > +{
> > > > > +	while (__tdx_mmu_release_hkid(kvm) == -EBUSY)
> > > > > +		;
> > > > >  }
> > > > 
> > > > As I understand, __tdx_mmu_release_hkid() returns -EBUSY
> > > > after TDH.VP.FLUSH has been sent for every vCPU followed by
> > > > TDH.MNG.VPFLUSHDONE, which returns TDX_FLUSHVP_NOT_DONE.
> > > > 
> > > > Considering earlier comment that a retry of TDH.VP.FLUSH is not
> > > > needed, why is this while() loop here that sends the
> > > > TDH.VP.FLUSH again to all vCPUs instead of just a loop within
> > > > __tdx_mmu_release_hkid() to _just_ resend TDH.MNG.VPFLUSHDONE?
> > > > 
> > > > Could it be possible for a vCPU to appear during this time, thus
> > > > be missed in one TDH.VP.FLUSH cycle, to require a new cycle of
> > > > TDH.VP.FLUSH?
> > > 
> > > Yes. There is a race between closing KVM vCPU fd and MMU notifier release hook.
> > > When KVM vCPU fd is closed, vCPU context can be loaded again.
> > 
> > But why is _loading_ a vCPU context problematic?
> 
> It's nothing problematic.  It becomes a bit harder to understand why
> tdx_mmu_release_hkid() issues IPI on each loop.  I think it's reasonable
> to make the normal path easy and to complicate/penalize the destruction path.
> Probably I should've added comment on the function.

By "problematic", I meant, why can that result in a "missed in one TDH.VP.FLUSH
cycle"?  AFAICT, loading a vCPU shouldn't cause that vCPU to be associated from
the TDX module's perspective, and thus shouldn't trigger TDX_FLUSHVP_NOT_DONE.

I.e. looping should be unnecessary, no?
Isaku Yamahata April 15, 2024, 10:48 p.m. UTC | #7
On Mon, Apr 15, 2024 at 06:49:35AM -0700,
Sean Christopherson <seanjc@google.com> wrote:

> On Fri, Apr 12, 2024, Isaku Yamahata wrote:
> > On Fri, Apr 12, 2024 at 03:46:05PM -0700,
> > Sean Christopherson <seanjc@google.com> wrote:
> > 
> > > On Fri, Apr 12, 2024, Isaku Yamahata wrote:
> > > > On Fri, Apr 12, 2024 at 09:15:29AM -0700, Reinette Chatre <reinette.chatre@intel.com> wrote:
> > > > > > +void tdx_mmu_release_hkid(struct kvm *kvm)
> > > > > > +{
> > > > > > +	while (__tdx_mmu_release_hkid(kvm) == -EBUSY)
> > > > > > +		;
> > > > > >  }
> > > > > 
> > > > > As I understand, __tdx_mmu_release_hkid() returns -EBUSY
> > > > > after TDH.VP.FLUSH has been sent for every vCPU followed by
> > > > > TDH.MNG.VPFLUSHDONE, which returns TDX_FLUSHVP_NOT_DONE.
> > > > > 
> > > > > Considering earlier comment that a retry of TDH.VP.FLUSH is not
> > > > > needed, why is this while() loop here that sends the
> > > > > TDH.VP.FLUSH again to all vCPUs instead of just a loop within
> > > > > __tdx_mmu_release_hkid() to _just_ resend TDH.MNG.VPFLUSHDONE?
> > > > > 
> > > > > Could it be possible for a vCPU to appear during this time, thus
> > > > > be missed in one TDH.VP.FLUSH cycle, to require a new cycle of
> > > > > TDH.VP.FLUSH?
> > > > 
> > > > Yes. There is a race between closing KVM vCPU fd and MMU notifier release hook.
> > > > When KVM vCPU fd is closed, vCPU context can be loaded again.
> > > 
> > > But why is _loading_ a vCPU context problematic?
> > 
> > It's nothing problematic.  It becomes a bit harder to understand why
> > tdx_mmu_release_hkid() issues IPI on each loop.  I think it's reasonable
> > to make the normal path easy and to complicate/penalize the destruction path.
> > Probably I should've added comment on the function.
> 
> By "problematic", I meant, why can that result in a "missed in one TDH.VP.FLUSH
> cycle"?  AFAICT, loading a vCPU shouldn't cause that vCPU to be associated from
> the TDX module's perspective, and thus shouldn't trigger TDX_FLUSHVP_NOT_DONE.
> 
> I.e. looping should be unnecessary, no?

The loop is unnecessary with the current code.

The possible future optimization is to somehow reduce the destruction time of the
Secure-EPT.  One possible option is to release the HKID while vCPUs are still alive
and destruct the Secure-EPT with multiple vCPU contexts.  Because that's a future
optimization, we can ignore it at this phase.
Huang, Kai April 16, 2024, 12:05 a.m. UTC | #8
On 16/04/2024 10:48 am, Yamahata, Isaku wrote:
> On Mon, Apr 15, 2024 at 06:49:35AM -0700,
> Sean Christopherson <seanjc@google.com> wrote:
> 
>> On Fri, Apr 12, 2024, Isaku Yamahata wrote:
>>> On Fri, Apr 12, 2024 at 03:46:05PM -0700,
>>> Sean Christopherson <seanjc@google.com> wrote:
>>>
>>>> On Fri, Apr 12, 2024, Isaku Yamahata wrote:
>>>>> On Fri, Apr 12, 2024 at 09:15:29AM -0700, Reinette Chatre <reinette.chatre@intel.com> wrote:
>>>>>>> +void tdx_mmu_release_hkid(struct kvm *kvm)
>>>>>>> +{
>>>>>>> +	while (__tdx_mmu_release_hkid(kvm) == -EBUSY)
>>>>>>> +		;
>>>>>>>   }
>>>>>>
>>>>>> As I understand, __tdx_mmu_release_hkid() returns -EBUSY
>>>>>> after TDH.VP.FLUSH has been sent for every vCPU followed by
>>>>>> TDH.MNG.VPFLUSHDONE, which returns TDX_FLUSHVP_NOT_DONE.
>>>>>>
>>>>>> Considering earlier comment that a retry of TDH.VP.FLUSH is not
>>>>>> needed, why is this while() loop here that sends the
>>>>>> TDH.VP.FLUSH again to all vCPUs instead of just a loop within
>>>>>> __tdx_mmu_release_hkid() to _just_ resend TDH.MNG.VPFLUSHDONE?
>>>>>>
>>>>>> Could it be possible for a vCPU to appear during this time, thus
>>>>>> be missed in one TDH.VP.FLUSH cycle, to require a new cycle of
>>>>>> TDH.VP.FLUSH?
>>>>>
>>>>> Yes. There is a race between closing KVM vCPU fd and MMU notifier release hook.
>>>>> When KVM vCPU fd is closed, vCPU context can be loaded again.
>>>>
>>>> But why is _loading_ a vCPU context problematic?
>>>
>>> It's nothing problematic.  It becomes a bit harder to understand why
>>> tdx_mmu_release_hkid() issues IPI on each loop.  I think it's reasonable
>>> to make the normal path easy and to complicate/penalize the destruction path.
>>> Probably I should've added comment on the function.
>>
>> By "problematic", I meant, why can that result in a "missed in one TDH.VP.FLUSH
>> cycle"?  AFAICT, loading a vCPU shouldn't cause that vCPU to be associated from
>> the TDX module's perspective, and thus shouldn't trigger TDX_FLUSHVP_NOT_DONE.
>>
>> I.e. looping should be unnecessary, no?
> 
> The loop is unnecessary with the current code.
> 
> The possible future optimization is to reduce destruction time of Secure-EPT
> somehow.  One possible option is to release HKID while vCPUs are still alive and
> destruct Secure-EPT with multiple vCPU context.  Because that's future
> optimization, we can ignore it at this phase.

I'm kinda lost here.

I thought in the current v19 code, you have already implemented this 
optimization?

Or is this optimization totally different from what we discussed in an 
earlier patch?

https://lore.kernel.org/lkml/8feaba8f8ef249950b629f3a8300ddfb4fbcf11c.camel@intel.com/
Isaku Yamahata April 16, 2024, 4:44 p.m. UTC | #9
On Tue, Apr 16, 2024 at 12:05:31PM +1200,
"Huang, Kai" <kai.huang@intel.com> wrote:

> 
> 
> On 16/04/2024 10:48 am, Yamahata, Isaku wrote:
> > On Mon, Apr 15, 2024 at 06:49:35AM -0700,
> > Sean Christopherson <seanjc@google.com> wrote:
> > 
> > > On Fri, Apr 12, 2024, Isaku Yamahata wrote:
> > > > On Fri, Apr 12, 2024 at 03:46:05PM -0700,
> > > > Sean Christopherson <seanjc@google.com> wrote:
> > > > 
> > > > > On Fri, Apr 12, 2024, Isaku Yamahata wrote:
> > > > > > On Fri, Apr 12, 2024 at 09:15:29AM -0700, Reinette Chatre <reinette.chatre@intel.com> wrote:
> > > > > > > > +void tdx_mmu_release_hkid(struct kvm *kvm)
> > > > > > > > +{
> > > > > > > > +	while (__tdx_mmu_release_hkid(kvm) == -EBUSY)
> > > > > > > > +		;
> > > > > > > >   }
> > > > > > > 
> > > > > > > As I understand, __tdx_mmu_release_hkid() returns -EBUSY
> > > > > > > after TDH.VP.FLUSH has been sent for every vCPU followed by
> > > > > > > TDH.MNG.VPFLUSHDONE, which returns TDX_FLUSHVP_NOT_DONE.
> > > > > > > 
> > > > > > > Considering earlier comment that a retry of TDH.VP.FLUSH is not
> > > > > > > needed, why is this while() loop here that sends the
> > > > > > > TDH.VP.FLUSH again to all vCPUs instead of just a loop within
> > > > > > > __tdx_mmu_release_hkid() to _just_ resend TDH.MNG.VPFLUSHDONE?
> > > > > > > 
> > > > > > > Could it be possible for a vCPU to appear during this time, thus
> > > > > > > be missed in one TDH.VP.FLUSH cycle, to require a new cycle of
> > > > > > > TDH.VP.FLUSH?
> > > > > > 
> > > > > > Yes. There is a race between closing KVM vCPU fd and MMU notifier release hook.
> > > > > > When KVM vCPU fd is closed, vCPU context can be loaded again.
> > > > > 
> > > > > But why is _loading_ a vCPU context problematic?
> > > > 
> > > > It's nothing problematic.  It becomes a bit harder to understand why
> > > > tdx_mmu_release_hkid() issues IPI on each loop.  I think it's reasonable
> > > > to make the normal path easy and to complicate/penalize the destruction path.
> > > > Probably I should've added comment on the function.
> > > 
> > > By "problematic", I meant, why can that result in a "missed in one TDH.VP.FLUSH
> > > cycle"?  AFAICT, loading a vCPU shouldn't cause that vCPU to be associated from
> > > the TDX module's perspective, and thus shouldn't trigger TDX_FLUSHVP_NOT_DONE.
> > > 
> > > I.e. looping should be unnecessary, no?
> > 
> > The loop is unnecessary with the current code.
> > 
> > The possible future optimization is to reduce destruction time of Secure-EPT
> > somehow.  One possible option is to release HKID while vCPUs are still alive and
> > destruct Secure-EPT with multiple vCPU context.  Because that's future
> > optimization, we can ignore it at this phase.
> 
> I kinda lost here.
> 
> I thought in the current v19 code, you have already implemented this
> optimization?
> 
> Or is this optimization totally different from what we discussed in an
> earlier patch?
> 
> https://lore.kernel.org/lkml/8feaba8f8ef249950b629f3a8300ddfb4fbcf11c.camel@intel.com/

That's only the first step.  We can optimize it further with multiple vCPU
contexts.
Huang, Kai April 18, 2024, 1:09 a.m. UTC | #10
On 17/04/2024 4:44 am, Isaku Yamahata wrote:
> On Tue, Apr 16, 2024 at 12:05:31PM +1200,
> "Huang, Kai" <kai.huang@intel.com> wrote:
> 
>>
>>
>> On 16/04/2024 10:48 am, Yamahata, Isaku wrote:
>>> On Mon, Apr 15, 2024 at 06:49:35AM -0700,
>>> Sean Christopherson <seanjc@google.com> wrote:
>>>
>>>> On Fri, Apr 12, 2024, Isaku Yamahata wrote:
>>>>> On Fri, Apr 12, 2024 at 03:46:05PM -0700,
>>>>> Sean Christopherson <seanjc@google.com> wrote:
>>>>>
>>>>>> On Fri, Apr 12, 2024, Isaku Yamahata wrote:
>>>>>>> On Fri, Apr 12, 2024 at 09:15:29AM -0700, Reinette Chatre <reinette.chatre@intel.com> wrote:
>>>>>>>>> +void tdx_mmu_release_hkid(struct kvm *kvm)
>>>>>>>>> +{
>>>>>>>>> +	while (__tdx_mmu_release_hkid(kvm) == -EBUSY)
>>>>>>>>> +		;
>>>>>>>>>    }
>>>>>>>>
>>>>>>>> As I understand, __tdx_mmu_release_hkid() returns -EBUSY
>>>>>>>> after TDH.VP.FLUSH has been sent for every vCPU followed by
>>>>>>>> TDH.MNG.VPFLUSHDONE, which returns TDX_FLUSHVP_NOT_DONE.
>>>>>>>>
>>>>>>>> Considering earlier comment that a retry of TDH.VP.FLUSH is not
>>>>>>>> needed, why is this while() loop here that sends the
>>>>>>>> TDH.VP.FLUSH again to all vCPUs instead of just a loop within
>>>>>>>> __tdx_mmu_release_hkid() to _just_ resend TDH.MNG.VPFLUSHDONE?
>>>>>>>>
>>>>>>>> Could it be possible for a vCPU to appear during this time, thus
>>>>>>>> be missed in one TDH.VP.FLUSH cycle, to require a new cycle of
>>>>>>>> TDH.VP.FLUSH?
>>>>>>>
>>>>>>> Yes. There is a race between closing KVM vCPU fd and MMU notifier release hook.
>>>>>>> When KVM vCPU fd is closed, vCPU context can be loaded again.
>>>>>>
>>>>>> But why is _loading_ a vCPU context problematic?
>>>>>
>>>>> It's nothing problematic.  It becomes a bit harder to understand why
>>>>> tdx_mmu_release_hkid() issues IPI on each loop.  I think it's reasonable
>>>>> to make the normal path easy and to complicate/penalize the destruction path.
>>>>> Probably I should've added comment on the function.
>>>>
>>>> By "problematic", I meant, why can that result in a "missed in one TDH.VP.FLUSH
>>>> cycle"?  AFAICT, loading a vCPU shouldn't cause that vCPU to be associated from
>>>> the TDX module's perspective, and thus shouldn't trigger TDX_FLUSHVP_NOT_DONE.
>>>>
>>>> I.e. looping should be unnecessary, no?
>>>
>>> The loop is unnecessary with the current code.
>>>
>>> The possible future optimization is to reduce destruction time of Secure-EPT
>>> somehow.  One possible option is to release HKID while vCPUs are still alive and
>>> destruct Secure-EPT with multiple vCPU context.  Because that's future
>>> optimization, we can ignore it at this phase.
>>
>> I kinda lost here.
>>
>> I thought in the current v19 code, you have already implemented this
>> optimization?
>>
>> Or is this optimization totally different from what we discussed in an
>> earlier patch?
>>
>> https://lore.kernel.org/lkml/8feaba8f8ef249950b629f3a8300ddfb4fbcf11c.camel@intel.com/
> 
> That's only the first step.  We can optimize it further with multiple vCPUs
> context.

OK.  Let's put aside how important these optimizations are and whether 
they should be done in the initial TDX support.  I think the right way to 
organize the patches is to bring in functionality first and then put 
performance optimization later.

That can make both writing code and code review easier.

And more importantly, the "performance optimization" can be discussed 
_separately_.

For example, as mentioned in the link above, I think the optimization of 
"releasing HKID in the MMU notifier release to improve TD teardown 
latency" complicates things a lot, e.g., not only to the TD 
creation/teardown sequence, but also here -- w/o it we don't even need 
to consider the race between vCPU load and MMU notifier release:

https://lore.kernel.org/kvm/20240412214201.GO3039520@ls.amr.corp.intel.com/

So I think we should start with implementing these sequences in the "normal 
way" first, and then do the optimization(s) later.

And to me, for the "normal way" of TD creation/destruction we can just do:

1) Use normal SEPT sequence to teardown private EPT page table;
2) Do VP.FLUSH when vCPU is destroyed
3) Do VPFLUSHDONE after all vCPUs are destroyed
4) release HKID at last stage of destroying VM.
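
Roughly, as an illustrative sketch of that ordering, reusing the helpers from 
this series (not meant as the final code):

	/* 1) S-EPT pages are torn down via the normal path, no HKID games */

	/* 2) on vCPU destruction: flush the vCPU on its last pCPU */
	tdx_flush_vp_on_cpu(vcpu);

	/* 3) after all vCPUs are destroyed, in the VM destruction path */
	err = tdh_mng_vpflushdone(kvm_tdx->tdr_pa);
	WARN_ON_ONCE(err);	/* no vCPU can be associated any more */

	/* 4) then reclaim the key as __tdx_mmu_release_hkid() does today:
	 *    TDH.PHYMEM.CACHE.WB, TDH.MNG.KEY.FREEID, tdx_hkid_free()
	 */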

For vCPU migration, you do VP.FLUSH on the old pCPU before you load the 
vCPU to the new pCPU, as shown in this patch.

Then you don't need to cover the silly code change around 
tdx_mmu_release_hkid() in this patch.

Am I missing anything?
Binbin Wu April 23, 2024, 12:13 p.m. UTC | #11
On 4/13/2024 12:15 AM, Reinette Chatre wrote:
> Hi Isaku,
>
> On 2/26/2024 12:26 AM, isaku.yamahata@intel.com wrote:
>> From: Isaku Yamahata <isaku.yamahata@intel.com>
> ...
>
>> @@ -218,6 +257,87 @@ static void tdx_reclaim_control_page(unsigned long td_page_pa)
>>   	free_page((unsigned long)__va(td_page_pa));
>>   }
>>   
>> +struct tdx_flush_vp_arg {
>> +	struct kvm_vcpu *vcpu;
>> +	u64 err;
>> +};
>> +
>> +static void tdx_flush_vp(void *arg_)
>> +{
>> +	struct tdx_flush_vp_arg *arg = arg_;
>> +	struct kvm_vcpu *vcpu = arg->vcpu;
>> +	u64 err;
>> +
>> +	arg->err = 0;
>> +	lockdep_assert_irqs_disabled();
>> +
>> +	/* Task migration can race with CPU offlining. */
>> +	if (unlikely(vcpu->cpu != raw_smp_processor_id()))
>> +		return;
>> +
>> +	/*
>> +	 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized.  The
>> +	 * list tracking still needs to be updated so that it's correct if/when
>> +	 * the vCPU does get initialized.
>> +	 */
>> +	if (is_td_vcpu_created(to_tdx(vcpu))) {
>> +		/*
>> +		 * No need to retry.  TDX Resources needed for TDH.VP.FLUSH are,
>> +		 * TDVPR as exclusive, TDR as shared, and TDCS as shared.  This
>> +		 * vp flush function is called when destructing vcpu/TD or vcpu
>> +		 * migration.  No other thread uses TDVPR in those cases.
>> +		 */
Is it possible that another thread uses TDR or TDCS as exclusive?

Isaku Yamahata May 6, 2024, 7:02 p.m. UTC | #12
On Tue, Apr 23, 2024 at 08:13:25PM +0800,
Binbin Wu <binbin.wu@linux.intel.com> wrote:

> On 4/13/2024 12:15 AM, Reinette Chatre wrote:
> > Hi Isaku,
> > 
> > On 2/26/2024 12:26 AM, isaku.yamahata@intel.com wrote:
> > > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > ...
> > 
> > > @@ -218,6 +257,87 @@ static void tdx_reclaim_control_page(unsigned long td_page_pa)
> > >   	free_page((unsigned long)__va(td_page_pa));
> > >   }
> > > +struct tdx_flush_vp_arg {
> > > +	struct kvm_vcpu *vcpu;
> > > +	u64 err;
> > > +};
> > > +
> > > +static void tdx_flush_vp(void *arg_)
> > > +{
> > > +	struct tdx_flush_vp_arg *arg = arg_;
> > > +	struct kvm_vcpu *vcpu = arg->vcpu;
> > > +	u64 err;
> > > +
> > > +	arg->err = 0;
> > > +	lockdep_assert_irqs_disabled();
> > > +
> > > +	/* Task migration can race with CPU offlining. */
> > > +	if (unlikely(vcpu->cpu != raw_smp_processor_id()))
> > > +		return;
> > > +
> > > +	/*
> > > +	 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized.  The
> > > +	 * list tracking still needs to be updated so that it's correct if/when
> > > +	 * the vCPU does get initialized.
> > > +	 */
> > > +	if (is_td_vcpu_created(to_tdx(vcpu))) {
> > > +		/*
> > > +		 * No need to retry.  TDX Resources needed for TDH.VP.FLUSH are,
> > > +		 * TDVPR as exclusive, TDR as shared, and TDCS as shared.  This
> > > +		 * vp flush function is called when destructing vcpu/TD or vcpu
> > > +		 * migration.  No other thread uses TDVPR in those cases.
> > > +		 */
> Is it possible that other thread uses TDR or TDCS as exclusive?

The exclusive lock is taken only during guest creation or destruction.
TDH.MNG.{ADDCX, CREATE, INIT, KEY.CONFIG, KEY.FREEID, FLUSHDONE}()
TDH.MR.{EXTEND, FINALIZE}()
TDH.MEM.PAGE.ADD()
TDH.VP.{CREATE, INIT, ADDCX, FLUSH}()

During run time (while the vCPU can run), they are locked as shared.

Patch

diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 8275a242ce07..9b336c1a6508 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -33,6 +33,14 @@  static int vt_max_vcpus(struct kvm *kvm)
 static int vt_flush_remote_tlbs(struct kvm *kvm);
 #endif
 
+static void vt_hardware_disable(void)
+{
+	/* Note, TDX *and* VMX need to be disabled if TDX is enabled. */
+	if (enable_tdx)
+		tdx_hardware_disable();
+	vmx_hardware_disable();
+}
+
 static __init int vt_hardware_setup(void)
 {
 	int ret;
@@ -201,6 +209,16 @@  static fastpath_t vt_vcpu_run(struct kvm_vcpu *vcpu)
 	return vmx_vcpu_run(vcpu);
 }
 
+static void vt_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	if (is_td_vcpu(vcpu)) {
+		tdx_vcpu_load(vcpu, cpu);
+		return;
+	}
+
+	vmx_vcpu_load(vcpu, cpu);
+}
+
 static void vt_flush_tlb_all(struct kvm_vcpu *vcpu)
 {
 	if (is_td_vcpu(vcpu)) {
@@ -262,6 +280,14 @@  static void vt_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
 	vmx_load_mmu_pgd(vcpu, root_hpa, pgd_level);
 }
 
+static void vt_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+	if (is_td_vcpu(vcpu))
+		return;
+
+	vmx_sched_in(vcpu, cpu);
+}
+
 static u8 vt_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 {
 	if (is_td_vcpu(vcpu))
@@ -335,7 +361,7 @@  struct kvm_x86_ops vt_x86_ops __initdata = {
 
 	/* TDX cpu enablement is done by tdx_hardware_setup(). */
 	.hardware_enable = vmx_hardware_enable,
-	.hardware_disable = vmx_hardware_disable,
+	.hardware_disable = vt_hardware_disable,
 	.has_emulated_msr = vmx_has_emulated_msr,
 
 	.is_vm_type_supported = vt_is_vm_type_supported,
@@ -353,7 +379,7 @@  struct kvm_x86_ops vt_x86_ops __initdata = {
 	.vcpu_reset = vt_vcpu_reset,
 
 	.prepare_switch_to_guest = vt_prepare_switch_to_guest,
-	.vcpu_load = vmx_vcpu_load,
+	.vcpu_load = vt_vcpu_load,
 	.vcpu_put = vt_vcpu_put,
 
 	.update_exception_bitmap = vmx_update_exception_bitmap,
@@ -440,7 +466,7 @@  struct kvm_x86_ops vt_x86_ops __initdata = {
 
 	.request_immediate_exit = vmx_request_immediate_exit,
 
-	.sched_in = vmx_sched_in,
+	.sched_in = vt_sched_in,
 
 	.cpu_dirty_log_size = PML_ENTITY_NUM,
 	.update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index ad4d3d4eaf6c..7aa9188f384d 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -106,6 +106,14 @@  static DEFINE_MUTEX(tdx_lock);
 static struct mutex *tdx_mng_key_config_lock;
 static atomic_t nr_configured_hkid;
 
+/*
+ * A per-CPU list of TD vCPUs associated with a given CPU.  Used when a CPU
+ * is brought down to invoke TDH_VP_FLUSH on the approapriate TD vCPUS.
+ * Protected by interrupt mask.  This list is manipulated in process context
+ * of vcpu and IPI callback.  See tdx_flush_vp_on_cpu().
+ */
+static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
+
 static __always_inline hpa_t set_hkid_to_hpa(hpa_t pa, u16 hkid)
 {
 	return pa | ((hpa_t)hkid << boot_cpu_data.x86_phys_bits);
@@ -138,6 +146,37 @@  static inline bool is_td_finalized(struct kvm_tdx *kvm_tdx)
 	return kvm_tdx->finalized;
 }
 
+static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
+{
+	lockdep_assert_irqs_disabled();
+
+	list_del(&to_tdx(vcpu)->cpu_list);
+
+	/*
+	 * Ensure tdx->cpu_list is updated is before setting vcpu->cpu to -1,
+	 * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
+	 * to its list before its deleted from this CPUs list.
+	 */
+	smp_wmb();
+
+	vcpu->cpu = -1;
+}
+
+static void tdx_disassociate_vp_arg(void *vcpu)
+{
+	tdx_disassociate_vp(vcpu);
+}
+
+static void tdx_disassociate_vp_on_cpu(struct kvm_vcpu *vcpu)
+{
+	int cpu = vcpu->cpu;
+
+	if (unlikely(cpu == -1))
+		return;
+
+	smp_call_function_single(cpu, tdx_disassociate_vp_arg, vcpu, 1);
+}
+
 static void tdx_clear_page(unsigned long page_pa)
 {
 	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
@@ -218,6 +257,87 @@  static void tdx_reclaim_control_page(unsigned long td_page_pa)
 	free_page((unsigned long)__va(td_page_pa));
 }
 
+struct tdx_flush_vp_arg {
+	struct kvm_vcpu *vcpu;
+	u64 err;
+};
+
+static void tdx_flush_vp(void *arg_)
+{
+	struct tdx_flush_vp_arg *arg = arg_;
+	struct kvm_vcpu *vcpu = arg->vcpu;
+	u64 err;
+
+	arg->err = 0;
+	lockdep_assert_irqs_disabled();
+
+	/* Task migration can race with CPU offlining. */
+	if (unlikely(vcpu->cpu != raw_smp_processor_id()))
+		return;
+
+	/*
+	 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized.  The
+	 * list tracking still needs to be updated so that it's correct if/when
+	 * the vCPU does get initialized.
+	 */
+	if (is_td_vcpu_created(to_tdx(vcpu))) {
+		/*
+		 * No need to retry.  TDX Resources needed for TDH.VP.FLUSH are,
+		 * TDVPR as exclusive, TDR as shared, and TDCS as shared.  This
+		 * vp flush function is called when destructing vcpu/TD or vcpu
+		 * migration.  No other thread uses TDVPR in those cases.
+		 */
+		err = tdh_vp_flush(to_tdx(vcpu)->tdvpr_pa);
+		if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) {
+			/*
+			 * This function is called in IPI context. Do not use
+			 * printk to avoid console semaphore.
+			 * The caller prints out the error message, instead.
+			 */
+			if (err)
+				arg->err = err;
+		}
+	}
+
+	tdx_disassociate_vp(vcpu);
+}
+
+static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
+{
+	struct tdx_flush_vp_arg arg = {
+		.vcpu = vcpu,
+	};
+	int cpu = vcpu->cpu;
+
+	if (unlikely(cpu == -1))
+		return;
+
+	smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
+	if (WARN_ON_ONCE(arg.err)) {
+		pr_err("cpu: %d ", cpu);
+		pr_tdx_error(TDH_VP_FLUSH, arg.err, NULL);
+	}
+}
+
+void tdx_hardware_disable(void)
+{
+	int cpu = raw_smp_processor_id();
+	struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
+	struct tdx_flush_vp_arg arg;
+	struct vcpu_tdx *tdx, *tmp;
+	unsigned long flags;
+
+	lockdep_assert_preemption_disabled();
+
+	local_irq_save(flags);
+	/* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
+	list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) {
+		arg.vcpu = &tdx->vcpu;
+		tdx_flush_vp(&arg);
+	}
+	local_irq_restore(flags);
+}
+
 static void tdx_do_tdh_phymem_cache_wb(void *unused)
 {
 	u64 err = 0;
@@ -233,26 +353,31 @@  static void tdx_do_tdh_phymem_cache_wb(void *unused)
 		pr_tdx_error(TDH_PHYMEM_CACHE_WB, err, NULL);
 }
 
-void tdx_mmu_release_hkid(struct kvm *kvm)
+static int __tdx_mmu_release_hkid(struct kvm *kvm)
 {
 	bool packages_allocated, targets_allocated;
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
 	cpumask_var_t packages, targets;
+	struct kvm_vcpu *vcpu;
+	unsigned long j;
+	int i, ret = 0;
 	u64 err;
-	int i;
 
 	if (!is_hkid_assigned(kvm_tdx))
-		return;
+		return 0;
 
 	if (!is_td_created(kvm_tdx)) {
 		tdx_hkid_free(kvm_tdx);
-		return;
+		return 0;
 	}
 
 	packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
 	targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
 	cpus_read_lock();
 
+	kvm_for_each_vcpu(j, vcpu, kvm)
+		tdx_flush_vp_on_cpu(vcpu);
+
 	/*
 	 * We can destroy multiple guest TDs simultaneously.  Prevent
 	 * tdh_phymem_cache_wb from returning TDX_BUSY by serialization.
@@ -270,6 +395,19 @@  void tdx_mmu_release_hkid(struct kvm *kvm)
 	 */
 	write_lock(&kvm->mmu_lock);
 
+	err = tdh_mng_vpflushdone(kvm_tdx->tdr_pa);
+	if (err == TDX_FLUSHVP_NOT_DONE) {
+		ret = -EBUSY;
+		goto out;
+	}
+	if (WARN_ON_ONCE(err)) {
+		pr_tdx_error(TDH_MNG_VPFLUSHDONE, err, NULL);
+		pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
+		       kvm_tdx->hkid);
+		ret = -EIO;
+		goto out;
+	}
+
 	for_each_online_cpu(i) {
 		if (packages_allocated &&
 		    cpumask_test_and_set_cpu(topology_physical_package_id(i),
@@ -291,14 +429,24 @@  void tdx_mmu_release_hkid(struct kvm *kvm)
 		pr_tdx_error(TDH_MNG_KEY_FREEID, err, NULL);
 		pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
 		       kvm_tdx->hkid);
+		ret = -EIO;
 	} else
 		tdx_hkid_free(kvm_tdx);
 
+out:
 	write_unlock(&kvm->mmu_lock);
 	mutex_unlock(&tdx_lock);
 	cpus_read_unlock();
 	free_cpumask_var(targets);
 	free_cpumask_var(packages);
+
+	return ret;
+}
+
+void tdx_mmu_release_hkid(struct kvm *kvm)
+{
+	while (__tdx_mmu_release_hkid(kvm) == -EBUSY)
+		;
 }
 
 void tdx_vm_free(struct kvm *kvm)
@@ -455,6 +603,26 @@  int tdx_vcpu_create(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+	if (vcpu->cpu == cpu)
+		return;
+
+	tdx_flush_vp_on_cpu(vcpu);
+
+	local_irq_disable();
+	/*
+	 * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
+	 * vcpu->cpu is read before tdx->cpu_list.
+	 */
+	smp_rmb();
+
+	list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
+	local_irq_enable();
+}
+
 void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_tdx *tdx = to_tdx(vcpu);
@@ -495,6 +663,16 @@  void tdx_vcpu_free(struct kvm_vcpu *vcpu)
 	struct vcpu_tdx *tdx = to_tdx(vcpu);
 	int i;
 
+	/*
+	 * When destroying VM, kvm_unload_vcpu_mmu() calls vcpu_load() for every
+	 * vcpu after they already disassociated from the per cpu list by
+	 * tdx_mmu_release_hkid().  So we need to disassociate them again,
+	 * otherwise the freed vcpu data will be accessed when do
+	 * list_{del,add}() on associated_tdvcpus list later.
+	 */
+	tdx_disassociate_vp_on_cpu(vcpu);
+	WARN_ON_ONCE(vcpu->cpu != -1);
+
 	/*
 	 * This methods can be called when vcpu allocation/initialization
 	 * failed. So it's possible that hkid, tdvpx and tdvpr are not assigned
@@ -2030,6 +2208,10 @@  int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
 		return -EINVAL;
 	}
 
+	/* tdx_hardware_disable() uses associated_tdvcpus. */
+	for_each_possible_cpu(i)
+		INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));
+
 	for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
 		/*
 		 * Here it checks if MSRs (tdx_uret_msrs) can be saved/restored
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 0d8a98feb58e..7f8c78f06508 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -73,6 +73,8 @@  struct vcpu_tdx {
 	unsigned long *tdvpx_pa;
 	bool td_vcpu_created;
 
+	struct list_head cpu_list;
+
 	union tdx_exit_reason exit_reason;
 
 	bool initialized;
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index 9fd997c79c33..5853f29f0af3 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -137,6 +137,7 @@  void vmx_setup_mce(struct kvm_vcpu *vcpu);
 #ifdef CONFIG_INTEL_TDX_HOST
 int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops);
 void tdx_hardware_unsetup(void);
+void tdx_hardware_disable(void);
 bool tdx_is_vm_type_supported(unsigned long type);
 int tdx_offline_cpu(void);
 
@@ -153,6 +154,7 @@  void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
 fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu);
 void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
 void tdx_vcpu_put(struct kvm_vcpu *vcpu);
+void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 u8 tdx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
 
 int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
@@ -171,6 +173,7 @@  void tdx_post_memory_mapping(struct kvm_vcpu *vcpu,
 #else
 static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return -EOPNOTSUPP; }
 static inline void tdx_hardware_unsetup(void) {}
+static inline void tdx_hardware_disable(void) {}
 static inline bool tdx_is_vm_type_supported(unsigned long type) { return false; }
 static inline int tdx_offline_cpu(void) { return 0; }
 
@@ -190,6 +193,7 @@  static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {}
 static inline fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu) { return EXIT_FASTPATH_NONE; }
 static inline void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) {}
 static inline void tdx_vcpu_put(struct kvm_vcpu *vcpu) {}
+static inline void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) {}
 static inline u8 tdx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { return 0; }
 
 static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; }