
[v19,038/130] KVM: TDX: create/destroy VM structure

Message ID 7a508f88e8c8b5199da85b7a9959882ddf390796.1708933498.git.isaku.yamahata@intel.com (mailing list archive)
State New, archived
Series [v19,001/130] x86/virt/tdx: Rename _offset to _member for TD_SYSINFO_MAP() macro

Commit Message

Isaku Yamahata Feb. 26, 2024, 8:25 a.m. UTC
From: Isaku Yamahata <isaku.yamahata@intel.com>

As the first step to create a TDX guest, create/destroy the VM struct.  Assign
a TDX private Host Key ID (HKID) to the TDX guest for memory encryption and
allocate extra pages for the TDX guest.  On destruction, free the allocated
pages and the HKID.

Before tearing down private page tables, TDX requires some resources of the
guest TD to be destroyed (i.e. the HKID must have been reclaimed, etc.).  Add
mmu notifier release callback before tearing down private page tables for
that purpose.

Add a vm_free() kvm_x86_ops hook, called at the end of kvm_arch_destroy_vm(),
because some per-VM TDX resources, e.g. the TDR, need to be freed after other
TDX resources, e.g. the HKID, have been freed.

Co-developed-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>

---
v19:
- fix the error code check for TDH.PHYMEM.PAGE.RECLAIM (RCX and TDR operands).

v18:
- Use TDH.SYS.RD() instead of struct tdsysinfo_struct.
- Rename tdx_reclaim_td_page() to tdx_reclaim_control_page()
- return -EAGAIN on TDX_RND_NO_ENTROPY of TDH.MNG.CREATE(), TDH.MNG.ADDCX()
- fix comment to remove extra the.
- use true instead of 1 for boolean.
- remove an extra blank line.

v16:
- Simplified tdx_reclaim_page()
- Reorganize the locking of tdx_release_hkid(), and use on_each_cpu_mask()
  instead of smp_call_on_cpu() so that it can run with a spinlock held, to
  avoid racing with invalidation on guest memfd release

Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 arch/x86/include/asm/kvm-x86-ops.h |   2 +
 arch/x86/include/asm/kvm_host.h    |   2 +
 arch/x86/kvm/Kconfig               |   3 +-
 arch/x86/kvm/mmu/mmu.c             |   7 +
 arch/x86/kvm/vmx/main.c            |  26 +-
 arch/x86/kvm/vmx/tdx.c             | 475 ++++++++++++++++++++++++++++-
 arch/x86/kvm/vmx/tdx.h             |   6 +-
 arch/x86/kvm/vmx/x86_ops.h         |   6 +
 arch/x86/kvm/x86.c                 |   1 +
 9 files changed, 520 insertions(+), 8 deletions(-)

Comments

Chao Gao March 20, 2024, 5:12 a.m. UTC | #1
> config KVM_SW_PROTECTED_VM
> 	bool "Enable support for KVM software-protected VMs"
>-	depends on EXPERT
> 	depends on KVM && X86_64
> 	select KVM_GENERIC_PRIVATE_MEM
> 	help
>@@ -89,6 +88,8 @@ config KVM_SW_PROTECTED_VM
> config KVM_INTEL
> 	tristate "KVM for Intel (and compatible) processors support"
> 	depends on KVM && IA32_FEAT_CTL
>+	select KVM_SW_PROTECTED_VM if INTEL_TDX_HOST

why does INTEL_TDX_HOST select KVM_SW_PROTECTED_VM?

>+	select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST
> 	help
> 	.vcpu_precreate = vmx_vcpu_precreate,
> 	.vcpu_create = vmx_vcpu_create,

>--- a/arch/x86/kvm/vmx/tdx.c
>+++ b/arch/x86/kvm/vmx/tdx.c
>@@ -5,10 +5,11 @@
> 
> #include "capabilities.h"
> #include "x86_ops.h"
>-#include "x86.h"
> #include "mmu.h"
> #include "tdx_arch.h"
> #include "tdx.h"
>+#include "tdx_ops.h"
>+#include "x86.h"

any reason to reorder x86.h?

>+static void tdx_do_tdh_phymem_cache_wb(void *unused)
>+{
>+	u64 err = 0;
>+
>+	do {
>+		err = tdh_phymem_cache_wb(!!err);
>+	} while (err == TDX_INTERRUPTED_RESUMABLE);
>+
>+	/* Other thread may have done for us. */
>+	if (err == TDX_NO_HKID_READY_TO_WBCACHE)
>+		err = TDX_SUCCESS;
>+	if (WARN_ON_ONCE(err))
>+		pr_tdx_error(TDH_PHYMEM_CACHE_WB, err, NULL);
>+}
>+
>+void tdx_mmu_release_hkid(struct kvm *kvm)
>+{
>+	bool packages_allocated, targets_allocated;
>+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
>+	cpumask_var_t packages, targets;
>+	u64 err;
>+	int i;
>+
>+	if (!is_hkid_assigned(kvm_tdx))
>+		return;
>+
>+	if (!is_td_created(kvm_tdx)) {
>+		tdx_hkid_free(kvm_tdx);
>+		return;
>+	}
>+
>+	packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
>+	targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
>+	cpus_read_lock();
>+
>+	/*
>+	 * We can destroy multiple guest TDs simultaneously.  Prevent
>+	 * tdh_phymem_cache_wb from returning TDX_BUSY by serialization.
>+	 */
>+	mutex_lock(&tdx_lock);
>+
>+	/*
>+	 * Go through multiple TDX HKID state transitions with three SEAMCALLs
>+	 * to make TDH.PHYMEM.PAGE.RECLAIM() usable.  Make the transition atomic
>+	 * to other functions to operate private pages and Secure-EPT pages.
>+	 *
>+	 * Avoid race for kvm_gmem_release() to call kvm_mmu_unmap_gfn_range().
>+	 * This function is called via mmu notifier, mmu_release().
>+	 * kvm_gmem_release() is called via fput() on process exit.
>+	 */
>+	write_lock(&kvm->mmu_lock);
>+
>+	for_each_online_cpu(i) {
>+		if (packages_allocated &&
>+		    cpumask_test_and_set_cpu(topology_physical_package_id(i),
>+					     packages))
>+			continue;
>+		if (targets_allocated)
>+			cpumask_set_cpu(i, targets);
>+	}
>+	if (targets_allocated)
>+		on_each_cpu_mask(targets, tdx_do_tdh_phymem_cache_wb, NULL, true);
>+	else
>+		on_each_cpu(tdx_do_tdh_phymem_cache_wb, NULL, true);

This tries to flush the cache on all CPUs when we run out of memory. I am not
sure if it is the best solution. A simple solution is to just use two global
bitmaps.

And the current logic isn't optimal, e.g., if packages_allocated is true while
targets_allocated is false, then we fill in the packages bitmap but don't
use it at all.

That said, I prefer to optimize the rare case in a separate patch. We can just
use two global bitmaps, or let the flush fail here just as you are doing below
on SEAMCALL failure.

>+	/*
>+	 * In the case of error in tdx_do_tdh_phymem_cache_wb(), the following
>+	 * tdh_mng_key_freeid() will fail.
>+	 */
>+	err = tdh_mng_key_freeid(kvm_tdx->tdr_pa);
>+	if (WARN_ON_ONCE(err)) {
>+		pr_tdx_error(TDH_MNG_KEY_FREEID, err, NULL);
>+		pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
>+		       kvm_tdx->hkid);
>+	} else
>+		tdx_hkid_free(kvm_tdx);

curly brackets are missing.

>+
>+	write_unlock(&kvm->mmu_lock);
>+	mutex_unlock(&tdx_lock);
>+	cpus_read_unlock();
>+	free_cpumask_var(targets);
>+	free_cpumask_var(packages);
>+}
>+

>+static int __tdx_td_init(struct kvm *kvm)
>+{
>+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
>+	cpumask_var_t packages;
>+	unsigned long *tdcs_pa = NULL;
>+	unsigned long tdr_pa = 0;
>+	unsigned long va;
>+	int ret, i;
>+	u64 err;
>+
>+	ret = tdx_guest_keyid_alloc();
>+	if (ret < 0)
>+		return ret;
>+	kvm_tdx->hkid = ret;
>+
>+	va = __get_free_page(GFP_KERNEL_ACCOUNT);
>+	if (!va)
>+		goto free_hkid;
>+	tdr_pa = __pa(va);
>+
>+	tdcs_pa = kcalloc(tdx_info->nr_tdcs_pages, sizeof(*kvm_tdx->tdcs_pa),
>+			  GFP_KERNEL_ACCOUNT | __GFP_ZERO);
>+	if (!tdcs_pa)
>+		goto free_tdr;
>+	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
>+		va = __get_free_page(GFP_KERNEL_ACCOUNT);
>+		if (!va)
>+			goto free_tdcs;
>+		tdcs_pa[i] = __pa(va);
>+	}
>+
>+	if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) {
>+		ret = -ENOMEM;
>+		goto free_tdcs;
>+	}
>+	cpus_read_lock();
>+	/*
>+	 * Need at least one CPU of the package to be online in order to
>+	 * program all packages for host key id.  Check it.
>+	 */
>+	for_each_present_cpu(i)
>+		cpumask_set_cpu(topology_physical_package_id(i), packages);
>+	for_each_online_cpu(i)
>+		cpumask_clear_cpu(topology_physical_package_id(i), packages);
>+	if (!cpumask_empty(packages)) {
>+		ret = -EIO;
>+		/*
>+		 * Because it's hard for human operator to figure out the
>+		 * reason, warn it.
>+		 */
>+#define MSG_ALLPKG	"All packages need to have online CPU to create TD. Online CPU and retry.\n"
>+		pr_warn_ratelimited(MSG_ALLPKG);
>+		goto free_packages;
>+	}
>+
>+	/*
>+	 * Acquire global lock to avoid TDX_OPERAND_BUSY:
>+	 * TDH.MNG.CREATE and other APIs try to lock the global Key Owner
>+	 * Table (KOT) to track the assigned TDX private HKID.  It doesn't spin
>+	 * to acquire the lock, returns TDX_OPERAND_BUSY instead, and let the
>+	 * caller to handle the contention.  This is because of time limitation
>+	 * usable inside the TDX module and OS/VMM knows better about process
>+	 * scheduling.
>+	 *
>+	 * APIs to acquire the lock of KOT:
>+	 * TDH.MNG.CREATE, TDH.MNG.KEY.FREEID, TDH.MNG.VPFLUSHDONE, and
>+	 * TDH.PHYMEM.CACHE.WB.
>+	 */
>+	mutex_lock(&tdx_lock);
>+	err = tdh_mng_create(tdr_pa, kvm_tdx->hkid);
>+	mutex_unlock(&tdx_lock);
>+	if (err == TDX_RND_NO_ENTROPY) {
>+		ret = -EAGAIN;
>+		goto free_packages;
>+	}
>+	if (WARN_ON_ONCE(err)) {
>+		pr_tdx_error(TDH_MNG_CREATE, err, NULL);
>+		ret = -EIO;
>+		goto free_packages;
>+	}
>+	kvm_tdx->tdr_pa = tdr_pa;
>+
>+	for_each_online_cpu(i) {
>+		int pkg = topology_physical_package_id(i);
>+
>+		if (cpumask_test_and_set_cpu(pkg, packages))
>+			continue;
>+
>+		/*
>+		 * Program the memory controller in the package with an
>+		 * encryption key associated to a TDX private host key id
>+		 * assigned to this TDR.  Concurrent operations on same memory
>+		 * controller results in TDX_OPERAND_BUSY.  Avoid this race by
>+		 * mutex.
>+		 */
>+		mutex_lock(&tdx_mng_key_config_lock[pkg]);

The lock is superfluous to me. With the CPU hotplug read lock held, even if
multiple CPUs try to create TDs, the same set of CPUs (the first online CPU of
each package) will be selected to configure the key because of the
cpumask_test_and_set_cpu() above. It means we never have two CPUs in the same
socket trying to program the key, i.e., no concurrent calls.

>+		ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
>+				      &kvm_tdx->tdr_pa, true);
>+		mutex_unlock(&tdx_mng_key_config_lock[pkg]);
>+		if (ret)
>+			break;
>+	}
>+	cpus_read_unlock();
>+	free_cpumask_var(packages);
>+	if (ret) {
>+		i = 0;
>+		goto teardown;
>+	}
>+
>+	kvm_tdx->tdcs_pa = tdcs_pa;
>+	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
>+		err = tdh_mng_addcx(kvm_tdx->tdr_pa, tdcs_pa[i]);
>+		if (err == TDX_RND_NO_ENTROPY) {
>+			/* Here it's hard to allow userspace to retry. */
>+			ret = -EBUSY;
>+			goto teardown;
>+		}
>+		if (WARN_ON_ONCE(err)) {
>+			pr_tdx_error(TDH_MNG_ADDCX, err, NULL);
>+			ret = -EIO;
>+			goto teardown;
>+		}
>+	}
>+
>+	/*
>+	 * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires a dedicated
>+	 * ioctl() to define the configure CPUID values for the TD.
>+	 */
>+	return 0;
>+
>+	/*
>+	 * The sequence for freeing resources from a partially initialized TD
>+	 * varies based on where in the initialization flow failure occurred.
>+	 * Simply use the full teardown and destroy, which naturally play nice
>+	 * with partial initialization.
>+	 */
>+teardown:
>+	for (; i < tdx_info->nr_tdcs_pages; i++) {
>+		if (tdcs_pa[i]) {
>+			free_page((unsigned long)__va(tdcs_pa[i]));
>+			tdcs_pa[i] = 0;
>+		}
>+	}
>+	if (!kvm_tdx->tdcs_pa)
>+		kfree(tdcs_pa);
>+	tdx_mmu_release_hkid(kvm);
>+	tdx_vm_free(kvm);
>+	return ret;
>+
>+free_packages:
>+	cpus_read_unlock();
>+	free_cpumask_var(packages);
>+free_tdcs:
>+	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
>+		if (tdcs_pa[i])
>+			free_page((unsigned long)__va(tdcs_pa[i]));
>+	}
>+	kfree(tdcs_pa);
>+	kvm_tdx->tdcs_pa = NULL;
>+
>+free_tdr:
>+	if (tdr_pa)
>+		free_page((unsigned long)__va(tdr_pa));
>+	kvm_tdx->tdr_pa = 0;
>+free_hkid:
>+	if (is_hkid_assigned(kvm_tdx))

IIUC, this is always true because you just return if keyid
allocation fails.

	>+	ret = tdx_guest_keyid_alloc();
	>+	if (ret < 0)
	>+		return ret;
	>+	kvm_tdx->hkid = ret;
	>+
	>+	va = __get_free_page(GFP_KERNEL_ACCOUNT);
	>+	if (!va)
	>+		goto free_hkid;

>+		tdx_hkid_free(kvm_tdx);
>+	return ret;
>+}
Isaku Yamahata March 21, 2024, 2:17 p.m. UTC | #2
On Wed, Mar 20, 2024 at 01:12:01PM +0800,
Chao Gao <chao.gao@intel.com> wrote:

> > config KVM_SW_PROTECTED_VM
> > 	bool "Enable support for KVM software-protected VMs"
> >-	depends on EXPERT
> > 	depends on KVM && X86_64
> > 	select KVM_GENERIC_PRIVATE_MEM
> > 	help
> >@@ -89,6 +88,8 @@ config KVM_SW_PROTECTED_VM
> > config KVM_INTEL
> > 	tristate "KVM for Intel (and compatible) processors support"
> > 	depends on KVM && IA32_FEAT_CTL
> >+	select KVM_SW_PROTECTED_VM if INTEL_TDX_HOST
> 
> why does INTEL_TDX_HOST select KVM_SW_PROTECTED_VM?

I wanted KVM_GENERIC_PRIVATE_MEM.  Ah, we should do

        select KVM_GENERIC_PRIVATE_MEM if INTEL_TDX_HOST


> >+	select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST
> > 	help
> > 	.vcpu_precreate = vmx_vcpu_precreate,
> > 	.vcpu_create = vmx_vcpu_create,
> 
> >--- a/arch/x86/kvm/vmx/tdx.c
> >+++ b/arch/x86/kvm/vmx/tdx.c
> >@@ -5,10 +5,11 @@
> > 
> > #include "capabilities.h"
> > #include "x86_ops.h"
> >-#include "x86.h"
> > #include "mmu.h"
> > #include "tdx_arch.h"
> > #include "tdx.h"
> >+#include "tdx_ops.h"
> >+#include "x86.h"
> 
> any reason to reorder x86.h?

No, I think it's accidental during rebase.
Will fix.



> >+static void tdx_do_tdh_phymem_cache_wb(void *unused)
> >+{
> >+	u64 err = 0;
> >+
> >+	do {
> >+		err = tdh_phymem_cache_wb(!!err);
> >+	} while (err == TDX_INTERRUPTED_RESUMABLE);
> >+
> >+	/* Other thread may have done for us. */
> >+	if (err == TDX_NO_HKID_READY_TO_WBCACHE)
> >+		err = TDX_SUCCESS;
> >+	if (WARN_ON_ONCE(err))
> >+		pr_tdx_error(TDH_PHYMEM_CACHE_WB, err, NULL);
> >+}
> >+
> >+void tdx_mmu_release_hkid(struct kvm *kvm)
> >+{
> >+	bool packages_allocated, targets_allocated;
> >+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> >+	cpumask_var_t packages, targets;
> >+	u64 err;
> >+	int i;
> >+
> >+	if (!is_hkid_assigned(kvm_tdx))
> >+		return;
> >+
> >+	if (!is_td_created(kvm_tdx)) {
> >+		tdx_hkid_free(kvm_tdx);
> >+		return;
> >+	}
> >+
> >+	packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
> >+	targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
> >+	cpus_read_lock();
> >+
> >+	/*
> >+	 * We can destroy multiple guest TDs simultaneously.  Prevent
> >+	 * tdh_phymem_cache_wb from returning TDX_BUSY by serialization.
> >+	 */
> >+	mutex_lock(&tdx_lock);
> >+
> >+	/*
> >+	 * Go through multiple TDX HKID state transitions with three SEAMCALLs
> >+	 * to make TDH.PHYMEM.PAGE.RECLAIM() usable.  Make the transition atomic
> >+	 * to other functions to operate private pages and Secure-EPT pages.
> >+	 *
> >+	 * Avoid race for kvm_gmem_release() to call kvm_mmu_unmap_gfn_range().
> >+	 * This function is called via mmu notifier, mmu_release().
> >+	 * kvm_gmem_release() is called via fput() on process exit.
> >+	 */
> >+	write_lock(&kvm->mmu_lock);
> >+
> >+	for_each_online_cpu(i) {
> >+		if (packages_allocated &&
> >+		    cpumask_test_and_set_cpu(topology_physical_package_id(i),
> >+					     packages))
> >+			continue;
> >+		if (targets_allocated)
> >+			cpumask_set_cpu(i, targets);
> >+	}
> >+	if (targets_allocated)
> >+		on_each_cpu_mask(targets, tdx_do_tdh_phymem_cache_wb, NULL, true);
> >+	else
> >+		on_each_cpu(tdx_do_tdh_phymem_cache_wb, NULL, true);
> 
> This tries flush cache on all CPUs when we run out of memory. I am not sure if
> it is the best solution. A simple solution is just use two global bitmaps.
> 
> And current logic isn't optimal. e.g., if packages_allocated is true while
> targets_allocated is false, then we will fill in the packages bitmap but don't
> use it at all.
> 
> That said, I prefer to optimize the rare case in a separate patch. We can just use
> two global bitmaps or let the flush fail here just as you are doing below on
> seamcall failure.

Makes sense. We can allocate cpumasks on hardware_setup/unsetup() and update them
on hardware_enable/disable().
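
As a rough sketch of the "two global bitmaps" idea (all names below are
hypothetical, and this only covers the allocate-once part, not the
hardware_enable/disable updates): allocate the masks once at hardware setup
and reuse them under tdx_lock, so the destroy path never needs to allocate:

/* Hypothetical globals, allocated once and only touched with tdx_lock held. */
static cpumask_var_t tdx_wbcache_packages;
static cpumask_var_t tdx_wbcache_targets;

/* Called from hardware setup; freed again on unsetup. */
static int tdx_alloc_wbcache_masks(void)
{
	if (!zalloc_cpumask_var(&tdx_wbcache_packages, GFP_KERNEL))
		return -ENOMEM;
	if (!zalloc_cpumask_var(&tdx_wbcache_targets, GFP_KERNEL)) {
		free_cpumask_var(tdx_wbcache_packages);
		return -ENOMEM;
	}
	return 0;
}

tdx_mmu_release_hkid() would then, with tdx_lock held, cpumask_clear() both
masks, fill them exactly as today, and call on_each_cpu_mask() unconditionally.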

...

> >+static int __tdx_td_init(struct kvm *kvm)
> >+{
> >+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> >+	cpumask_var_t packages;
> >+	unsigned long *tdcs_pa = NULL;
> >+	unsigned long tdr_pa = 0;
> >+	unsigned long va;
> >+	int ret, i;
> >+	u64 err;
> >+
> >+	ret = tdx_guest_keyid_alloc();
> >+	if (ret < 0)
> >+		return ret;
> >+	kvm_tdx->hkid = ret;
> >+
> >+	va = __get_free_page(GFP_KERNEL_ACCOUNT);
> >+	if (!va)
> >+		goto free_hkid;
> >+	tdr_pa = __pa(va);
> >+
> >+	tdcs_pa = kcalloc(tdx_info->nr_tdcs_pages, sizeof(*kvm_tdx->tdcs_pa),
> >+			  GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> >+	if (!tdcs_pa)
> >+		goto free_tdr;
> >+	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
> >+		va = __get_free_page(GFP_KERNEL_ACCOUNT);
> >+		if (!va)
> >+			goto free_tdcs;
> >+		tdcs_pa[i] = __pa(va);
> >+	}
> >+
> >+	if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) {
> >+		ret = -ENOMEM;
> >+		goto free_tdcs;
> >+	}
> >+	cpus_read_lock();
> >+	/*
> >+	 * Need at least one CPU of the package to be online in order to
> >+	 * program all packages for host key id.  Check it.
> >+	 */
> >+	for_each_present_cpu(i)
> >+		cpumask_set_cpu(topology_physical_package_id(i), packages);
> >+	for_each_online_cpu(i)
> >+		cpumask_clear_cpu(topology_physical_package_id(i), packages);
> >+	if (!cpumask_empty(packages)) {
> >+		ret = -EIO;
> >+		/*
> >+		 * Because it's hard for human operator to figure out the
> >+		 * reason, warn it.
> >+		 */
> >+#define MSG_ALLPKG	"All packages need to have online CPU to create TD. Online CPU and retry.\n"
> >+		pr_warn_ratelimited(MSG_ALLPKG);
> >+		goto free_packages;
> >+	}
> >+
> >+	/*
> >+	 * Acquire global lock to avoid TDX_OPERAND_BUSY:
> >+	 * TDH.MNG.CREATE and other APIs try to lock the global Key Owner
> >+	 * Table (KOT) to track the assigned TDX private HKID.  It doesn't spin
> >+	 * to acquire the lock, returns TDX_OPERAND_BUSY instead, and let the
> >+	 * caller to handle the contention.  This is because of time limitation
> >+	 * usable inside the TDX module and OS/VMM knows better about process
> >+	 * scheduling.
> >+	 *
> >+	 * APIs to acquire the lock of KOT:
> >+	 * TDH.MNG.CREATE, TDH.MNG.KEY.FREEID, TDH.MNG.VPFLUSHDONE, and
> >+	 * TDH.PHYMEM.CACHE.WB.
> >+	 */
> >+	mutex_lock(&tdx_lock);
> >+	err = tdh_mng_create(tdr_pa, kvm_tdx->hkid);
> >+	mutex_unlock(&tdx_lock);
> >+	if (err == TDX_RND_NO_ENTROPY) {
> >+		ret = -EAGAIN;
> >+		goto free_packages;
> >+	}
> >+	if (WARN_ON_ONCE(err)) {
> >+		pr_tdx_error(TDH_MNG_CREATE, err, NULL);
> >+		ret = -EIO;
> >+		goto free_packages;
> >+	}
> >+	kvm_tdx->tdr_pa = tdr_pa;
> >+
> >+	for_each_online_cpu(i) {
> >+		int pkg = topology_physical_package_id(i);
> >+
> >+		if (cpumask_test_and_set_cpu(pkg, packages))
> >+			continue;
> >+
> >+		/*
> >+		 * Program the memory controller in the package with an
> >+		 * encryption key associated to a TDX private host key id
> >+		 * assigned to this TDR.  Concurrent operations on same memory
> >+		 * controller results in TDX_OPERAND_BUSY.  Avoid this race by
> >+		 * mutex.
> >+		 */
> >+		mutex_lock(&tdx_mng_key_config_lock[pkg]);
> 
> the lock is superfluous to me. with cpu lock held, even if multiple CPUs try to
> create TDs, the same set of CPUs (the first online CPU of each package) will be
> selected to configure the key because of the cpumask_test_and_set_cpu() above.
> it means, we never have two CPUs in the same socket trying to program the key,
> i.e., no concurrent calls.

Makes sense. Will drop the lock.


> >+		ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
> >+				      &kvm_tdx->tdr_pa, true);
> >+		mutex_unlock(&tdx_mng_key_config_lock[pkg]);
> >+		if (ret)
> >+			break;
> >+	}
> >+	cpus_read_unlock();
> >+	free_cpumask_var(packages);
> >+	if (ret) {
> >+		i = 0;
> >+		goto teardown;
> >+	}
> >+
> >+	kvm_tdx->tdcs_pa = tdcs_pa;
> >+	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
> >+		err = tdh_mng_addcx(kvm_tdx->tdr_pa, tdcs_pa[i]);
> >+		if (err == TDX_RND_NO_ENTROPY) {
> >+			/* Here it's hard to allow userspace to retry. */
> >+			ret = -EBUSY;
> >+			goto teardown;
> >+		}
> >+		if (WARN_ON_ONCE(err)) {
> >+			pr_tdx_error(TDH_MNG_ADDCX, err, NULL);
> >+			ret = -EIO;
> >+			goto teardown;
> >+		}
> >+	}
> >+
> >+	/*
> >+	 * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires a dedicated
> >+	 * ioctl() to define the configure CPUID values for the TD.
> >+	 */
> >+	return 0;
> >+
> >+	/*
> >+	 * The sequence for freeing resources from a partially initialized TD
> >+	 * varies based on where in the initialization flow failure occurred.
> >+	 * Simply use the full teardown and destroy, which naturally play nice
> >+	 * with partial initialization.
> >+	 */
> >+teardown:
> >+	for (; i < tdx_info->nr_tdcs_pages; i++) {
> >+		if (tdcs_pa[i]) {
> >+			free_page((unsigned long)__va(tdcs_pa[i]));
> >+			tdcs_pa[i] = 0;
> >+		}
> >+	}
> >+	if (!kvm_tdx->tdcs_pa)
> >+		kfree(tdcs_pa);
> >+	tdx_mmu_release_hkid(kvm);
> >+	tdx_vm_free(kvm);
> >+	return ret;
> >+
> >+free_packages:
> >+	cpus_read_unlock();
> >+	free_cpumask_var(packages);
> >+free_tdcs:
> >+	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
> >+		if (tdcs_pa[i])
> >+			free_page((unsigned long)__va(tdcs_pa[i]));
> >+	}
> >+	kfree(tdcs_pa);
> >+	kvm_tdx->tdcs_pa = NULL;
> >+
> >+free_tdr:
> >+	if (tdr_pa)
> >+		free_page((unsigned long)__va(tdr_pa));
> >+	kvm_tdx->tdr_pa = 0;
> >+free_hkid:
> >+	if (is_hkid_assigned(kvm_tdx))
> 
> IIUC, this is always true because you just return if keyid
> allocation fails.

You're right. Will fix
Kai Huang March 22, 2024, 1:06 a.m. UTC | #3
On 26/02/2024 9:25 pm, isaku.yamahata@intel.com wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> As the first step to create TDX guest, create/destroy VM struct.  Assign
> TDX private Host Key ID (HKID) to the TDX guest for memory encryption and

Here we are at patch 38, and I don't think you still need to fully spell out
"Host KeyID (HKID)" anymore.

Just say: Assign one TDX private KeyID ...

> allocate extra pages for the TDX guest. On destruction, free allocated
> pages, and HKID.

Could we put more information here?

For instance, this would be a wonderful place to explain what TDR,
TDCS, etc. are??

And to briefly describe the sequence of creating the TD?

Roughly checking the code, you have implemented many things including the
MNG.KEY.CONFIG stuff.  It's worth adding some text here to give reviewers
a rough idea of what's going on here.

> 
> Before tearing down private page tables, TDX requires some resources of the
> guest TD to be destroyed (i.e. HKID must have been reclaimed, etc).  Add
> mmu notifier release callback before tearing down private page tables for
> it.
>
> Add vm_free() of kvm_x86_ops hook at the end of kvm_arch_destroy_vm()
> because some per-VM TDX resources, e.g. TDR, need to be freed after other
> TDX resources, e.g. HKID, were freed.

I think we should split the "adding callbacks" part out, given you have ...

	9 files changed, 520 insertions(+), 8 deletions(-)

... in this patch.

IMHO, a >500 LOC change normally means there are too many things in this
patch, thus it is hard to review, and we should split it.

I think perhaps we can split this big patch into smaller pieces based on
the steps, like we did for the init_tdx_module() function in the TDX
host patchset??

(But I would like to hear from others too.)

> 
> Co-developed-by: Kai Huang <kai.huang@intel.com>
> Signed-off-by: Kai Huang <kai.huang@intel.com>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>

Sean's tag doesn't make sense anymore.

> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> 
[...]

> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index c45252ed2ffd..2becc86c71b2 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -6866,6 +6866,13 @@ static void kvm_mmu_zap_all(struct kvm *kvm)
>   
>   void kvm_arch_flush_shadow_all(struct kvm *kvm)
>   {
> +	/*
> +	 * kvm_mmu_zap_all() zaps both private and shared page tables.  Before
> +	 * tearing down private page tables, TDX requires some TD resources to
> +	 * be destroyed (i.e. keyID must have been reclaimed, etc).  Invoke
> +	 * kvm_x86_flush_shadow_all_private() for this.
> +	 */
> +	static_call_cond(kvm_x86_flush_shadow_all_private)(kvm);
>   	kvm_mmu_zap_all(kvm);
>   }
>   
> diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> index e8a1a7533eea..437c6d5e802e 100644
> --- a/arch/x86/kvm/vmx/main.c
> +++ b/arch/x86/kvm/vmx/main.c
> @@ -62,11 +62,31 @@ static int vt_vm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
>   static int vt_vm_init(struct kvm *kvm)
>   {
>   	if (is_td(kvm))
> -		return -EOPNOTSUPP;	/* Not ready to create guest TD yet. */
> +		return tdx_vm_init(kvm);
>   
>   	return vmx_vm_init(kvm);
>   }
>   
> +static void vt_flush_shadow_all_private(struct kvm *kvm)
> +{
> +	if (is_td(kvm))
> +		tdx_mmu_release_hkid(kvm);

Add a comment to explain please.

> +}
> +
> +static void vt_vm_destroy(struct kvm *kvm)
> +{
> +	if (is_td(kvm))
> +		return;

Please add a comment to explain why we don't do anything here, but have 
to delay to vt_vm_free().
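
For example, something along these lines (the wording is only a suggestion,
derived from the changelog, not the author's actual comment):

static void vt_vm_destroy(struct kvm *kvm)
{
	/*
	 * Per-VM TDX resources, e.g. the TDR, can only be freed after the
	 * HKID has been released and the cache flushed, so TD teardown is
	 * deferred to vt_vm_free(), which runs at the end of
	 * kvm_arch_destroy_vm().
	 */
	if (is_td(kvm))
		return;

	vmx_vm_destroy(kvm);
}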

> +
> +	vmx_vm_destroy(kvm);
> +}
> +
> +static void vt_vm_free(struct kvm *kvm)
> +{
> +	if (is_td(kvm))
> +		tdx_vm_free(kvm);
> +}
> +
>   static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
>   {
>   	if (!is_td(kvm))
> @@ -101,7 +121,9 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
>   	.vm_size = sizeof(struct kvm_vmx),
>   	.vm_enable_cap = vt_vm_enable_cap,
>   	.vm_init = vt_vm_init,
> -	.vm_destroy = vmx_vm_destroy,
> +	.flush_shadow_all_private = vt_flush_shadow_all_private,
> +	.vm_destroy = vt_vm_destroy,
> +	.vm_free = vt_vm_free,
>   
>   	.vcpu_precreate = vmx_vcpu_precreate,
>   	.vcpu_create = vmx_vcpu_create,
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index ee015f3ce2c9..1cf2b15da257 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -5,10 +5,11 @@
>   
>   #include "capabilities.h"
>   #include "x86_ops.h"
> -#include "x86.h"
>   #include "mmu.h"
>   #include "tdx_arch.h"
>   #include "tdx.h"
> +#include "tdx_ops.h"
> +#include "x86.h"
>   
>   #undef pr_fmt
>   #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> @@ -22,7 +23,7 @@
>   /* TDX KeyID pool */
>   static DEFINE_IDA(tdx_guest_keyid_pool);
>   
> -static int __used tdx_guest_keyid_alloc(void)
> +static int tdx_guest_keyid_alloc(void)
>   {
>   	if (WARN_ON_ONCE(!tdx_guest_keyid_start || !tdx_nr_guest_keyids))
>   		return -EINVAL;
> @@ -32,7 +33,7 @@ static int __used tdx_guest_keyid_alloc(void)
>   			       GFP_KERNEL);
>   }
>   
> -static void __used tdx_guest_keyid_free(int keyid)
> +static void tdx_guest_keyid_free(int keyid)
>   {
>   	if (WARN_ON_ONCE(keyid < tdx_guest_keyid_start ||
>   			 keyid > tdx_guest_keyid_start + tdx_nr_guest_keyids - 1))
> @@ -48,6 +49,8 @@ struct tdx_info {
>   	u64 xfam_fixed0;
>   	u64 xfam_fixed1;
>   
> +	u8 nr_tdcs_pages;
> +
>   	u16 num_cpuid_config;
>   	/* This must the last member. */
>   	DECLARE_FLEX_ARRAY(struct kvm_tdx_cpuid_config, cpuid_configs);
> @@ -85,6 +88,282 @@ int tdx_vm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
>   	return r;
>   }
>   
> +/*
> + * Some TDX SEAMCALLs (TDH.MNG.CREATE, TDH.PHYMEM.CACHE.WB,
> + * TDH.MNG.KEY.RECLAIMID, TDH.MNG.KEY.FREEID etc) tries to acquire a global lock
> + * internally in TDX module.  If failed, TDX_OPERAND_BUSY is returned without
> + * spinning or waiting due to a constraint on execution time.  It's caller's
> + * responsibility to avoid race (or retry on TDX_OPERAND_BUSY).  Use this mutex
> + * to avoid race in TDX module because the kernel knows better about scheduling.
> + */

/*
 * Some SEAMCALLs acquire a TDX module global lock.  They fail with
 * TDX_OPERAND_BUSY if they fail to acquire it.  Use a global mutex to
 * serialize these SEAMCALLs.
 */
> +static DEFINE_MUTEX(tdx_lock);
> +static struct mutex *tdx_mng_key_config_lock;
> +
> +static __always_inline hpa_t set_hkid_to_hpa(hpa_t pa, u16 hkid)
> +{
> +	return pa | ((hpa_t)hkid << boot_cpu_data.x86_phys_bits);
> +}
> +
> +static inline bool is_td_created(struct kvm_tdx *kvm_tdx)
> +{
> +	return kvm_tdx->tdr_pa;
> +}
> +
> +static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
> +{
> +	tdx_guest_keyid_free(kvm_tdx->hkid);
> +	kvm_tdx->hkid = -1;
> +}
> +
> +static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
> +{
> +	return kvm_tdx->hkid > 0;
> +}

Hmm...

"Allocating a TDX private KeyID" seems to be one step of creating the 
TDX guest.  Perhaps we should split this patch based on the steps of 
creating TD.

> +
> +static void tdx_clear_page(unsigned long page_pa)
> +{
> +	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
> +	void *page = __va(page_pa);
> +	unsigned long i;
> +
> +	/*
> +	 * When re-assign one page from old keyid to a new keyid, MOVDIR64B is
> +	 * required to clear/write the page with new keyid to prevent integrity
> +	 * error when read on the page with new keyid.
> +	 *
> +	 * clflush doesn't flush cache with HKID set.  

I don't understand this "clflush".

(Firstly, I think it's better to use "TDX private KeyID" or "TDX KeyID"
instead of HKID, which can really be an MKTME KeyID.)

How is "clflush doesn't flush cache with HKID set" relevant here??

What you really want is "all caches associated with the given page must 
have been flushed before tdx_clear_page()".

You can add that as a function comment for tdx_clear_page(), but certainly
not here.

The cache line could be
> +	 * poisoned (even without MKTME-i), clear the poison bit.
> +	 */

Is below better?

	/*
	 * The page could have been poisoned.  MOVDIR64B also clears
	 * the poison bit so the kernel can safely use the page again.
	 */


> +	for (i = 0; i < PAGE_SIZE; i += 64)
> +		movdir64b(page + i, zero_page);
> +	/*
> +	 * MOVDIR64B store uses WC buffer.  Prevent following memory reads
> +	 * from seeing potentially poisoned cache.
> +	 */
> +	__mb();
> +}
> +
> +static int __tdx_reclaim_page(hpa_t pa)
> +{
> +	struct tdx_module_args out;
> +	u64 err;
> +
> +	do {
> +		err = tdh_phymem_page_reclaim(pa, &out);
> +		/*
> +		 * TDH.PHYMEM.PAGE.RECLAIM is allowed only when TD is shutdown.
> +		 * state.  i.e. destructing TD.

Does this mean __tdx_reclaim_page() can only be used when destroying the TD?

Please add this to the function comment of __tdx_reclaim_page().

> +		 * TDH.PHYMEM.PAGE.RECLAIM requires TDR and target page.
> +		 * Because we're destructing TD, it's rare to contend with TDR.
> +		 */

It's rare to contend, so what?

If you want to justify the loop, and the "unlikely()" used in the loop,
then put this part right before the 'do { } while ()' loop, where your
intention applies, and call it out explicitly.

And in general I think it's better to add a 'cond_resched()' for such a
loop because SEAMCALLs are time-costly.  If your comment is intended to
justify not adding 'cond_resched()', please also call that out.
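
To illustrate the cond_resched() point, one possible shape of the retry loop
(just a sketch, not necessarily what this series should end up with):

	for (;;) {
		err = tdh_phymem_page_reclaim(pa, &out);
		if (err != (TDX_OPERAND_BUSY | TDX_OPERAND_ID_RCX) &&
		    err != (TDX_OPERAND_BUSY | TDX_OPERAND_ID_TDR))
			break;
		/* SEAMCALLs are slow; give other tasks a chance between retries. */
		cond_resched();
	}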

> +	} while (unlikely(err == (TDX_OPERAND_BUSY | TDX_OPERAND_ID_RCX) ||
> +			  err == (TDX_OPERAND_BUSY | TDX_OPERAND_ID_TDR)));
> +	if (WARN_ON_ONCE(err)) {
> +		pr_tdx_error(TDH_PHYMEM_PAGE_RECLAIM, err, &out);
> +		return -EIO;
> +	}
> +
> +	return 0;
> +}
> +
> +static int tdx_reclaim_page(hpa_t pa)
> +{
> +	int r;
> +
> +	r = __tdx_reclaim_page(pa);
> +	if (!r)
> +		tdx_clear_page(pa);
> +	return r;
> +}
> +
> +static void tdx_reclaim_control_page(unsigned long td_page_pa)
> +{
> +	WARN_ON_ONCE(!td_page_pa);
> +
> +	/*
> +	 * TDCX are being reclaimed.  TDX module maps TDCX with HKID
> +	 * assigned to the TD.  Here the cache associated to the TD
> +	 * was already flushed by TDH.PHYMEM.CACHE.WB before here, So
> +	 * cache doesn't need to be flushed again.
> +	 */
> +	if (tdx_reclaim_page(td_page_pa))
> +		/*
> +		 * Leak the page on failure:
> +		 * tdx_reclaim_page() returns an error if and only if there's an
> +		 * unexpected, fatal error, e.g. a SEAMCALL with bad params,
> +		 * incorrect concurrency in KVM, a TDX Module bug, etc.
> +		 * Retrying at a later point is highly unlikely to be
> +		 * successful.
> +		 * No log here as tdx_reclaim_page() already did.
> +		 */
> +		return;

Use empty lines to make text more breathable between paragraphs.

> +	free_page((unsigned long)__va(td_page_pa));
> +}
> +
> +static void tdx_do_tdh_phymem_cache_wb(void *unused)
> +{
> +	u64 err = 0;
> +
> +	do {
> +		err = tdh_phymem_cache_wb(!!err);
> +	} while (err == TDX_INTERRUPTED_RESUMABLE);
> +
> +	/* Other thread may have done for us. */
> +	if (err == TDX_NO_HKID_READY_TO_WBCACHE)
> +		err = TDX_SUCCESS;

Empty line.

> +	if (WARN_ON_ONCE(err))
> +		pr_tdx_error(TDH_PHYMEM_CACHE_WB, err, NULL);
> +}

[snip]

I am stopping here, because I need to take a break.

Again, I think we should split this patch; there are just too many things
to review here.
Yuan Yao March 22, 2024, 3:46 a.m. UTC | #4
On Thu, Mar 21, 2024 at 07:17:09AM -0700, Isaku Yamahata wrote:
> On Wed, Mar 20, 2024 at 01:12:01PM +0800,
> Chao Gao <chao.gao@intel.com> wrote:
>
> > > config KVM_SW_PROTECTED_VM
> > > 	bool "Enable support for KVM software-protected VMs"
> > >-	depends on EXPERT
> > > 	depends on KVM && X86_64
> > > 	select KVM_GENERIC_PRIVATE_MEM
> > > 	help
> > >@@ -89,6 +88,8 @@ config KVM_SW_PROTECTED_VM
> > > config KVM_INTEL
> > > 	tristate "KVM for Intel (and compatible) processors support"
> > > 	depends on KVM && IA32_FEAT_CTL
> > >+	select KVM_SW_PROTECTED_VM if INTEL_TDX_HOST
> >
> > why does INTEL_TDX_HOST select KVM_SW_PROTECTED_VM?
>
> I wanted KVM_GENERIC_PRIVATE_MEM.  Ah, we should do
>
>         select KKVM_GENERIC_PRIVATE_MEM if INTEL_TDX_HOST
>
>
> > >+	select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST
> > > 	help
> > > 	.vcpu_precreate = vmx_vcpu_precreate,
> > > 	.vcpu_create = vmx_vcpu_create,
> >
> > >--- a/arch/x86/kvm/vmx/tdx.c
> > >+++ b/arch/x86/kvm/vmx/tdx.c
> > >@@ -5,10 +5,11 @@
> > >
> > > #include "capabilities.h"
> > > #include "x86_ops.h"
> > >-#include "x86.h"
> > > #include "mmu.h"
> > > #include "tdx_arch.h"
> > > #include "tdx.h"
> > >+#include "tdx_ops.h"
> > >+#include "x86.h"
> >
> > any reason to reorder x86.h?
>
> No, I think it's accidental during rebase.
> Will fix.
>
>
>
> > >+static void tdx_do_tdh_phymem_cache_wb(void *unused)
> > >+{
> > >+	u64 err = 0;
> > >+
> > >+	do {
> > >+		err = tdh_phymem_cache_wb(!!err);
> > >+	} while (err == TDX_INTERRUPTED_RESUMABLE);
> > >+
> > >+	/* Other thread may have done for us. */
> > >+	if (err == TDX_NO_HKID_READY_TO_WBCACHE)
> > >+		err = TDX_SUCCESS;
> > >+	if (WARN_ON_ONCE(err))
> > >+		pr_tdx_error(TDH_PHYMEM_CACHE_WB, err, NULL);
> > >+}
> > >+
> > >+void tdx_mmu_release_hkid(struct kvm *kvm)
> > >+{
> > >+	bool packages_allocated, targets_allocated;
> > >+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> > >+	cpumask_var_t packages, targets;
> > >+	u64 err;
> > >+	int i;
> > >+
> > >+	if (!is_hkid_assigned(kvm_tdx))
> > >+		return;
> > >+
> > >+	if (!is_td_created(kvm_tdx)) {
> > >+		tdx_hkid_free(kvm_tdx);
> > >+		return;
> > >+	}
> > >+
> > >+	packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
> > >+	targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
> > >+	cpus_read_lock();
> > >+
> > >+	/*
> > >+	 * We can destroy multiple guest TDs simultaneously.  Prevent
> > >+	 * tdh_phymem_cache_wb from returning TDX_BUSY by serialization.
> > >+	 */
> > >+	mutex_lock(&tdx_lock);
> > >+
> > >+	/*
> > >+	 * Go through multiple TDX HKID state transitions with three SEAMCALLs
> > >+	 * to make TDH.PHYMEM.PAGE.RECLAIM() usable.  Make the transition atomic
> > >+	 * to other functions to operate private pages and Secure-EPT pages.
> > >+	 *
> > >+	 * Avoid race for kvm_gmem_release() to call kvm_mmu_unmap_gfn_range().
> > >+	 * This function is called via mmu notifier, mmu_release().
> > >+	 * kvm_gmem_release() is called via fput() on process exit.
> > >+	 */
> > >+	write_lock(&kvm->mmu_lock);
> > >+
> > >+	for_each_online_cpu(i) {
> > >+		if (packages_allocated &&
> > >+		    cpumask_test_and_set_cpu(topology_physical_package_id(i),
> > >+					     packages))
> > >+			continue;
> > >+		if (targets_allocated)
> > >+			cpumask_set_cpu(i, targets);
> > >+	}
> > >+	if (targets_allocated)
> > >+		on_each_cpu_mask(targets, tdx_do_tdh_phymem_cache_wb, NULL, true);
> > >+	else
> > >+		on_each_cpu(tdx_do_tdh_phymem_cache_wb, NULL, true);
> >
> > This tries flush cache on all CPUs when we run out of memory. I am not sure if
> > it is the best solution. A simple solution is just use two global bitmaps.
> >
> > And current logic isn't optimal. e.g., if packages_allocated is true while
> > targets_allocated is false, then we will fill in the packages bitmap but don't
> > use it at all.
> >
> > That said, I prefer to optimize the rare case in a separate patch. We can just use
> > two global bitmaps or let the flush fail here just as you are doing below on
> > seamcall failure.
>
> Makes sense. We can allocate cpumasks on hardware_setup/unsetup() and update them
> on hardware_enable/disable().
>
> ...
>
> > >+static int __tdx_td_init(struct kvm *kvm)
> > >+{
> > >+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> > >+	cpumask_var_t packages;
> > >+	unsigned long *tdcs_pa = NULL;
> > >+	unsigned long tdr_pa = 0;
> > >+	unsigned long va;
> > >+	int ret, i;
> > >+	u64 err;
> > >+
> > >+	ret = tdx_guest_keyid_alloc();
> > >+	if (ret < 0)
> > >+		return ret;
> > >+	kvm_tdx->hkid = ret;
> > >+
> > >+	va = __get_free_page(GFP_KERNEL_ACCOUNT);
> > >+	if (!va)
> > >+		goto free_hkid;
> > >+	tdr_pa = __pa(va);
> > >+
> > >+	tdcs_pa = kcalloc(tdx_info->nr_tdcs_pages, sizeof(*kvm_tdx->tdcs_pa),
> > >+			  GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> > >+	if (!tdcs_pa)
> > >+		goto free_tdr;
> > >+	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
> > >+		va = __get_free_page(GFP_KERNEL_ACCOUNT);
> > >+		if (!va)
> > >+			goto free_tdcs;
> > >+		tdcs_pa[i] = __pa(va);
> > >+	}
> > >+
> > >+	if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) {
> > >+		ret = -ENOMEM;
> > >+		goto free_tdcs;
> > >+	}
> > >+	cpus_read_lock();
> > >+	/*
> > >+	 * Need at least one CPU of the package to be online in order to
> > >+	 * program all packages for host key id.  Check it.
> > >+	 */
> > >+	for_each_present_cpu(i)
> > >+		cpumask_set_cpu(topology_physical_package_id(i), packages);
> > >+	for_each_online_cpu(i)
> > >+		cpumask_clear_cpu(topology_physical_package_id(i), packages);
> > >+	if (!cpumask_empty(packages)) {
> > >+		ret = -EIO;
> > >+		/*
> > >+		 * Because it's hard for human operator to figure out the
> > >+		 * reason, warn it.
> > >+		 */
> > >+#define MSG_ALLPKG	"All packages need to have online CPU to create TD. Online CPU and retry.\n"
> > >+		pr_warn_ratelimited(MSG_ALLPKG);
> > >+		goto free_packages;
> > >+	}
> > >+
> > >+	/*
> > >+	 * Acquire global lock to avoid TDX_OPERAND_BUSY:
> > >+	 * TDH.MNG.CREATE and other APIs try to lock the global Key Owner
> > >+	 * Table (KOT) to track the assigned TDX private HKID.  It doesn't spin
> > >+	 * to acquire the lock, returns TDX_OPERAND_BUSY instead, and let the
> > >+	 * caller to handle the contention.  This is because of time limitation
> > >+	 * usable inside the TDX module and OS/VMM knows better about process
> > >+	 * scheduling.
> > >+	 *
> > >+	 * APIs to acquire the lock of KOT:
> > >+	 * TDH.MNG.CREATE, TDH.MNG.KEY.FREEID, TDH.MNG.VPFLUSHDONE, and
> > >+	 * TDH.PHYMEM.CACHE.WB.
> > >+	 */
> > >+	mutex_lock(&tdx_lock);
> > >+	err = tdh_mng_create(tdr_pa, kvm_tdx->hkid);
> > >+	mutex_unlock(&tdx_lock);
> > >+	if (err == TDX_RND_NO_ENTROPY) {
> > >+		ret = -EAGAIN;
> > >+		goto free_packages;
> > >+	}
> > >+	if (WARN_ON_ONCE(err)) {
> > >+		pr_tdx_error(TDH_MNG_CREATE, err, NULL);
> > >+		ret = -EIO;
> > >+		goto free_packages;
> > >+	}
> > >+	kvm_tdx->tdr_pa = tdr_pa;
> > >+
> > >+	for_each_online_cpu(i) {
> > >+		int pkg = topology_physical_package_id(i);
> > >+
> > >+		if (cpumask_test_and_set_cpu(pkg, packages))
> > >+			continue;
> > >+
> > >+		/*
> > >+		 * Program the memory controller in the package with an
> > >+		 * encryption key associated to a TDX private host key id
> > >+		 * assigned to this TDR.  Concurrent operations on same memory
> > >+		 * controller results in TDX_OPERAND_BUSY.  Avoid this race by
> > >+		 * mutex.
> > >+		 */
> > >+		mutex_lock(&tdx_mng_key_config_lock[pkg]);
> >
> > the lock is superfluous to me. with cpu lock held, even if multiple CPUs try to
> > create TDs, the same set of CPUs (the first online CPU of each package) will be
> > selected to configure the key because of the cpumask_test_and_set_cpu() above.
> > it means, we never have two CPUs in the same socket trying to program the key,
> > i.e., no concurrent calls.
>
> Makes sense. Will drop the lock.

I don't get the point: the variable "packages" is on the stack, and it's
possible that "i" is the same for 2 threads which are trying to create a TD.
Anything I missed?

>
>
> > >+		ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
> > >+				      &kvm_tdx->tdr_pa, true);
> > >+		mutex_unlock(&tdx_mng_key_config_lock[pkg]);
> > >+		if (ret)
> > >+			break;
> > >+	}
> > >+	cpus_read_unlock();
> > >+	free_cpumask_var(packages);
> > >+	if (ret) {
> > >+		i = 0;
> > >+		goto teardown;
> > >+	}
> > >+
> > >+	kvm_tdx->tdcs_pa = tdcs_pa;
> > >+	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
> > >+		err = tdh_mng_addcx(kvm_tdx->tdr_pa, tdcs_pa[i]);
> > >+		if (err == TDX_RND_NO_ENTROPY) {
> > >+			/* Here it's hard to allow userspace to retry. */
> > >+			ret = -EBUSY;
> > >+			goto teardown;
> > >+		}
> > >+		if (WARN_ON_ONCE(err)) {
> > >+			pr_tdx_error(TDH_MNG_ADDCX, err, NULL);
> > >+			ret = -EIO;
> > >+			goto teardown;
> > >+		}
> > >+	}
> > >+
> > >+	/*
> > >+	 * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires a dedicated
> > >+	 * ioctl() to define the configure CPUID values for the TD.
> > >+	 */
> > >+	return 0;
> > >+
> > >+	/*
> > >+	 * The sequence for freeing resources from a partially initialized TD
> > >+	 * varies based on where in the initialization flow failure occurred.
> > >+	 * Simply use the full teardown and destroy, which naturally play nice
> > >+	 * with partial initialization.
> > >+	 */
> > >+teardown:
> > >+	for (; i < tdx_info->nr_tdcs_pages; i++) {
> > >+		if (tdcs_pa[i]) {
> > >+			free_page((unsigned long)__va(tdcs_pa[i]));
> > >+			tdcs_pa[i] = 0;
> > >+		}
> > >+	}
> > >+	if (!kvm_tdx->tdcs_pa)
> > >+		kfree(tdcs_pa);
> > >+	tdx_mmu_release_hkid(kvm);
> > >+	tdx_vm_free(kvm);
> > >+	return ret;
> > >+
> > >+free_packages:
> > >+	cpus_read_unlock();
> > >+	free_cpumask_var(packages);
> > >+free_tdcs:
> > >+	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
> > >+		if (tdcs_pa[i])
> > >+			free_page((unsigned long)__va(tdcs_pa[i]));
> > >+	}
> > >+	kfree(tdcs_pa);
> > >+	kvm_tdx->tdcs_pa = NULL;
> > >+
> > >+free_tdr:
> > >+	if (tdr_pa)
> > >+		free_page((unsigned long)__va(tdr_pa));
> > >+	kvm_tdx->tdr_pa = 0;
> > >+free_hkid:
> > >+	if (is_hkid_assigned(kvm_tdx))
> >
> > IIUC, this is always true because you just return if keyid
> > allocation fails.
>
> You're right. Will fix
> --
> Isaku Yamahata <isaku.yamahata@intel.com>
>
Yuan Yao March 22, 2024, 5:32 a.m. UTC | #5
On Fri, Mar 22, 2024 at 11:46:41AM +0800, Yuan Yao wrote:
> On Thu, Mar 21, 2024 at 07:17:09AM -0700, Isaku Yamahata wrote:
> > On Wed, Mar 20, 2024 at 01:12:01PM +0800,
> > Chao Gao <chao.gao@intel.com> wrote:
...
> > > >+static int __tdx_td_init(struct kvm *kvm)
> > > >+{
> > > >+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> > > >+	cpumask_var_t packages;
> > > >+	unsigned long *tdcs_pa = NULL;
> > > >+	unsigned long tdr_pa = 0;
> > > >+	unsigned long va;
> > > >+	int ret, i;
> > > >+	u64 err;
> > > >+
> > > >+	ret = tdx_guest_keyid_alloc();
> > > >+	if (ret < 0)
> > > >+		return ret;
> > > >+	kvm_tdx->hkid = ret;
> > > >+
> > > >+	va = __get_free_page(GFP_KERNEL_ACCOUNT);
> > > >+	if (!va)
> > > >+		goto free_hkid;
> > > >+	tdr_pa = __pa(va);
> > > >+
> > > >+	tdcs_pa = kcalloc(tdx_info->nr_tdcs_pages, sizeof(*kvm_tdx->tdcs_pa),
> > > >+			  GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> > > >+	if (!tdcs_pa)
> > > >+		goto free_tdr;
> > > >+	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
> > > >+		va = __get_free_page(GFP_KERNEL_ACCOUNT);
> > > >+		if (!va)
> > > >+			goto free_tdcs;
> > > >+		tdcs_pa[i] = __pa(va);
> > > >+	}
> > > >+
> > > >+	if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) {
> > > >+		ret = -ENOMEM;
> > > >+		goto free_tdcs;
> > > >+	}
> > > >+	cpus_read_lock();
> > > >+	/*
> > > >+	 * Need at least one CPU of the package to be online in order to
> > > >+	 * program all packages for host key id.  Check it.
> > > >+	 */
> > > >+	for_each_present_cpu(i)
> > > >+		cpumask_set_cpu(topology_physical_package_id(i), packages);
> > > >+	for_each_online_cpu(i)
> > > >+		cpumask_clear_cpu(topology_physical_package_id(i), packages);
> > > >+	if (!cpumask_empty(packages)) {
> > > >+		ret = -EIO;
> > > >+		/*
> > > >+		 * Because it's hard for human operator to figure out the
> > > >+		 * reason, warn it.
> > > >+		 */
> > > >+#define MSG_ALLPKG	"All packages need to have online CPU to create TD. Online CPU and retry.\n"
> > > >+		pr_warn_ratelimited(MSG_ALLPKG);
> > > >+		goto free_packages;
> > > >+	}
> > > >+
> > > >+	/*
> > > >+	 * Acquire global lock to avoid TDX_OPERAND_BUSY:
> > > >+	 * TDH.MNG.CREATE and other APIs try to lock the global Key Owner
> > > >+	 * Table (KOT) to track the assigned TDX private HKID.  It doesn't spin
> > > >+	 * to acquire the lock, returns TDX_OPERAND_BUSY instead, and let the
> > > >+	 * caller to handle the contention.  This is because of time limitation
> > > >+	 * usable inside the TDX module and OS/VMM knows better about process
> > > >+	 * scheduling.
> > > >+	 *
> > > >+	 * APIs to acquire the lock of KOT:
> > > >+	 * TDH.MNG.CREATE, TDH.MNG.KEY.FREEID, TDH.MNG.VPFLUSHDONE, and
> > > >+	 * TDH.PHYMEM.CACHE.WB.
> > > >+	 */
> > > >+	mutex_lock(&tdx_lock);
> > > >+	err = tdh_mng_create(tdr_pa, kvm_tdx->hkid);
> > > >+	mutex_unlock(&tdx_lock);
> > > >+	if (err == TDX_RND_NO_ENTROPY) {
> > > >+		ret = -EAGAIN;
> > > >+		goto free_packages;
> > > >+	}
> > > >+	if (WARN_ON_ONCE(err)) {
> > > >+		pr_tdx_error(TDH_MNG_CREATE, err, NULL);
> > > >+		ret = -EIO;
> > > >+		goto free_packages;
> > > >+	}
> > > >+	kvm_tdx->tdr_pa = tdr_pa;
> > > >+
> > > >+	for_each_online_cpu(i) {
> > > >+		int pkg = topology_physical_package_id(i);
> > > >+
> > > >+		if (cpumask_test_and_set_cpu(pkg, packages))
> > > >+			continue;
> > > >+
> > > >+		/*
> > > >+		 * Program the memory controller in the package with an
> > > >+		 * encryption key associated to a TDX private host key id
> > > >+		 * assigned to this TDR.  Concurrent operations on same memory
> > > >+		 * controller results in TDX_OPERAND_BUSY.  Avoid this race by
> > > >+		 * mutex.
> > > >+		 */
> > > >+		mutex_lock(&tdx_mng_key_config_lock[pkg]);
> > >
> > > the lock is superfluous to me. with cpu lock held, even if multiple CPUs try to
> > > create TDs, the same set of CPUs (the first online CPU of each package) will be
> > > selected to configure the key because of the cpumask_test_and_set_cpu() above.
> > > it means, we never have two CPUs in the same socket trying to program the key,
> > > i.e., no concurrent calls.
> >
> > Makes sense. Will drop the lock.
>
> Not get the point, the variable "packages" on stack, and it's
> possible that "i" is same for 2 threads which are trying to create td.
> Anything I missed ?

Got the point after syncing with Chao.
In the case of using for_each_online_cpu() it's safe to remove the mutex_lock(&tdx_mng_key_config_lock[pkg]),
since every thread will select only 1 CPU for each socket, in the same order, and requests submitted
to the same CPU by smp_call_on_cpu() are ordered on the target CPU. That means removing the lock works when
using for_each_online_cpu() but does NOT work when randomly picking a CPU per socket.

Maybe it's just me not realizing what's going on here, but
I think it is still worth adding a comment here on why it works/does not work.

>
> >
> >
> > > >+		ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
> > > >+				      &kvm_tdx->tdr_pa, true);
> > > >+		mutex_unlock(&tdx_mng_key_config_lock[pkg]);
> > > >+		if (ret)
> > > >+			break;
> > > >+	}
> > > >+	cpus_read_unlock();
> > > >+	free_cpumask_var(packages);
> > > >+	if (ret) {
> > > >+		i = 0;
> > > >+		goto teardown;
> > > >+	}
> > > >+
> > > >+	kvm_tdx->tdcs_pa = tdcs_pa;
> > > >+	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
> > > >+		err = tdh_mng_addcx(kvm_tdx->tdr_pa, tdcs_pa[i]);
> > > >+		if (err == TDX_RND_NO_ENTROPY) {
> > > >+			/* Here it's hard to allow userspace to retry. */
> > > >+			ret = -EBUSY;
> > > >+			goto teardown;
> > > >+		}
> > > >+		if (WARN_ON_ONCE(err)) {
> > > >+			pr_tdx_error(TDH_MNG_ADDCX, err, NULL);
> > > >+			ret = -EIO;
> > > >+			goto teardown;
> > > >+		}
> > > >+	}
> > > >+
> > > >+	/*
> > > >+	 * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires a dedicated
> > > >+	 * ioctl() to define the configure CPUID values for the TD.
> > > >+	 */
> > > >+	return 0;
> > > >+
> > > >+	/*
> > > >+	 * The sequence for freeing resources from a partially initialized TD
> > > >+	 * varies based on where in the initialization flow failure occurred.
> > > >+	 * Simply use the full teardown and destroy, which naturally play nice
> > > >+	 * with partial initialization.
> > > >+	 */
> > > >+teardown:
> > > >+	for (; i < tdx_info->nr_tdcs_pages; i++) {
> > > >+		if (tdcs_pa[i]) {
> > > >+			free_page((unsigned long)__va(tdcs_pa[i]));
> > > >+			tdcs_pa[i] = 0;
> > > >+		}
> > > >+	}
> > > >+	if (!kvm_tdx->tdcs_pa)
> > > >+		kfree(tdcs_pa);
> > > >+	tdx_mmu_release_hkid(kvm);
> > > >+	tdx_vm_free(kvm);
> > > >+	return ret;
> > > >+
> > > >+free_packages:
> > > >+	cpus_read_unlock();
> > > >+	free_cpumask_var(packages);
> > > >+free_tdcs:
> > > >+	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
> > > >+		if (tdcs_pa[i])
> > > >+			free_page((unsigned long)__va(tdcs_pa[i]));
> > > >+	}
> > > >+	kfree(tdcs_pa);
> > > >+	kvm_tdx->tdcs_pa = NULL;
> > > >+
> > > >+free_tdr:
> > > >+	if (tdr_pa)
> > > >+		free_page((unsigned long)__va(tdr_pa));
> > > >+	kvm_tdx->tdr_pa = 0;
> > > >+free_hkid:
> > > >+	if (is_hkid_assigned(kvm_tdx))
> > >
> > > IIUC, this is always true because you just return if keyid
> > > allocation fails.
> >
> > You're right. Will fix
> > --
> > Isaku Yamahata <isaku.yamahata@intel.com>
> >
>
Isaku Yamahata March 22, 2024, 11:44 p.m. UTC | #6
On Fri, Mar 22, 2024 at 01:32:15PM +0800,
Yuan Yao <yuan.yao@linux.intel.com> wrote:

> On Fri, Mar 22, 2024 at 11:46:41AM +0800, Yuan Yao wrote:
> > On Thu, Mar 21, 2024 at 07:17:09AM -0700, Isaku Yamahata wrote:
> > > On Wed, Mar 20, 2024 at 01:12:01PM +0800,
> > > Chao Gao <chao.gao@intel.com> wrote:
> ...
> > > > >+static int __tdx_td_init(struct kvm *kvm)
> > > > >+{
> > > > >+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> > > > >+	cpumask_var_t packages;
> > > > >+	unsigned long *tdcs_pa = NULL;
> > > > >+	unsigned long tdr_pa = 0;
> > > > >+	unsigned long va;
> > > > >+	int ret, i;
> > > > >+	u64 err;
> > > > >+
> > > > >+	ret = tdx_guest_keyid_alloc();
> > > > >+	if (ret < 0)
> > > > >+		return ret;
> > > > >+	kvm_tdx->hkid = ret;
> > > > >+
> > > > >+	va = __get_free_page(GFP_KERNEL_ACCOUNT);
> > > > >+	if (!va)
> > > > >+		goto free_hkid;
> > > > >+	tdr_pa = __pa(va);
> > > > >+
> > > > >+	tdcs_pa = kcalloc(tdx_info->nr_tdcs_pages, sizeof(*kvm_tdx->tdcs_pa),
> > > > >+			  GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> > > > >+	if (!tdcs_pa)
> > > > >+		goto free_tdr;
> > > > >+	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
> > > > >+		va = __get_free_page(GFP_KERNEL_ACCOUNT);
> > > > >+		if (!va)
> > > > >+			goto free_tdcs;
> > > > >+		tdcs_pa[i] = __pa(va);
> > > > >+	}
> > > > >+
> > > > >+	if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) {
> > > > >+		ret = -ENOMEM;
> > > > >+		goto free_tdcs;
> > > > >+	}
> > > > >+	cpus_read_lock();
> > > > >+	/*
> > > > >+	 * Need at least one CPU of the package to be online in order to
> > > > >+	 * program all packages for host key id.  Check it.
> > > > >+	 */
> > > > >+	for_each_present_cpu(i)
> > > > >+		cpumask_set_cpu(topology_physical_package_id(i), packages);
> > > > >+	for_each_online_cpu(i)
> > > > >+		cpumask_clear_cpu(topology_physical_package_id(i), packages);
> > > > >+	if (!cpumask_empty(packages)) {
> > > > >+		ret = -EIO;
> > > > >+		/*
> > > > >+		 * Because it's hard for human operator to figure out the
> > > > >+		 * reason, warn it.
> > > > >+		 */
> > > > >+#define MSG_ALLPKG	"All packages need to have online CPU to create TD. Online CPU and retry.\n"
> > > > >+		pr_warn_ratelimited(MSG_ALLPKG);
> > > > >+		goto free_packages;
> > > > >+	}
> > > > >+
> > > > >+	/*
> > > > >+	 * Acquire global lock to avoid TDX_OPERAND_BUSY:
> > > > >+	 * TDH.MNG.CREATE and other APIs try to lock the global Key Owner
> > > > >+	 * Table (KOT) to track the assigned TDX private HKID.  It doesn't spin
> > > > >+	 * to acquire the lock, returns TDX_OPERAND_BUSY instead, and let the
> > > > >+	 * caller to handle the contention.  This is because of time limitation
> > > > >+	 * usable inside the TDX module and OS/VMM knows better about process
> > > > >+	 * scheduling.
> > > > >+	 *
> > > > >+	 * APIs to acquire the lock of KOT:
> > > > >+	 * TDH.MNG.CREATE, TDH.MNG.KEY.FREEID, TDH.MNG.VPFLUSHDONE, and
> > > > >+	 * TDH.PHYMEM.CACHE.WB.
> > > > >+	 */
> > > > >+	mutex_lock(&tdx_lock);
> > > > >+	err = tdh_mng_create(tdr_pa, kvm_tdx->hkid);
> > > > >+	mutex_unlock(&tdx_lock);
> > > > >+	if (err == TDX_RND_NO_ENTROPY) {
> > > > >+		ret = -EAGAIN;
> > > > >+		goto free_packages;
> > > > >+	}
> > > > >+	if (WARN_ON_ONCE(err)) {
> > > > >+		pr_tdx_error(TDH_MNG_CREATE, err, NULL);
> > > > >+		ret = -EIO;
> > > > >+		goto free_packages;
> > > > >+	}
> > > > >+	kvm_tdx->tdr_pa = tdr_pa;
> > > > >+
> > > > >+	for_each_online_cpu(i) {
> > > > >+		int pkg = topology_physical_package_id(i);
> > > > >+
> > > > >+		if (cpumask_test_and_set_cpu(pkg, packages))
> > > > >+			continue;
> > > > >+
> > > > >+		/*
> > > > >+		 * Program the memory controller in the package with an
> > > > >+		 * encryption key associated to a TDX private host key id
> > > > >+		 * assigned to this TDR.  Concurrent operations on same memory
> > > > >+		 * controller results in TDX_OPERAND_BUSY.  Avoid this race by
> > > > >+		 * mutex.
> > > > >+		 */
> > > > >+		mutex_lock(&tdx_mng_key_config_lock[pkg]);
> > > >
> > > > the lock is superfluous to me. with cpu lock held, even if multiple CPUs try to
> > > > create TDs, the same set of CPUs (the first online CPU of each package) will be
> > > > selected to configure the key because of the cpumask_test_and_set_cpu() above.
> > > > it means, we never have two CPUs in the same socket trying to program the key,
> > > > i.e., no concurrent calls.
> > >
> > > Makes sense. Will drop the lock.
> >
> > Not get the point, the variable "packages" on stack, and it's
> > possible that "i" is same for 2 threads which are trying to create td.
> > Anything I missed ?
> 
> Got the point after synced with chao.
> in case of using for_each_online_cpu() it's safe to remove the mutex_lock(&tdx_mng_key_config_lock[pkg]),
> since every thread will select only 1 cpu for each sockets in same order, and requests submited
> to same cpu by smp_call_on_cpu() are ordered on the target cpu. That means removing the lock works for
> using for_each_online_cpu() but does NOT work for randomly pick up a cpu per socket.
> 
> Maybe it's just my issue that doesn't realize what's going on here, but
> I think it still worth to give comment here for why it works/does not work.

It deserves a comment. Will add it.
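
For instance, a comment above the key-config loop along these lines (just a
suggestion capturing the discussion above, not the author's actual wording):

	/*
	 * No lock is needed to serialize TDH.MNG.KEY.CONFIG across concurrent
	 * TD creations: for_each_online_cpu() walks CPUs in the same order
	 * everywhere, so every creator picks the same first online CPU of
	 * each package, and calls queued to that CPU via smp_call_on_cpu()
	 * are serialized on the target CPU.  This would NOT hold if a random
	 * CPU of each package were picked instead.
	 */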
Isaku Yamahata March 23, 2024, 1:36 a.m. UTC | #7
On Fri, Mar 22, 2024 at 02:06:19PM +1300,
"Huang, Kai" <kai.huang@intel.com> wrote:

> Roughly checking the code, you have implemented many things including
> MNG.KEY.CONFIG staff.  It's worth to add some text here to give reviewer a
> rough idea what's going on here.
> 
> > 
> > Before tearing down private page tables, TDX requires some resources of the
> > guest TD to be destroyed (i.e. HKID must have been reclaimed, etc).  Add
> > mmu notifier release callback before tearing down private page tables for
> > it. >
> > Add vm_free() of kvm_x86_ops hook at the end of kvm_arch_destroy_vm()
> > because some per-VM TDX resources, e.g. TDR, need to be freed after other
> > TDX resources, e.g. HKID, were freed.
> 
> I think we should split the "adding callbacks' part out, given you have ...
> 
> 	9 files changed, 520 insertions(+), 8 deletions(-)
> 
> ... in this patch.
> 
> IMHO, >500 LOC change normally means there are too many things in this
> patch, thus hard to review, and we should split.
> 
> I think perhaps we can split this big patch to smaller pieces based on the
> steps, like we did for the init_tdx_module() function in the TDX host
> patchset??
> 
> (But I would like to hear from others too.)

Ok, how about these steps:
- TDR allocation/free
- allocate+configure/release HKID
- phymem cache wb
- TDCS allocation/free
- clearing page

520/5 = 104.  Want more steps?


> > +	if (WARN_ON_ONCE(err))
> > +		pr_tdx_error(TDH_PHYMEM_CACHE_WB, err, NULL);
> > +}
> 
> [snip]
> 
> I am stopping here, because I need to take a break.
> 
> Again I think we should split this patch, there are just too many things to
> review here.

Thank you so much for the review. Let me try to break this patch.
Binbin Wu March 25, 2024, 9:58 a.m. UTC | #8
On 2/26/2024 4:25 PM, isaku.yamahata@intel.com wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
>
> As the first step to create TDX guest, create/destroy VM struct.  Assign
> TDX private Host Key ID (HKID) to the TDX guest for memory encryption and
> allocate extra pages for the TDX guest. On destruction, free allocated
> pages, and HKID.
>
> Before tearing down private page tables, TDX requires some resources of the
> guest TD to be destroyed (i.e. HKID must have been reclaimed, etc).  Add
> mmu notifier release callback

It seems inaccurate to say "Add mmu notifier release callback", since the
interface already exists. This patch extends the cache flush function,
i.e., kvm_flush_shadow_all(), to do the TDX-specific work.

> before tearing down private page tables for
> it.
>
> Add vm_free() of kvm_x86_ops hook at the end of kvm_arch_destroy_vm()
> because some per-VM TDX resources, e.g. TDR, need to be freed after other
> TDX resources, e.g. HKID, were freed.
>
> Co-developed-by: Kai Huang <kai.huang@intel.com>
> Signed-off-by: Kai Huang <kai.huang@intel.com>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
>
> ---
> v19:
> - fix check error code of TDH.PHYMEM.PAGE.RECLAIM. RCX and TDR.
>
> v18:
> - Use TDH.SYS.RD() instead of struct tdsysinfo_struct.
> - Rename tdx_reclaim_td_page() to tdx_reclaim_control_page()
> - return -EAGAIN on TDX_RND_NO_ENTROPY of TDH.MNG.CREATE(), TDH.MNG.ADDCX()
> - fix comment to remove extra the.
> - use true instead of 1 for boolean.
> - remove an extra white line.
>
> v16:
> - Simplified tdx_reclaim_page()
> - Reorganize the locking of tdx_release_hkid(), and use smp_call_mask()
>    instead of smp_call_on_cpu() to hold spinlock to race with invalidation
>    on releasing guest memfd
>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>   arch/x86/include/asm/kvm-x86-ops.h |   2 +
>   arch/x86/include/asm/kvm_host.h    |   2 +
>   arch/x86/kvm/Kconfig               |   3 +-
>   arch/x86/kvm/mmu/mmu.c             |   7 +
>   arch/x86/kvm/vmx/main.c            |  26 +-
>   arch/x86/kvm/vmx/tdx.c             | 475 ++++++++++++++++++++++++++++-
>   arch/x86/kvm/vmx/tdx.h             |   6 +-
>   arch/x86/kvm/vmx/x86_ops.h         |   6 +
>   arch/x86/kvm/x86.c                 |   1 +
>   9 files changed, 520 insertions(+), 8 deletions(-)
>
[...]
> +
> +static void tdx_clear_page(unsigned long page_pa)
> +{
> +	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
> +	void *page = __va(page_pa);
> +	unsigned long i;
> +
> +	/*
> +	 * When re-assign one page from old keyid to a new keyid, MOVDIR64B is
> +	 * required to clear/write the page with new keyid to prevent integrity
> +	 * error when read on the page with new keyid.
> +	 *
> +	 * clflush doesn't flush cache with HKID set.  The cache line could be
> +	 * poisoned (even without MKTME-i), clear the poison bit.
> +	 */
> +	for (i = 0; i < PAGE_SIZE; i += 64)
> +		movdir64b(page + i, zero_page);
> +	/*
> +	 * MOVDIR64B store uses WC buffer.  Prevent following memory reads
> +	 * from seeing potentially poisoned cache.
> +	 */
> +	__mb();

Is __wmb() sufficient for this case?

> +}
> +
[...]

> +
> +static int tdx_do_tdh_mng_key_config(void *param)
> +{
> +	hpa_t *tdr_p = param;
> +	u64 err;
> +
> +	do {
> +		err = tdh_mng_key_config(*tdr_p);
> +
> +		/*
> +		 * If it failed to generate a random key, retry it because this
> +		 * is typically caused by an entropy error of the CPU's random

Here you say "typically", is there other cause and is it safe to loop on 
retry?

> +		 * number generator.
> +		 */
> +	} while (err == TDX_KEY_GENERATION_FAILED);
> +
> +	if (WARN_ON_ONCE(err)) {
> +		pr_tdx_error(TDH_MNG_KEY_CONFIG, err, NULL);
> +		return -EIO;
> +	}
> +
> +	return 0;
> +}
> +
[...]
Isaku Yamahata March 25, 2024, 9:48 p.m. UTC | #9
On Mon, Mar 25, 2024 at 05:58:47PM +0800,
Binbin Wu <binbin.wu@linux.intel.com> wrote:

> > +static void tdx_clear_page(unsigned long page_pa)
> > +{
> > +	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
> > +	void *page = __va(page_pa);
> > +	unsigned long i;
> > +
> > +	/*
> > +	 * When re-assign one page from old keyid to a new keyid, MOVDIR64B is
> > +	 * required to clear/write the page with new keyid to prevent integrity
> > +	 * error when read on the page with new keyid.
> > +	 *
> > +	 * clflush doesn't flush cache with HKID set.  The cache line could be
> > +	 * poisoned (even without MKTME-i), clear the poison bit.
> > +	 */
> > +	for (i = 0; i < PAGE_SIZE; i += 64)
> > +		movdir64b(page + i, zero_page);
> > +	/*
> > +	 * MOVDIR64B store uses WC buffer.  Prevent following memory reads
> > +	 * from seeing potentially poisoned cache.
> > +	 */
> > +	__mb();
> 
> Is __wmb() sufficient for this case?

I don't think so, because sfence only orders against other stores.  Here we care
about subsequent loads.

> > +
> > +static int tdx_do_tdh_mng_key_config(void *param)
> > +{
> > +	hpa_t *tdr_p = param;
> > +	u64 err;
> > +
> > +	do {
> > +		err = tdh_mng_key_config(*tdr_p);
> > +
> > +		/*
> > +		 * If it failed to generate a random key, retry it because this
> > +		 * is typically caused by an entropy error of the CPU's random
> 
> Here you say "typically", is there other cause and is it safe to loop on
> retry?


Not as far as I know.  The TDX module returns KEY_GENERATION_FAILED only when
rdrnd (or equivalent) failed.  But I don't know about the future.

Let's delete "typically" because it seems confusing.
Kai Huang March 26, 2024, 1:43 a.m. UTC | #10
... continue the previous review ...

> +
> +static void tdx_reclaim_control_page(unsigned long td_page_pa)
> +{
> +	WARN_ON_ONCE(!td_page_pa);

 From the name 'td_page_pa' we cannot tell whether it is a control page, 
but this function is only intended for control page AFAICT, so perhaps a 
more specific name.

> +
> +	/*
> +	 * TDCX are being reclaimed.  TDX module maps TDCX with HKID

"are" -> "is".

Are you sure it is TDCX, but not TDCS?

AFAICT TDCX is the control structure for 'vcpu', but here you are 
handling the control structure for the VM.

> +	 * assigned to the TD.  Here the cache associated to the TD
> +	 * was already flushed by TDH.PHYMEM.CACHE.WB before here, So
> +	 * cache doesn't need to be flushed again.
> +	 */

How about put this part as the comment for this function?

/*
  * Reclaim <name of control page> page(s) which are crypto-protected
  * by TDX guest's private KeyID.  Assume the cache associated with the
  * TDX private KeyID has been flushed.
  */
> +	if (tdx_reclaim_page(td_page_pa))
> +		/*
> +		 * Leak the page on failure:
> +		 * tdx_reclaim_page() returns an error if and only if there's an
> +		 * unexpected, fatal error, e.g. a SEAMCALL with bad params,
> +		 * incorrect concurrency in KVM, a TDX Module bug, etc.
> +		 * Retrying at a later point is highly unlikely to be
> +		 * successful.
> +		 * No log here as tdx_reclaim_page() already did.

IMHO can be simplified to below, and nothing else matters.

	/*
	 * Leak the page if the kernel failed to reclaim the page.
	 * The kernel cannot use it safely anymore.
	 */

And you can put this comment above the 'if (tdx_reclaim_page())' statement.
	
> +		 */
> +		return;

Empty line.

> +	free_page((unsigned long)__va(td_page_pa));
> +}
> +
> +static void tdx_do_tdh_phymem_cache_wb(void *unused)

Better to make the name explicit that it is a smp_func, and you don't 
need the "tdx_" prefix for all the 'static' functions here:

	static void smp_func_do_phymem_cache_wb(void *unused)

> +{
> +	u64 err = 0;
> +
> +	do {
> +		err = tdh_phymem_cache_wb(!!err);

		bool resume = !!err;

		err = tdh_phymem_cache_wb(resume);

So that we don't need to jump to the tdh_phymem_cache_wb() to see what 
does !!err mean.

> +	} while (err == TDX_INTERRUPTED_RESUMABLE);

Add a comment before the do {} while():

	/*
	 * TDH.PHYMEM.CACHE.WB flushes caches associated with _ANY_
	 * TDX private KeyID on the package (or logical cpu?) where
	 * it is called on.  The TDX module may not finish the cache
	 * flush but return TDX_INTERRUPTED_RESUMEABLE instead.  The
	 * kernel should retry it until it returns success w/o
	 * rescheduling.
	 */
> +
> +	/* Other thread may have done for us. */
> +	if (err == TDX_NO_HKID_READY_TO_WBCACHE)
> +		err = TDX_SUCCESS;

Empty line.

> +	if (WARN_ON_ONCE(err))
> +		pr_tdx_error(TDH_PHYMEM_CACHE_WB, err, NULL);
> +}
> +
> +void tdx_mmu_release_hkid(struct kvm *kvm)
> +{
> +	bool packages_allocated, targets_allocated;
> +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> +	cpumask_var_t packages, targets;
> +	u64 err;
> +	int i;
> +
> +	if (!is_hkid_assigned(kvm_tdx))
> +		return;
> +
> +	if (!is_td_created(kvm_tdx)) {
> +		tdx_hkid_free(kvm_tdx);
> +		return;
> +	}

I lost tracking what does "td_created()" mean.

I guess it means: KeyID has been allocated to the TDX guest, but not yet 
programmed/configured.

Perhaps add a comment to remind the reviewer?

> +
> +	packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
> +	targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
> +	cpus_read_lock();
> +
> +	/*
> +	 * We can destroy multiple guest TDs simultaneously.  Prevent
> +	 * tdh_phymem_cache_wb from returning TDX_BUSY by serialization.
> +	 */

IMHO it's better to remind people that TDH.PHYMEM.CACHE.WB tries to grab 
the global TDX module lock:

	/*
	 * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global
	 * lock and can fail with TDX_OPERAND_BUSY when it fails to
	 * grab.  Multiple TDX guests can be destroyed simultaneously.
	 * Take the mutex to prevent it from getting error.
	 */
> +	mutex_lock(&tdx_lock);
> +
> +	/*
> +	 * Go through multiple TDX HKID state transitions with three SEAMCALLs
> +	 * to make TDH.PHYMEM.PAGE.RECLAIM() usable. 


What is "TDX HKID state transitions"?  Not mentioned before, so needs 
explanation _if_ you want to say this.

And what are the three "SEAMCALLs"?  Where are they?  The only _two_ 
SEAMCALLs that I can see here are: TDH.PHYMEM.CACHE.WB and 
TDH.MNG.KEY.FREEID.

  Make the transition atomic
> +	 * to other functions to operate private pages and Secure-EPT pages.

What's the consequence to "other functions" if we don't make it atomic here?

> +	 *
> +	 * Avoid race for kvm_gmem_release() to call kvm_mmu_unmap_gfn_range().
> +	 * This function is called via mmu notifier, mmu_release().
> +	 * kvm_gmem_release() is called via fput() on process exit.
> +	 */
> +	write_lock(&kvm->mmu_lock);

I don't fully get the race here, but it seems strange that this function 
is called via mmu notifier.

IIUC, this function is "supposedly" only be called when we tear down the 
VM, so I don't know why there's such race.

> +
> +	for_each_online_cpu(i) {
> +		if (packages_allocated &&
> +		    cpumask_test_and_set_cpu(topology_physical_package_id(i),
> +					     packages))
> +			continue;
> +		if (targets_allocated)
> +			cpumask_set_cpu(i, targets);
> +	}
> +	if (targets_allocated)
> +		on_each_cpu_mask(targets, tdx_do_tdh_phymem_cache_wb, NULL, true);
> +	else
> +		on_each_cpu(tdx_do_tdh_phymem_cache_wb, NULL, true);

I don't understand the logic here -- no comments whatever.

But I am 99% sure the logic here could be simplified.

> +	/*
> +	 * In the case of error in tdx_do_tdh_phymem_cache_wb(), the following
> +	 * tdh_mng_key_freeid() will fail.
> +	 */
> +	err = tdh_mng_key_freeid(kvm_tdx->tdr_pa);
> +	if (WARN_ON_ONCE(err)) {

I see KVM_BUG_ON() is normally used for SEAMCALL error.  Why this uses 
WARN_ON_ONCE() here?

> +		pr_tdx_error(TDH_MNG_KEY_FREEID, err, NULL);
> +		pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
> +		       kvm_tdx->hkid);
> +	} else
> +		tdx_hkid_free(kvm_tdx);
> +
> +	write_unlock(&kvm->mmu_lock);
> +	mutex_unlock(&tdx_lock);
> +	cpus_read_unlock();
> +	free_cpumask_var(targets);
> +	free_cpumask_var(packages);
> +}
> +
> +void tdx_vm_free(struct kvm *kvm)
> +{
> +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> +	u64 err;
> +	int i;
> +
> +	/*
> +	 * tdx_mmu_release_hkid() failed to reclaim HKID.  Something went wrong
> +	 * heavily with TDX module.  Give up freeing TD pages.  As the function
> +	 * already warned, don't warn it again.
> +	 */
> +	if (is_hkid_assigned(kvm_tdx))
> +		return;
> +
> +	if (kvm_tdx->tdcs_pa) {
> +		for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
> +			if (kvm_tdx->tdcs_pa[i])
> +				tdx_reclaim_control_page(kvm_tdx->tdcs_pa[i]);

AFAICT, here tdcs_pa[i] cannot be NULL, right?  How about:

			if (!WARN_ON_ONCE(!kvm_tdx->tdcs_pa[i]))
				continue;
			
			tdx_reclaim_control_page(...);

which at least saves you some indent.

Btw, does it make sense to stop if any tdx_reclaim_control_page() fails?

It's OK to continue, but perhaps worth to add a comment to point out:

			/*
			 * Continue to reclaim other control pages and
			 * TDR page, even failed to reclaim one control
			 * page.  Do the best to reclaim these TDX
			 * private pages.
			 */
			tdx_reclaim_control_page();
> +		}
> +		kfree(kvm_tdx->tdcs_pa);
> +		kvm_tdx->tdcs_pa = NULL;
> +	}
> +
> +	if (!kvm_tdx->tdr_pa)
> +		return;
> +	if (__tdx_reclaim_page(kvm_tdx->tdr_pa))
> +		return;
> +	/*
> +	 * TDX module maps TDR with TDX global HKID.  TDX module may access TDR
> +	 * while operating on TD (Especially reclaiming TDCS).  Cache flush with > +	 * TDX global HKID is needed.
> +	 */

"Especially reclaiming TDCS" -> "especially when it is reclaiming TDCS".

Use imperative mode to describe your change:

Use the SEAMCALL to ask the TDX module to flush the cache of it using 
the global KeyID.

> +	err = tdh_phymem_page_wbinvd(set_hkid_to_hpa(kvm_tdx->tdr_pa,
> +						     tdx_global_keyid));
> +	if (WARN_ON_ONCE(err)) {

Again, KVM_BUG_ON()?

Should't matter, though.

> +		pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err, NULL);
> +		return;
> +	}
> +	tdx_clear_page(kvm_tdx->tdr_pa);
> +
> +	free_page((unsigned long)__va(kvm_tdx->tdr_pa));
> +	kvm_tdx->tdr_pa = 0;
> +}
> +
> +static int tdx_do_tdh_mng_key_config(void *param)
> +{
> +	hpa_t *tdr_p = param;
> +	u64 err;
> +
> +	do {
> +		err = tdh_mng_key_config(*tdr_p);
> +
> +		/*
> +		 * If it failed to generate a random key, retry it because this
> +		 * is typically caused by an entropy error of the CPU's random
> +		 * number generator.
> +		 */
> +	} while (err == TDX_KEY_GENERATION_FAILED);

If you want to handle TDX_KEY_GENERTION_FAILED, it's better to have a 
retry limit similar to the TDX host code does.

> +
> +	if (WARN_ON_ONCE(err)) {

KVM_BUG_ON()?

> +		pr_tdx_error(TDH_MNG_KEY_CONFIG, err, NULL);
> +		return -EIO;
> +	}
> +
> +	return 0;
> +}
> +
> +static int __tdx_td_init(struct kvm *kvm);
> +
> +int tdx_vm_init(struct kvm *kvm)
> +{
> +	/*
> +	 * TDX has its own limit of the number of vcpus in addition to
> +	 * KVM_MAX_VCPUS.
> +	 */
> +	kvm->max_vcpus = min(kvm->max_vcpus, TDX_MAX_VCPUS);

I believe this should be part of the patch that handles KVM_CAP_MAX_VCPUS.

> +
> +	/* Place holder for TDX specific logic. */
> +	return __tdx_td_init(kvm);
> +}
> +

... to be continued ...
Isaku Yamahata March 27, 2024, 10:53 p.m. UTC | #11
On Tue, Mar 26, 2024 at 02:43:54PM +1300,
"Huang, Kai" <kai.huang@intel.com> wrote:

> ... continue the previous review ...
> 
> > +
> > +static void tdx_reclaim_control_page(unsigned long td_page_pa)
> > +{
> > +	WARN_ON_ONCE(!td_page_pa);
> 
> From the name 'td_page_pa' we cannot tell whether it is a control page, but
> this function is only intended for control page AFAICT, so perhaps a more
> specific name.
> 
> > +
> > +	/*
> > +	 * TDCX are being reclaimed.  TDX module maps TDCX with HKID
> 
> "are" -> "is".
> 
> Are you sure it is TDCX, but not TDCS?
> 
> AFAICT TDCX is the control structure for 'vcpu', but here you are handling
> the control structure for the VM.

TDCS, TDVPR, and TDCX.  Will update the comment.

> 
> > +	 * assigned to the TD.  Here the cache associated to the TD
> > +	 * was already flushed by TDH.PHYMEM.CACHE.WB before here, So
> > +	 * cache doesn't need to be flushed again.
> > +	 */
> 
> How about put this part as the comment for this function?
> 
> /*
>  * Reclaim <name of control page> page(s) which are crypto-protected
>  * by TDX guest's private KeyID.  Assume the cache associated with the
>  * TDX private KeyID has been flushed.
>  */
> > +	if (tdx_reclaim_page(td_page_pa))
> > +		/*
> > +		 * Leak the page on failure:
> > +		 * tdx_reclaim_page() returns an error if and only if there's an
> > +		 * unexpected, fatal error, e.g. a SEAMCALL with bad params,
> > +		 * incorrect concurrency in KVM, a TDX Module bug, etc.
> > +		 * Retrying at a later point is highly unlikely to be
> > +		 * successful.
> > +		 * No log here as tdx_reclaim_page() already did.
> 
> IMHO can be simplified to below, and nothing else matters.
> 
> 	/*
> 	 * Leak the page if the kernel failed to reclaim the page.
> 	 * The krenel cannot use it safely anymore.
> 	 */
> 
> And you can put this comment above the 'if (tdx_reclaim_page())' statement.

Sure.


> > +		 */
> > +		return;
> 
> Empty line.
> 
> > +	free_page((unsigned long)__va(td_page_pa));
> > +}
> > +
> > +static void tdx_do_tdh_phymem_cache_wb(void *unused)
> 
> Better to make the name explicit that it is a smp_func, and you don't need
> the "tdx_" prefix for all the 'static' functions here:
> 
> 	static void smp_func_do_phymem_cache_wb(void *unused)

Ok, will rename it.


> > +{
> > +	u64 err = 0;
> > +
> > +	do {
> > +		err = tdh_phymem_cache_wb(!!err);
> 
> 		bool resume = !!err;
> 
> 		err = tdh_phymem_cache_wb(resume);
> 
> So that we don't need to jump to the tdh_phymem_cache_wb() to see what does
> !!err mean.

Ok.


> > +	} while (err == TDX_INTERRUPTED_RESUMABLE);
> 
> Add a comment before the do {} while():
> 
> 	/*
> 	 * TDH.PHYMEM.CACHE.WB flushes caches associated with _ANY_
> 	 * TDX private KeyID on the package (or logical cpu?) where
> 	 * it is called on.  The TDX module may not finish the cache
> 	 * flush but return TDX_INTERRUPTED_RESUMEABLE instead.  The
> 	 * kernel should retry it until it returns success w/o
> 	 * rescheduling.
> 	 */

Ok.


> > +
> > +	/* Other thread may have done for us. */
> > +	if (err == TDX_NO_HKID_READY_TO_WBCACHE)
> > +		err = TDX_SUCCESS;
> 
> Empty line.
> 
> > +	if (WARN_ON_ONCE(err))
> > +		pr_tdx_error(TDH_PHYMEM_CACHE_WB, err, NULL);
> > +}
> > +
> > +void tdx_mmu_release_hkid(struct kvm *kvm)
> > +{
> > +	bool packages_allocated, targets_allocated;
> > +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> > +	cpumask_var_t packages, targets;
> > +	u64 err;
> > +	int i;
> > +
> > +	if (!is_hkid_assigned(kvm_tdx))
> > +		return;
> > +
> > +	if (!is_td_created(kvm_tdx)) {
> > +		tdx_hkid_free(kvm_tdx);
> > +		return;
> > +	}
> 
> I lost tracking what does "td_created()" mean.
> 
> I guess it means: KeyID has been allocated to the TDX guest, but not yet
> programmed/configured.
> 
> Perhaps add a comment to remind the reviewer?

As Chao suggested, will introduce state machine for vm and vcpu.

https://lore.kernel.org/kvm/ZfvI8t7SlfIsxbmT@chao-email/

> > +
> > +	packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
> > +	targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
> > +	cpus_read_lock();
> > +
> > +	/*
> > +	 * We can destroy multiple guest TDs simultaneously.  Prevent
> > +	 * tdh_phymem_cache_wb from returning TDX_BUSY by serialization.
> > +	 */
> 
> IMHO it's better to remind people that TDH.PHYMEM.CACHE.WB tries to grab the
> global TDX module lock:
> 
> 	/*
> 	 * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global
> 	 * lock and can fail with TDX_OPERAND_BUSY when it fails to
> 	 * grab.  Multiple TDX guests can be destroyed simultaneously.
> 	 * Take the mutex to prevent it from getting error.
> 	 */
> > +	mutex_lock(&tdx_lock);
> > +
> > +	/*
> > +	 * Go through multiple TDX HKID state transitions with three SEAMCALLs
> > +	 * to make TDH.PHYMEM.PAGE.RECLAIM() usable.
> 
> 
> What is "TDX HKID state transitions"?  Not mentioned before, so needs
> explanation _if_ you want to say this.

Ok.
> And what are the three "SEAMCALLs"?  Where are they?  The only _two_
> SEAMCALLs that I can see here are: TDH.PHYMEM.CACHE.WB and
> TDH.MNG.KEY.FREEID.

tdh_mng_vpflushdone(). I'll list those three in the comment.  It may not map
cleanly to the HKID state machine, though.


> 
>  Make the transition atomic
> > +	 * to other functions to operate private pages and Secure-EPT pages.
> 
> What's the consequence to "other functions" if we don't make it atomic here?

Other threads can be removing pages from the TD.  If the HKID is freed, a
thread still in the loop removing pages can get an error.

TDH.MEM.SEPT.REMOVE(), TDH.MEM.PAGE.REMOVE() can fail with
TDX_OP_STATE_INCORRECT when HKID is not assigned.

When HKID is freed, we need to use TDH.PHYMEM.PAGE.RECLAIM().
TDH.PHYMEM.PAGE.RECLAIM() fails with TDX_LIFECYCLE_STATE_INCORRECT when
HKID isn't freed.

How about this?

/*
 * We need three SEAMCALLs, TDH.MNG.VPFLUSHDONE(), TDH.PHYMEM.CACHE.WB(), and
 * TDH.MNG.KEY.FREEID() to free the HKID.
 * Other threads can remove pages from TD.  When the HKID is assigned, we need
 * to use TDH.MEM.SEPT.REMOVE() or TDH.MEM.PAGE.REMOVE().
 * TDH.PHYMEM.PAGE.RECLAIM() is needed when the HKID is free.  Get lock to not
 * present transient state of HKID.
 */


> > +	 *
> > +	 * Avoid race for kvm_gmem_release() to call kvm_mmu_unmap_gfn_range().
> > +	 * This function is called via mmu notifier, mmu_release().
> > +	 * kvm_gmem_release() is called via fput() on process exit.
> > +	 */
> > +	write_lock(&kvm->mmu_lock);
> 
> I don't fully get the race here, but it seems strange that this function is
> called via mmu notifier.
> 
> IIUC, this function is "supposedly" only be called when we tear down the VM,

That's correct.  The hook when destroying the VM is the mmu notifier mmu_release().
It's called on behalf of mmput().  Because other components like vhost-net
can increment the mm reference, the mmu_release mmu notifier can be triggered by
a thread other than the guest VM's.


> so I don't know why there's such race.

When guest_memfd is released, the private memory is unmapped.
A thread of the guest VM can exit and close guest_memfd, while another
thread can trigger the mmu notifier of the guest VM.

Also, if we have multiple fds for the same guest_memfd, the last file closure
can be done in the context of the guest VM or of another process.


> > +
> > +	for_each_online_cpu(i) {
> > +		if (packages_allocated &&
> > +		    cpumask_test_and_set_cpu(topology_physical_package_id(i),
> > +					     packages))
> > +			continue;
> > +		if (targets_allocated)
> > +			cpumask_set_cpu(i, targets);
> > +	}
> > +	if (targets_allocated)
> > +		on_each_cpu_mask(targets, tdx_do_tdh_phymem_cache_wb, NULL, true);
> > +	else
> > +		on_each_cpu(tdx_do_tdh_phymem_cache_wb, NULL, true);
> 
> I don't understand the logic here -- no comments whatever.
> 
> But I am 99% sure the logic here could be simplified.

Yes, as Chao suggested, I'll use global variables for those cpumasks.
https://lore.kernel.org/kvm/ZfpwIespKy8qxWWE@chao-email/
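
Roughly this direction (untested sketch; tdx_wbcache_cpus is a made-up name,
smp_func_do_phymem_cache_wb() is the rename suggested above, and CPU hotplug
handling is omitted):

/* One CPU per package, populated once when TDX is initialized. */
static cpumask_var_t tdx_wbcache_cpus;

static void tdx_flush_td_caches(void)
{
	/*
	 * Flush caches associated with any TDX private KeyID by running
	 * TDH.PHYMEM.CACHE.WB on one CPU of each package.
	 */
	on_each_cpu_mask(tdx_wbcache_cpus, smp_func_do_phymem_cache_wb,
			 NULL, true);
}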


> > +	/*
> > +	 * In the case of error in tdx_do_tdh_phymem_cache_wb(), the following
> > +	 * tdh_mng_key_freeid() will fail.
> > +	 */
> > +	err = tdh_mng_key_freeid(kvm_tdx->tdr_pa);
> > +	if (WARN_ON_ONCE(err)) {
> 
> I see KVM_BUG_ON() is normally used for SEAMCALL error.  Why this uses
> WARN_ON_ONCE() here?

Because vm_free() hook is (one of) the final steps to free struct kvm.  No one
else touches this kvm.  Because it doesn't harm to use KVM_BUG_ON() here,
I'll change it for consistency.


> > +		pr_tdx_error(TDH_MNG_KEY_FREEID, err, NULL);
> > +		pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
> > +		       kvm_tdx->hkid);
> > +	} else
> > +		tdx_hkid_free(kvm_tdx);
> > +
> > +	write_unlock(&kvm->mmu_lock);
> > +	mutex_unlock(&tdx_lock);
> > +	cpus_read_unlock();
> > +	free_cpumask_var(targets);
> > +	free_cpumask_var(packages);
> > +}
> > +
> > +void tdx_vm_free(struct kvm *kvm)
> > +{
> > +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> > +	u64 err;
> > +	int i;
> > +
> > +	/*
> > +	 * tdx_mmu_release_hkid() failed to reclaim HKID.  Something went wrong
> > +	 * heavily with TDX module.  Give up freeing TD pages.  As the function
> > +	 * already warned, don't warn it again.
> > +	 */
> > +	if (is_hkid_assigned(kvm_tdx))
> > +		return;
> > +
> > +	if (kvm_tdx->tdcs_pa) {
> > +		for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
> > +			if (kvm_tdx->tdcs_pa[i])
> > +				tdx_reclaim_control_page(kvm_tdx->tdcs_pa[i]);
> 
> AFAICT, here tdcs_pa[i] cannot be NULL, right?  How about:

If tdh_mng_addcx() fails in the middle of 0 < i < nr_tdcs_pages, tdcs_pa[i] can
be NULL.  The current allocation/free code is unnecessarily convoluted. I'll
clean them up.
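
Something like this on the free side, i.e. keep the NULL check but without a
WARN, since NULL entries are expected after a mid-loop failure (sketch only):

	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
		/* Entries past a mid-loop allocation/ADDCX failure stay NULL. */
		if (!kvm_tdx->tdcs_pa[i])
			continue;

		tdx_reclaim_control_page(kvm_tdx->tdcs_pa[i]);
	}
	kfree(kvm_tdx->tdcs_pa);
	kvm_tdx->tdcs_pa = NULL;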


> 			if (!WARN_ON_ONCE(!kvm_tdx->tdcs_pa[i]))
> 				continue;
> 			
> 			tdx_reclaim_control_page(...);
> 
> which at least saves you some indent.
> 
> Btw, does it make sense to stop if any tdx_reclaim_control_page() fails?

It doesn't matter much in practice because it's unlikely to hit an error for
some of the TDCS pages.  So I chose to make it return void and skip the error
check in the caller.


> It's OK to continue, but perhaps worth to add a comment to point out:
> 
> 			/*
> 			 * Continue to reclaim other control pages and
> 			 * TDR page, even failed to reclaim one control
> 			 * page.  Do the best to reclaim these TDX
> 			 * private pages.
> 			 */
> 			tdx_reclaim_control_page();

Sure, it will make the intention clear.


> > +		}
> > +		kfree(kvm_tdx->tdcs_pa);
> > +		kvm_tdx->tdcs_pa = NULL;
> > +	}
> > +
> > +	if (!kvm_tdx->tdr_pa)
> > +		return;
> > +	if (__tdx_reclaim_page(kvm_tdx->tdr_pa))
> > +		return;
> > +	/*
> > +	 * TDX module maps TDR with TDX global HKID.  TDX module may access TDR
> > +	 * while operating on TD (Especially reclaiming TDCS).  Cache flush with > +	 * TDX global HKID is needed.
> > +	 */
> 
> "Especially reclaiming TDCS" -> "especially when it is reclaiming TDCS".
> 
> Use imperative mode to describe your change:
> 
> Use the SEAMCALL to ask the TDX module to flush the cache of it using the
> global KeyID.
> 
> > +	err = tdh_phymem_page_wbinvd(set_hkid_to_hpa(kvm_tdx->tdr_pa,
> > +						     tdx_global_keyid));
> > +	if (WARN_ON_ONCE(err)) {
> 
> Again, KVM_BUG_ON()?
> 
> Should't matter, though.

Ok, let's use KVM_BUG_ON() consistently.



> > +		pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err, NULL);
> > +		return;
> > +	}
> > +	tdx_clear_page(kvm_tdx->tdr_pa);
> > +
> > +	free_page((unsigned long)__va(kvm_tdx->tdr_pa));
> > +	kvm_tdx->tdr_pa = 0;
> > +}
> > +
> > +static int tdx_do_tdh_mng_key_config(void *param)
> > +{
> > +	hpa_t *tdr_p = param;
> > +	u64 err;
> > +
> > +	do {
> > +		err = tdh_mng_key_config(*tdr_p);
> > +
> > +		/*
> > +		 * If it failed to generate a random key, retry it because this
> > +		 * is typically caused by an entropy error of the CPU's random
> > +		 * number generator.
> > +		 */
> > +	} while (err == TDX_KEY_GENERATION_FAILED);
> 
> If you want to handle TDX_KEY_GENERTION_FAILED, it's better to have a retry
> limit similar to the TDX host code does.

Ok, although it would complicate the error recovery path, let me update it.
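
Maybe something like this (untested; the retry count is arbitrary, just for
illustration):

#define TDX_KEY_CONFIG_MAX_RETRIES	16	/* arbitrary limit */

static int tdx_do_tdh_mng_key_config(void *param)
{
	int retries = TDX_KEY_CONFIG_MAX_RETRIES;
	hpa_t *tdr_p = param;
	u64 err;

	do {
		err = tdh_mng_key_config(*tdr_p);
		/*
		 * Retry a bounded number of times on entropy errors from
		 * the CPU's random number generator.
		 */
	} while (err == TDX_KEY_GENERATION_FAILED && --retries > 0);

	if (WARN_ON_ONCE(err)) {
		pr_tdx_error(TDH_MNG_KEY_CONFIG, err, NULL);
		return -EIO;
	}

	return 0;
}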


> > +
> > +	if (WARN_ON_ONCE(err)) {
> 
> KVM_BUG_ON()?
> 
> > +		pr_tdx_error(TDH_MNG_KEY_CONFIG, err, NULL);
> > +		return -EIO;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static int __tdx_td_init(struct kvm *kvm);
> > +
> > +int tdx_vm_init(struct kvm *kvm)
> > +{
> > +	/*
> > +	 * TDX has its own limit of the number of vcpus in addition to
> > +	 * KVM_MAX_VCPUS.
> > +	 */
> > +	kvm->max_vcpus = min(kvm->max_vcpus, TDX_MAX_VCPUS);
> 
> I believe this should be part of the patch that handles KVM_CAP_MAX_VCPUS.

Ok.
Kai Huang March 27, 2024, 11:33 p.m. UTC | #12
... to continue the previous review ...

>   
> +static int __tdx_td_init(struct kvm *kvm)
> +{
> +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> +	cpumask_var_t packages;
> +	unsigned long *tdcs_pa = NULL;
> +	unsigned long tdr_pa = 0;
> +	unsigned long va;
> +	int ret, i;
> +	u64 err;
> +
> +	ret = tdx_guest_keyid_alloc();
> +	if (ret < 0)
> +		return ret;
> +	kvm_tdx->hkid = ret;
> +
> +	va = __get_free_page(GFP_KERNEL_ACCOUNT);
> +	if (!va)
> +		goto free_hkid;
> +	tdr_pa = __pa(va);
> +
> +	tdcs_pa = kcalloc(tdx_info->nr_tdcs_pages, sizeof(*kvm_tdx->tdcs_pa),
> +			  GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> +	if (!tdcs_pa)
> +		goto free_tdr;

Empty line.

> +	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
> +		va = __get_free_page(GFP_KERNEL_ACCOUNT);
> +		if (!va)
> +			goto free_tdcs;
> +		tdcs_pa[i] = __pa(va);
> +	}
> +
> +	if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) {
> +		ret = -ENOMEM;
> +		goto free_tdcs;
> +	}

Empty line.

> +	cpus_read_lock();
> +	/*
> +	 * Need at least one CPU of the package to be online in order to
> +	 * program all packages for host key id.  Check it.
> +	 */
> +	for_each_present_cpu(i)
> +		cpumask_set_cpu(topology_physical_package_id(i), packages);
> +	for_each_online_cpu(i)
> +		cpumask_clear_cpu(topology_physical_package_id(i), packages);
> +	if (!cpumask_empty(packages)) {
> +		ret = -EIO;
> +		/*
> +		 * Because it's hard for human operator to figure out the
> +		 * reason, warn it.
> +		 */
> +#define MSG_ALLPKG	"All packages need to have online CPU to create TD. Online CPU and retry.\n"
> +		pr_warn_ratelimited(MSG_ALLPKG);
> +		goto free_packages;
> +	}
> +
> +	/*
> +	 * Acquire global lock to avoid TDX_OPERAND_BUSY:
> +	 * TDH.MNG.CREATE and other APIs try to lock the global Key Owner
> +	 * Table (KOT) to track the assigned TDX private HKID.  It doesn't spin
> +	 * to acquire the lock, returns TDX_OPERAND_BUSY instead, and let the
> +	 * caller to handle the contention.  This is because of time limitation
> +	 * usable inside the TDX module and OS/VMM knows better about process
> +	 * scheduling.
> +	 *
> +	 * APIs to acquire the lock of KOT:
> +	 * TDH.MNG.CREATE, TDH.MNG.KEY.FREEID, TDH.MNG.VPFLUSHDONE, and
> +	 * TDH.PHYMEM.CACHE.WB.
> +	 */

Don't need to mention all SEAMCALLs here, but put a comment where
applicable, i.e., where they are used.

	/*
	 * TDH.MNG.CREATE tries to grab the global TDX module lock and fails
	 * with TDX_OPERAND_BUSY when it fails to grab.  Take the global
	 * lock to prevent it from failure.
	 */
> +	mutex_lock(&tdx_lock);
> +	err = tdh_mng_create(tdr_pa, kvm_tdx->hkid);
> +	mutex_unlock(&tdx_lock);

Empty line.

> +	if (err == TDX_RND_NO_ENTROPY) {
> +		ret = -EAGAIN;
> +		goto free_packages;
> +	}
> +	if (WARN_ON_ONCE(err)) {
> +		pr_tdx_error(TDH_MNG_CREATE, err, NULL);
> +		ret = -EIO;
> +		goto free_packages;
> +	}

I would prefer more empty lines.

> +	kvm_tdx->tdr_pa = tdr_pa;
> +
> +	for_each_online_cpu(i) {
> +		int pkg = topology_physical_package_id(i);
> +
> +		if (cpumask_test_and_set_cpu(pkg, packages))
> +			continue;
> +
> +		/*
> +		 * Program the memory controller in the package with an
> +		 * encryption key associated to a TDX private host key id
> +		 * assigned to this TDR.  Concurrent operations on same memory
> +		 * controller results in TDX_OPERAND_BUSY.  Avoid this race by
> +		 * mutex.
> +		 */

IIUC the race can only happen when you are creating multiple TDX guests
simultaneously?  Please clarify this in the comment.

And I even don't think you need all these TDX module details:

		/*
		 * Concurrent run of TDH.MNG.KEY.CONFIG on the same
		 * package results in TDX_OPERAND_BUSY.  When creating
		 * multiple TDX guests simultaneously this can run
		 * concurrently.  Take the per-package lock to
		 * serialize.
		 */
> +		mutex_lock(&tdx_mng_key_config_lock[pkg]);
> +		ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
> +				      &kvm_tdx->tdr_pa, true);
> +		mutex_unlock(&tdx_mng_key_config_lock[pkg]);
> +		if (ret)
> +			break;
> +	}
> +	cpus_read_unlock();
> +	free_cpumask_var(packages);
> +	if (ret) {
> +		i = 0;
> +		goto teardown;
> +	}
> +
> +	kvm_tdx->tdcs_pa = tdcs_pa;
> +	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
> +		err = tdh_mng_addcx(kvm_tdx->tdr_pa, tdcs_pa[i]);
> +		if (err == TDX_RND_NO_ENTROPY) {
> +			/* Here it's hard to allow userspace to retry. */
> +			ret = -EBUSY;
> +			goto teardown;
> +		}
> +		if (WARN_ON_ONCE(err)) {
> +			pr_tdx_error(TDH_MNG_ADDCX, err, NULL);
> +			ret = -EIO;
> +			goto teardown;
> +		}
> +	}
> +
> +	/*
> +	 * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires a dedicated
> +	 * ioctl() to define the configure CPUID values for the TD.
> +	 */

Then, how about renaming this function to __tdx_td_create()?

> +	return 0;
> +
> +	/*
> +	 * The sequence for freeing resources from a partially initialized TD
> +	 * varies based on where in the initialization flow failure occurred.
> +	 * Simply use the full teardown and destroy, which naturally play nice
> +	 * with partial initialization.
> +	 */
> +teardown:
> +	for (; i < tdx_info->nr_tdcs_pages; i++) {
> +		if (tdcs_pa[i]) {
> +			free_page((unsigned long)__va(tdcs_pa[i]));
> +			tdcs_pa[i] = 0;
> +		}
> +	}
> +	if (!kvm_tdx->tdcs_pa)
> +		kfree(tdcs_pa);

The code to "free TDCS pages in a loop and free the array" is done below
with duplicated code.  I am wondering whether we have a way to eliminate one.

But I have lost track here, so perhaps we can review again after we 
split the patch to smaller pieces.
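
E.g., a small helper taking a start index might cover both paths (just a
sketch; the name is made up, and kfree() of the array is left to the callers
since the teardown path only frees the not-yet-added pages from the failure
point onwards):

static void tdx_free_tdcs_pages(unsigned long *tdcs_pa, int from)
{
	int i;

	if (!tdcs_pa)
		return;

	/* Free the unused TDCS pages; entries may legitimately be 0. */
	for (i = from; i < tdx_info->nr_tdcs_pages; i++) {
		if (tdcs_pa[i]) {
			free_page((unsigned long)__va(tdcs_pa[i]));
			tdcs_pa[i] = 0;
		}
	}
}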

> +	tdx_mmu_release_hkid(kvm);
> +	tdx_vm_free(kvm);
> +	return ret;
> +
> +free_packages:
> +	cpus_read_unlock();
> +	free_cpumask_var(packages);
> +free_tdcs:
> +	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
> +		if (tdcs_pa[i])
> +			free_page((unsigned long)__va(tdcs_pa[i]));
> +	}
> +	kfree(tdcs_pa);
> +	kvm_tdx->tdcs_pa = NULL;
> +
> +free_tdr:
> +	if (tdr_pa)
> +		free_page((unsigned long)__va(tdr_pa));
> +	kvm_tdx->tdr_pa = 0;
> +free_hkid:
> +	if (is_hkid_assigned(kvm_tdx))
> +		tdx_hkid_free(kvm_tdx);
> +	return ret;
> +}
> +
>   int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
>   {
>   	struct kvm_tdx_cmd tdx_cmd;
> @@ -215,12 +664,13 @@ static int tdx_md_read(struct tdx_md_map *maps, int nr_maps)
>   
>   static int __init tdx_module_setup(void)
>   {
> -	u16 num_cpuid_config;
> +	u16 num_cpuid_config, tdcs_base_size;
>   	int ret;
>   	u32 i;
>   
>   	struct tdx_md_map mds[] = {
>   		TDX_MD_MAP(NUM_CPUID_CONFIG, &num_cpuid_config),
> +		TDX_MD_MAP(TDCS_BASE_SIZE, &tdcs_base_size),
>   	};
>   
>   	struct tdx_metadata_field_mapping fields[] = {
> @@ -273,6 +723,8 @@ static int __init tdx_module_setup(void)
>   		c->edx = ecx_edx >> 32;
>   	}
>   
> +	tdx_info->nr_tdcs_pages = tdcs_base_size / PAGE_SIZE;
> +

Round up the 'tdcs_base_size' to make sure you have enough room, or put 
a WARN() here if not page aligned?
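
E.g. (assuming the usual DIV_ROUND_UP() helper):

	/* Round up in case TDCS_BASE_SIZE is not a multiple of PAGE_SIZE. */
	tdx_info->nr_tdcs_pages = DIV_ROUND_UP(tdcs_base_size, PAGE_SIZE);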

>   	return 0;
>   
>   error_out:
> @@ -319,13 +771,27 @@ int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
>   	struct tdx_enabled enable = {
>   		.err = ATOMIC_INIT(0),
>   	};
> +	int max_pkgs;
>   	int r = 0;
> +	int i;

Nit: you can put the 3 into one line.

>   
> +	if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
> +		pr_warn("MOVDIR64B is reqiured for TDX\n");

It's better to make it more clear:

"Disable TDX: MOVDIR64B is not supported or disabled by the kernel."

Or, to match below:

"Cannot enable TDX w/o MOVDIR64B".

> +		return -EOPNOTSUPP;
> +	}
>   	if (!enable_ept) {
>   		pr_warn("Cannot enable TDX with EPT disabled\n");
>   		return -EINVAL;
>   	}
>   
> +	max_pkgs = topology_max_packages();
> +	tdx_mng_key_config_lock = kcalloc(max_pkgs, sizeof(*tdx_mng_key_config_lock),
> +				   GFP_KERNEL);
> +	if (!tdx_mng_key_config_lock)
> +		return -ENOMEM;
> +	for (i = 0; i < max_pkgs; i++)
> +		mutex_init(&tdx_mng_key_config_lock[i]);
> +

Using a per-socket lock looks a little bit overkill to me.  I don't know
whether we need to do this in the initial version.  Will leave it to others.

Please at least add a comment to explain this is for better performance 
when creating multiple TDX guests IIUC?

>   	if (!zalloc_cpumask_var(&enable.enabled, GFP_KERNEL)) {
>   		r = -ENOMEM;
>   		goto out;
> @@ -350,4 +816,5 @@ int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
>   void tdx_hardware_unsetup(void)
>   {
>   	kfree(tdx_info);
> +	kfree(tdx_mng_key_config_lock);

The kernel actually has a mutex_destroy().  It is empty when
CONFIG_DEBUG_LOCK_ALLOC is off, but I think it should be standard
procedure to also mutex_destroy()?
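
i.e. something along these lines (sketch only):

void tdx_hardware_unsetup(void)
{
	int i;

	for (i = 0; i < topology_max_packages(); i++)
		mutex_destroy(&tdx_mng_key_config_lock[i]);

	kfree(tdx_info);
	kfree(tdx_mng_key_config_lock);
}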

[...]
Kai Huang March 28, 2024, 1:49 a.m. UTC | #13
On 28/03/2024 11:53 am, Isaku Yamahata wrote:
> On Tue, Mar 26, 2024 at 02:43:54PM +1300,
> "Huang, Kai" <kai.huang@intel.com> wrote:
> 
>> ... continue the previous review ...
>>
>>> +
>>> +static void tdx_reclaim_control_page(unsigned long td_page_pa)
>>> +{
>>> +	WARN_ON_ONCE(!td_page_pa);
>>
>>  From the name 'td_page_pa' we cannot tell whether it is a control page, but
>> this function is only intended for control page AFAICT, so perhaps a more
>> specific name.
>>
>>> +
>>> +	/*
>>> +	 * TDCX are being reclaimed.  TDX module maps TDCX with HKID
>>
>> "are" -> "is".
>>
>> Are you sure it is TDCX, but not TDCS?
>>
>> AFAICT TDCX is the control structure for 'vcpu', but here you are handling
>> the control structure for the VM.
> 
> TDCS, TDVPR, and TDCX.  Will update the comment.

But TDCX, TDVPR are vcpu-scoped.  Do you want to mention them _here_?

Otherwise you will have to explain them.

[...]

>>> +
>>> +void tdx_mmu_release_hkid(struct kvm *kvm)
>>> +{
>>> +	bool packages_allocated, targets_allocated;
>>> +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
>>> +	cpumask_var_t packages, targets;
>>> +	u64 err;
>>> +	int i;
>>> +
>>> +	if (!is_hkid_assigned(kvm_tdx))
>>> +		return;
>>> +
>>> +	if (!is_td_created(kvm_tdx)) {
>>> +		tdx_hkid_free(kvm_tdx);
>>> +		return;
>>> +	}
>>
>> I lost tracking what does "td_created()" mean.
>>
>> I guess it means: KeyID has been allocated to the TDX guest, but not yet
>> programmed/configured.
>>
>> Perhaps add a comment to remind the reviewer?
> 
> As Chao suggested, will introduce state machine for vm and vcpu.
> 
> https://lore.kernel.org/kvm/ZfvI8t7SlfIsxbmT@chao-email/

Could you elaborate what will the state machine look like?

I need to understand it.


[...]


> 
> How about this?
> 
> /*
>   * We need three SEAMCALLs, TDH.MNG.VPFLUSHDONE(), TDH.PHYMEM.CACHE.WB(), and
>   * TDH.MNG.KEY.FREEID() to free the HKID.
>   * Other threads can remove pages from TD.  When the HKID is assigned, we need
>   * to use TDH.MEM.SEPT.REMOVE() or TDH.MEM.PAGE.REMOVE().
>   * TDH.PHYMEM.PAGE.RECLAIM() is needed when the HKID is free.  Get lock to not
>   * present transient state of HKID.
>   */

Could you elaborate why it is still possible to have other thread 
removing pages from TD?

I am probably missing something, but the thing I don't understand is why 
this function is triggered by MMU release?  All the things done in this 
function don't seem to be related to MMU at all.

IIUC, by reaching here, you must already have done VPFLUSHDONE, which 
should be called when you free vcpu?  Freeing vcpus is done in 
kvm_arch_destroy_vm(), which is _after_ mmu_notifier->release(), in 
which this tdx_mmu_release_keyid() is called?

But here we are depending vcpus to be freed before tdx_mmu_release_hkid()?

>>> +	/*
>>> +	 * In the case of error in tdx_do_tdh_phymem_cache_wb(), the following
>>> +	 * tdh_mng_key_freeid() will fail.
>>> +	 */
>>> +	err = tdh_mng_key_freeid(kvm_tdx->tdr_pa);
>>> +	if (WARN_ON_ONCE(err)) {
>>
>> I see KVM_BUG_ON() is normally used for SEAMCALL error.  Why this uses
>> WARN_ON_ONCE() here?
> 
> Because vm_free() hook is (one of) the final steps to free struct kvm.  No one
> else touches this kvm.  Because it doesn't harm to use KVM_BUG_ON() here,
> I'll change it for consistency.

I am fine with either.  You can use KVM_BUG_ON() for SEAMCALLs at 
runtime, but use WARN_ON_ONCE() for those involved during VM creation.

[...]

>>
>>> +	err = tdh_phymem_page_wbinvd(set_hkid_to_hpa(kvm_tdx->tdr_pa,
>>> +						     tdx_global_keyid));
>>> +	if (WARN_ON_ONCE(err)) {
>>
>> Again, KVM_BUG_ON()?
>>
>> Should't matter, though.
> 
> Ok, let's use KVM_BUG_ON() consistently.

Ditto.
Isaku Yamahata March 28, 2024, 5:34 a.m. UTC | #14
On Thu, Mar 28, 2024 at 02:49:56PM +1300,
"Huang, Kai" <kai.huang@intel.com> wrote:

> 
> 
> On 28/03/2024 11:53 am, Isaku Yamahata wrote:
> > On Tue, Mar 26, 2024 at 02:43:54PM +1300,
> > "Huang, Kai" <kai.huang@intel.com> wrote:
> > 
> > > ... continue the previous review ...
> > > 
> > > > +
> > > > +static void tdx_reclaim_control_page(unsigned long td_page_pa)
> > > > +{
> > > > +	WARN_ON_ONCE(!td_page_pa);
> > > 
> > >  From the name 'td_page_pa' we cannot tell whether it is a control page, but
> > > this function is only intended for control page AFAICT, so perhaps a more
> > > specific name.
> > > 
> > > > +
> > > > +	/*
> > > > +	 * TDCX are being reclaimed.  TDX module maps TDCX with HKID
> > > 
> > > "are" -> "is".
> > > 
> > > Are you sure it is TDCX, but not TDCS?
> > > 
> > > AFAICT TDCX is the control structure for 'vcpu', but here you are handling
> > > the control structure for the VM.
> > 
> > TDCS, TDVPR, and TDCX.  Will update the comment.
> 
> But TDCX, TDVPR are vcpu-scoped.  Do you want to mention them _here_?

So the patch that frees TDVPR and TDCX will change this comment.


> Otherwise you will have to explain them.
> 
> [...]
> 
> > > > +
> > > > +void tdx_mmu_release_hkid(struct kvm *kvm)
> > > > +{
> > > > +	bool packages_allocated, targets_allocated;
> > > > +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> > > > +	cpumask_var_t packages, targets;
> > > > +	u64 err;
> > > > +	int i;
> > > > +
> > > > +	if (!is_hkid_assigned(kvm_tdx))
> > > > +		return;
> > > > +
> > > > +	if (!is_td_created(kvm_tdx)) {
> > > > +		tdx_hkid_free(kvm_tdx);
> > > > +		return;
> > > > +	}
> > > 
> > > I lost tracking what does "td_created()" mean.
> > > 
> > > I guess it means: KeyID has been allocated to the TDX guest, but not yet
> > > programmed/configured.
> > > 
> > > Perhaps add a comment to remind the reviewer?
> > 
> > As Chao suggested, will introduce state machine for vm and vcpu.
> > 
> > https://lore.kernel.org/kvm/ZfvI8t7SlfIsxbmT@chao-email/
> 
> Could you elaborate what will the state machine look like?
> 
> I need to understand it.

Not yet. Chao only propose to introduce state machine. Right now it's just an
idea.


> > How about this?
> > 
> > /*
> >   * We need three SEAMCALLs, TDH.MNG.VPFLUSHDONE(), TDH.PHYMEM.CACHE.WB(), and
> >   * TDH.MNG.KEY.FREEID() to free the HKID.
> >   * Other threads can remove pages from TD.  When the HKID is assigned, we need
> >   * to use TDH.MEM.SEPT.REMOVE() or TDH.MEM.PAGE.REMOVE().
> >   * TDH.PHYMEM.PAGE.RECLAIM() is needed when the HKID is free.  Get lock to not
> >   * present transient state of HKID.
> >   */
> 
> Could you elaborate why it is still possible to have other thread removing
> pages from TD?
> 
> I am probably missing something, but the thing I don't understand is why
> this function is triggered by MMU release?  All the things done in this
> function don't seem to be related to MMU at all.

The KVM releases EPT pages on MMU notifier release.  kvm_mmu_zap_all() does. If
we follow that way, kvm_mmu_zap_all() zaps all the Secure-EPTs by
TDH.MEM.SEPT.REMOVE() or TDH.MEM.PAGE.REMOVE().  Because
TDH.MEM.{SEPT, PAGE}.REMOVE() is slow, we can free HKID before kvm_mmu_zap_all()
to use TDH.PHYMEM.PAGE.RECLAIM().


> IIUC, by reaching here, you must already have done VPFLUSHDONE, which should
> be called when you free vcpu?

Not necessarily.


> Freeing vcpus is done in
> kvm_arch_destroy_vm(), which is _after_ mmu_notifier->release(), in which
> this tdx_mmu_release_keyid() is called?

guest memfd complicates things.  The race is between guest memfd release and mmu
notifier release.  kvm_arch_destroy_vm() is called after closing all kvm fds
including guest memfd.

Here is the example.  Let's say, we have fds for vhost, guest_memfd, kvm vcpu,
and kvm vm.  The process is exiting.  Please notice vhost increments the
reference of the mmu to access guest (shared) memory.

exit_mmap():
  Usually mmu notifier release is fired. But not yet because of vhost.

exit_files()
  close vhost fd. vhost starts timer to issue mmput().

  close guest_memfd.  kvm_gmem_release() calls kvm_mmu_unmap_gfn_range().
    kvm_mmu_unmap_gfn_range() eventually this calls TDH.MEM.SEPT.REMOVE()
    and TDH.MEM.PAGE.REMOVE().  This takes time because it processes whole
    guest memory. Call kvm_put_kvm() at last.

  During unmapping on behalf of guest memfd, the timer of vhost fires to call
  mmput().  It triggers mmu notifier release.

  Close kvm vcpus/vm. they call kvm_put_kvm().  The last one calls
  kvm_destroy_vm().  

It's ideal to free HKID first for efficiency. But KVM doesn't have control on
the order of fds.


> But here we are depending vcpus to be freed before tdx_mmu_release_hkid()?

Not necessarily.
Kai Huang March 28, 2024, 11:14 a.m. UTC | #15
On Wed, 2024-03-27 at 22:34 -0700, Isaku Yamahata wrote:
> On Thu, Mar 28, 2024 at 02:49:56PM +1300,
> "Huang, Kai" <kai.huang@intel.com> wrote:
> 
> > 
> > 
> > On 28/03/2024 11:53 am, Isaku Yamahata wrote:
> > > On Tue, Mar 26, 2024 at 02:43:54PM +1300,
> > > "Huang, Kai" <kai.huang@intel.com> wrote:
> > > 
> > > > ... continue the previous review ...
> > > > 
> > > > > +
> > > > > +static void tdx_reclaim_control_page(unsigned long td_page_pa)
> > > > > +{
> > > > > +	WARN_ON_ONCE(!td_page_pa);
> > > > 
> > > >  From the name 'td_page_pa' we cannot tell whether it is a control page, but
> > > > this function is only intended for control page AFAICT, so perhaps a more
> > > > specific name.
> > > > 
> > > > > +
> > > > > +	/*
> > > > > +	 * TDCX are being reclaimed.  TDX module maps TDCX with HKID
> > > > 
> > > > "are" -> "is".
> > > > 
> > > > Are you sure it is TDCX, but not TDCS?
> > > > 
> > > > AFAICT TDCX is the control structure for 'vcpu', but here you are handling
> > > > the control structure for the VM.
> > > 
> > > TDCS, TDVPR, and TDCX.  Will update the comment.
> > 
> > But TDCX, TDVPR are vcpu-scoped.  Do you want to mention them _here_?
> 
> So I'll make the patch that frees TDVPR, TDCX will change this comment.
> 

Hmm.. Looking again, I am not sure why do we even need
tdx_reclaim_control_page()?

It basically does tdx_reclaim_page() + free_page():

+static void tdx_reclaim_control_page(unsigned long td_page_pa)
+{
+	WARN_ON_ONCE(!td_page_pa);
+
+	/*
+	 * TDCX are being reclaimed.  TDX module maps TDCX with HKID
+	 * assigned to the TD.  Here the cache associated to the TD
+	 * was already flushed by TDH.PHYMEM.CACHE.WB before here, So
+	 * cache doesn't need to be flushed again.
+	 */
+	if (tdx_reclaim_page(td_page_pa))
+		/*
+		 * Leak the page on failure:
+		 * tdx_reclaim_page() returns an error if and only if there's
an
+		 * unexpected, fatal error, e.g. a SEAMCALL with bad params,
+		 * incorrect concurrency in KVM, a TDX Module bug, etc.
+		 * Retrying at a later point is highly unlikely to be
+		 * successful.
+		 * No log here as tdx_reclaim_page() already did.
+		 */
+		return;
+	free_page((unsigned long)__va(td_page_pa));
+}

And why do you need a special function just for control page(s)?

> 
> > Otherwise you will have to explain them.
> > 
> > [...]
> > 
> > > > > +
> > > > > +void tdx_mmu_release_hkid(struct kvm *kvm)
> > > > > +{
> > > > > +	bool packages_allocated, targets_allocated;
> > > > > +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> > > > > +	cpumask_var_t packages, targets;
> > > > > +	u64 err;
> > > > > +	int i;
> > > > > +
> > > > > +	if (!is_hkid_assigned(kvm_tdx))
> > > > > +		return;
> > > > > +
> > > > > +	if (!is_td_created(kvm_tdx)) {
> > > > > +		tdx_hkid_free(kvm_tdx);
> > > > > +		return;
> > > > > +	}
> > > > 
> > > > I lost tracking what does "td_created()" mean.
> > > > 
> > > > I guess it means: KeyID has been allocated to the TDX guest, but not yet
> > > > programmed/configured.
> > > > 
> > > > Perhaps add a comment to remind the reviewer?
> > > 
> > > As Chao suggested, will introduce state machine for vm and vcpu.
> > > 
> > > https://lore.kernel.org/kvm/ZfvI8t7SlfIsxbmT@chao-email/
> > 
> > Could you elaborate what will the state machine look like?
> > 
> > I need to understand it.
> 
> Not yet. Chao only propose to introduce state machine. Right now it's just an
> idea.

Then why state machine is better?  I guess we need some concrete example to tell
which is better?

> 
> 
> > > How about this?
> > > 
> > > /*
> > >   * We need three SEAMCALLs, TDH.MNG.VPFLUSHDONE(), TDH.PHYMEM.CACHE.WB(), and
> > >   * TDH.MNG.KEY.FREEID() to free the HKID.
> > >   * Other threads can remove pages from TD.  When the HKID is assigned, we need
> > >   * to use TDH.MEM.SEPT.REMOVE() or TDH.MEM.PAGE.REMOVE().
> > >   * TDH.PHYMEM.PAGE.RECLAIM() is needed when the HKID is free.  Get lock to not
> > >   * present transient state of HKID.
> > >   */
> > 
> > Could you elaborate why it is still possible to have other thread removing
> > pages from TD?
> > 
> > I am probably missing something, but the thing I don't understand is why
> > this function is triggered by MMU release?  All the things done in this
> > function don't seem to be related to MMU at all.
> 
> The KVM releases EPT pages on MMU notifier release.  kvm_mmu_zap_all() does. If
> we follow that way, kvm_mmu_zap_all() zaps all the Secure-EPTs by
> TDH.MEM.SEPT.REMOVE() or TDH.MEM.PAGE.REMOVE().  Because
> TDH.MEM.{SEPT, PAGE}.REMOVE() is slow, we can free HKID before kvm_mmu_zap_all()
> to use TDH.PHYMEM.PAGE.RECLAIM().

Can you elaborate why TDH.MEM.{SEPT,PAGE}.REMOVE is slower than
TDH.PHYMEM.PAGE.RECLAIM()?

And does the difference matter in practice, i.e. did you see using the former
having noticeable performance downgrade?

> 
> 
> > IIUC, by reaching here, you must already have done VPFLUSHDONE, which should
> > be called when you free vcpu?
> 
> Not necessarily.

OK.  I got confused between TDH.VP.FLUSH and TDH.MNG.VPFLUSHDONE.

> 
> 
> > Freeing vcpus is done in
> > kvm_arch_destroy_vm(), which is _after_ mmu_notifier->release(), in which
> > this tdx_mmu_release_keyid() is called?
> 
> guest memfd complicates things.  The race is between guest memfd release and mmu
> notifier release.  kvm_arch_destroy_vm() is called after closing all kvm fds
> including guest memfd.
> 
> Here is the example.  Let's say, we have fds for vhost, guest_memfd, kvm vcpu,
> and kvm vm.  The process is exiting.  Please notice vhost increments the
> reference of the mmu to access guest (shared) memory.
> 
> exit_mmap():
>   Usually mmu notifier release is fired. But not yet because of vhost.
> 
> exit_files()
>   close vhost fd. vhost starts timer to issue mmput().

Why does it need to start a timer to issue mmput(), but not call mmput()
directly?

> 
>   close guest_memfd.  kvm_gmem_release() calls kvm_mmu_unmap_gfn_range().
>     kvm_mmu_unmap_gfn_range() eventually this calls TDH.MEM.SEPT.REMOVE()
>     and TDH.MEM.PAGE.REMOVE().  This takes time because it processes whole
>     guest memory. Call kvm_put_kvm() at last.
> 
>   During unmapping on behalf of guest memfd, the timer of vhost fires to call
>   mmput().  It triggers mmu notifier release.
> 
>   Close kvm vcpus/vm. they call kvm_put_kvm().  The last one calls
>   kvm_destroy_vm().  
> 
> It's ideal to free HKID first for efficiency. But KVM doesn't have control on
> the order of fds.

Firstly, what kinda performance efficiency gain are we talking about?

We cannot really tell whether it can be justified to use two different methods
to tear down SEPT page because of this.

Even if it's worth to do, it is an optimization, which can/should be done later
after you have put all building blocks together.

That being said, you are putting too much logic in this patch, i.e., it just
doesn't make sense to release the TDX keyID in the MMU code path in _this_ patch.

> 
> 
> > But here we are depending vcpus to be freed before tdx_mmu_release_hkid()?
> 
> Not necessarily.

I am wondering when TDH.VP.FLUSH is done?  Supposedly it should be called when
we free vcpus?  But again this means you need to call TDH.MNG.VPFLUSHDONE
_after_ freeing vcpus.  And this looks conflicting if you make
tdx_mmu_release_keyid() be called from the MMU notifier.
Chao Gao March 28, 2024, 2:12 p.m. UTC | #16
On Thu, Mar 28, 2024 at 11:14:42AM +0000, Huang, Kai wrote:
>> > 
>> > [...]
>> > 
>> > > > > +
>> > > > > +void tdx_mmu_release_hkid(struct kvm *kvm)
>> > > > > +{
>> > > > > +	bool packages_allocated, targets_allocated;
>> > > > > +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
>> > > > > +	cpumask_var_t packages, targets;
>> > > > > +	u64 err;
>> > > > > +	int i;
>> > > > > +
>> > > > > +	if (!is_hkid_assigned(kvm_tdx))
>> > > > > +		return;
>> > > > > +
>> > > > > +	if (!is_td_created(kvm_tdx)) {
>> > > > > +		tdx_hkid_free(kvm_tdx);
>> > > > > +		return;
>> > > > > +	}
>> > > > 
>> > > > I lost tracking what does "td_created()" mean.
>> > > > 
>> > > > I guess it means: KeyID has been allocated to the TDX guest, but not yet
>> > > > programmed/configured.
>> > > > 
>> > > > Perhaps add a comment to remind the reviewer?
>> > > 
>> > > As Chao suggested, will introduce state machine for vm and vcpu.
>> > > 
>> > > https://lore.kernel.org/kvm/ZfvI8t7SlfIsxbmT@chao-email/
>> > 
>> > Could you elaborate what will the state machine look like?
>> > 
>> > I need to understand it.
>> 
>> Not yet. Chao only propose to introduce state machine. Right now it's just an
>> idea.
>
>Then why state machine is better?  I guess we need some concrete example to tell
>which is better?

Something like the TD Life Cycle State Machine (Section 9.1 of TDX module spec[1])

[1]: https://cdrdv2.intel.com/v1/dl/getContent/733568

I don't have the code. But using a few boolean variables to track the state of
TD and VCPU looks bad and hard to maintain and extend. At least, the state machine
is well-documented.
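
Purely to illustrate the idea (state names invented here, not taken from the
spec or the series), the VM side could carry something like:

/* Hypothetical TD lifecycle, replacing is_hkid_assigned()/is_td_created(). */
enum kvm_tdx_state {
	TD_STATE_UNINITIALIZED,		/* no HKID, no TDR */
	TD_STATE_HKID_ASSIGNED,		/* HKID allocated, TDR not yet created */
	TD_STATE_KEYS_CONFIGURED,	/* TDH.MNG.CREATE/TDH.MNG.KEY.CONFIG done */
	TD_STATE_INITIALIZED,		/* TDH.MNG.INIT done, vcpus can be created */
	TD_STATE_TEARDOWN,		/* HKID reclaimed, pages being freed */
};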
Isaku Yamahata March 28, 2024, 8:39 p.m. UTC | #17
On Thu, Mar 28, 2024 at 11:14:42AM +0000,
"Huang, Kai" <kai.huang@intel.com> wrote:

> On Wed, 2024-03-27 at 22:34 -0700, Isaku Yamahata wrote:
> > On Thu, Mar 28, 2024 at 02:49:56PM +1300,
> > "Huang, Kai" <kai.huang@intel.com> wrote:
> > 
> > > 
> > > 
> > > On 28/03/2024 11:53 am, Isaku Yamahata wrote:
> > > > On Tue, Mar 26, 2024 at 02:43:54PM +1300,
> > > > "Huang, Kai" <kai.huang@intel.com> wrote:
> > > > 
> > > > > ... continue the previous review ...
> > > > > 
> > > > > > +
> > > > > > +static void tdx_reclaim_control_page(unsigned long td_page_pa)
> > > > > > +{
> > > > > > +	WARN_ON_ONCE(!td_page_pa);
> > > > > 
> > > > >  From the name 'td_page_pa' we cannot tell whether it is a control page, but
> > > > > this function is only intended for control page AFAICT, so perhaps a more
> > > > > specific name.
> > > > > 
> > > > > > +
> > > > > > +	/*
> > > > > > +	 * TDCX are being reclaimed.  TDX module maps TDCX with HKID
> > > > > 
> > > > > "are" -> "is".
> > > > > 
> > > > > Are you sure it is TDCX, but not TDCS?
> > > > > 
> > > > > AFAICT TDCX is the control structure for 'vcpu', but here you are handling
> > > > > the control structure for the VM.
> > > > 
> > > > TDCS, TDVPR, and TDCX.  Will update the comment.
> > > 
> > > But TDCX, TDVPR are vcpu-scoped.  Do you want to mention them _here_?
> > 
> > So I'll make the patch that frees TDVPR, TDCX will change this comment.
> > 
> 
> Hmm.. Looking again, I am not sure why do we even need
> tdx_reclaim_control_page()?
> 
> It basically does tdx_reclaim_page() + free_page():
> 
> +static void tdx_reclaim_control_page(unsigned long td_page_pa)
> +{
> +	WARN_ON_ONCE(!td_page_pa);
> +
> +	/*
> +	 * TDCX are being reclaimed.  TDX module maps TDCX with HKID
> +	 * assigned to the TD.  Here the cache associated to the TD
> +	 * was already flushed by TDH.PHYMEM.CACHE.WB before here, So
> +	 * cache doesn't need to be flushed again.
> +	 */
> +	if (tdx_reclaim_page(td_page_pa))
> +		/*
> +		 * Leak the page on failure:
> +		 * tdx_reclaim_page() returns an error if and only if there's
> an
> +		 * unexpected, fatal error, e.g. a SEAMCALL with bad params,
> +		 * incorrect concurrency in KVM, a TDX Module bug, etc.
> +		 * Retrying at a later point is highly unlikely to be
> +		 * successful.
> +		 * No log here as tdx_reclaim_page() already did.
> +		 */
> +		return;
> +	free_page((unsigned long)__va(td_page_pa));
> +}
> 
> And why do you need a special function just for control page(s)?

We can revise the code to have a common function for reclaiming pages.


> > > Otherwise you will have to explain them.
> > > 
> > > [...]
> > > 
> > > > > > +
> > > > > > +void tdx_mmu_release_hkid(struct kvm *kvm)
> > > > > > +{
> > > > > > +	bool packages_allocated, targets_allocated;
> > > > > > +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> > > > > > +	cpumask_var_t packages, targets;
> > > > > > +	u64 err;
> > > > > > +	int i;
> > > > > > +
> > > > > > +	if (!is_hkid_assigned(kvm_tdx))
> > > > > > +		return;
> > > > > > +
> > > > > > +	if (!is_td_created(kvm_tdx)) {
> > > > > > +		tdx_hkid_free(kvm_tdx);
> > > > > > +		return;
> > > > > > +	}
> > > > > 
> > > > > I lost tracking what does "td_created()" mean.
> > > > > 
> > > > > I guess it means: KeyID has been allocated to the TDX guest, but not yet
> > > > > programmed/configured.
> > > > > 
> > > > > Perhaps add a comment to remind the reviewer?
> > > > 
> > > > As Chao suggested, will introduce state machine for vm and vcpu.
> > > > 
> > > > https://lore.kernel.org/kvm/ZfvI8t7SlfIsxbmT@chao-email/
> > > 
> > > Could you elaborate what will the state machine look like?
> > > 
> > > I need to understand it.
> > 
> > Not yet. Chao only propose to introduce state machine. Right now it's just an
> > idea.
> 
> Then why state machine is better?  I guess we need some concrete example to tell
> which is better?

At this point we don't know which is better.  I personally think it's worthwhile
to give it a try.  After experimenting, we may discard or adapt the idea.

Because the TDX spec already defines its state machine, we could follow it.


> > > > How about this?
> > > > 
> > > > /*
> > > >   * We need three SEAMCALLs, TDH.MNG.VPFLUSHDONE(), TDH.PHYMEM.CACHE.WB(), and
> > > >   * TDH.MNG.KEY.FREEID() to free the HKID.
> > > >   * Other threads can remove pages from TD.  When the HKID is assigned, we need
> > > >   * to use TDH.MEM.SEPT.REMOVE() or TDH.MEM.PAGE.REMOVE().
> > > >   * TDH.PHYMEM.PAGE.RECLAIM() is needed when the HKID is free.  Get lock to not
> > > >   * present transient state of HKID.
> > > >   */
> > > 
> > > Could you elaborate why it is still possible to have other thread removing
> > > pages from TD?
> > > 
> > > I am probably missing something, but the thing I don't understand is why
> > > this function is triggered by MMU release?  All the things done in this
> > > function don't seem to be related to MMU at all.
> > 
> > The KVM releases EPT pages on MMU notifier release.  kvm_mmu_zap_all() does. If
> > we follow that way, kvm_mmu_zap_all() zaps all the Secure-EPTs by
> > TDH.MEM.SEPT.REMOVE() or TDH.MEM.PAGE.REMOVE().  Because
> > TDH.MEM.{SEPT, PAGE}.REMOVE() is slow, we can free HKID before kvm_mmu_zap_all()
> > to use TDH.PHYMEM.PAGE.RECLAIM().
> 
> Can you elaborate why TDH.MEM.{SEPT,PAGE}.REMOVE is slower than
> TDH.PHYMEM.PAGE.RECLAIM()?
> 
> And does the difference matter in practice, i.e. did you see using the former
> having noticeable performance downgrade?

Yes. With the HKID still alive, we have to assume that vcpus can still run. That
means TLB shootdowns. The difference is 2 extra SEAMCALLs + IPI synchronization
for each guest private page.  If the guest has hundreds of GB, the difference
can be tens of minutes (see the rough sketch below).

With the HKID alive, we need to assume the vcpus are alive.  Per private page:
- TDH.MEM.PAGE.REMOVE()
- TDH.PHYMEM.PAGE_WBINVD()
- TLB shoot down
  - TDH.MEM.TRACK()
  - IPI to other vcpus
  - wait for the other vcpus to exit

After freeing the HKID:
- TDH.PHYMEM.PAGE.RECLAIM()
  The TLBs and memory cache have already been flushed.
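
A rough back-of-envelope sketch of the cost difference (the ~10us per
SEAMCALL/IPI round below is an assumption for illustration, not a measured
number, and the tdh_mem_page_remove()/tdh_mem_track() wrappers are not part of
this patch):

/*
 * With the HKID still assigned, per 4KiB private page:
 *	tdh_mem_page_remove();		// SEAMCALL
 *	tdh_phymem_page_wbinvd();	// SEAMCALL
 *	tdh_mem_track() + IPI + wait;	// TLB shootdown
 *
 * After TDH.PHYMEM.CACHE.WB and TDH.MNG.KEY.FREEID, per 4KiB private page:
 *	tdh_phymem_page_reclaim();	// single SEAMCALL, no shootdown
 *
 * e.g. a 512GiB guest has 2^27 private pages; at ~10us per extra
 * SEAMCALL/IPI round and 2 extra rounds per page, that is roughly
 * 2 * 10us * 2^27 ~= 2700s, i.e. ~45 minutes of extra teardown time.
 */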


> > > Freeing vcpus is done in
> > > kvm_arch_destroy_vm(), which is _after_ mmu_notifier->release(), in which
> > > this tdx_mmu_release_keyid() is called?
> > 
> > guest memfd complicates things.  The race is between guest memfd release and mmu
> > notifier release.  kvm_arch_destroy_vm() is called after closing all kvm fds
> > including guest memfd.
> > 
> > Here is the example.  Let's say, we have fds for vhost, guest_memfd, kvm vcpu,
> > and kvm vm.  The process is exiting.  Please notice vhost increments the
> > reference of the mmu to access guest (shared) memory.
> > 
> > exit_mmap():
> >   Usually mmu notifier release is fired. But not yet because of vhost.
> > 
> > exit_files()
> >   close vhost fd. vhost starts timer to issue mmput().
> 
> Why does it need to start a timer to issue mmput(), but not call mmput()
> directly?

That's how vhost implements it.  It's out of KVM's control.  Other components,
or user space running in another thread, can get references to the mmu or FDs.
They can keep/free them as they like.


> >   close guest_memfd.  kvm_gmem_release() calls kvm_mmu_unmap_gfn_range().
> >     kvm_mmu_unmap_gfn_range() eventually this calls TDH.MEM.SEPT.REMOVE()
> >     and TDH.MEM.PAGE.REMOVE().  This takes time because it processes whole
> >     guest memory. Call kvm_put_kvm() at last.
> > 
> >   During unmapping on behalf of guest memfd, the timer of vhost fires to call
> >   mmput().  It triggers mmu notifier release.
> > 
> >   Close kvm vcpus/vm. they call kvm_put_kvm().  The last one calls
> >   kvm_destroy_vm().  
> > 
> > It's ideal to free HKID first for efficiency. But KVM doesn't have control on
> > the order of fds.
> 
> Firstly, what kinda performance efficiency gain are we talking about?

2 extra SEAMCALLs + an IPI sync for each guest private page.  If the guest
memory is hundreds of GB, the difference can be tens of minutes.


> We cannot really tell whether it can be justified to use two different methods
> to tear down SEPT page because of this.
> 
> Even if it's worth to do, it is an optimization, which can/should be done later
> after you have put all building blocks together.
> 
> That being said, you are putting too many logic in this patch, i.e., it just
> doesn't make sense to release TDX keyID in the MMU code path in _this_ patch.

I agree that this patch is too huge, and that we should break it into smaller
patches.


> > > But here we are depending vcpus to be freed before tdx_mmu_release_hkid()?
> > 
> > Not necessarily.
> 
> I am wondering when is TDH.VP.FLUSH done?  Supposedly it should be called when
> we free vcpus?  But again this means you need to call TDH.MNG.VPFLUSHDONE
> _after_ freeing vcpus.  And this  looks conflicting if you make
> tdx_mmu_release_keyid() being called from MMU notifier.

tdx_mmu_release_keyid() calls it explicitly for all vcpus.
Binbin Wu March 29, 2024, 6:22 a.m. UTC | #18
On 3/21/2024 10:17 PM, Isaku Yamahata wrote:
> On Wed, Mar 20, 2024 at 01:12:01PM +0800,
> Chao Gao <chao.gao@intel.com> wrote:
>
>>> config KVM_SW_PROTECTED_VM
>>> 	bool "Enable support for KVM software-protected VMs"
>>> -	depends on EXPERT

This change is not needed, right?
Since you intended to use KVM_GENERIC_PRIVATE_MEM, not KVM_SW_PROTECTED_VM.

>>> 	depends on KVM && X86_64
>>> 	select KVM_GENERIC_PRIVATE_MEM
>>> 	help
>>> @@ -89,6 +88,8 @@ config KVM_SW_PROTECTED_VM
>>> config KVM_INTEL
>>> 	tristate "KVM for Intel (and compatible) processors support"
>>> 	depends on KVM && IA32_FEAT_CTL
>>> +	select KVM_SW_PROTECTED_VM if INTEL_TDX_HOST
>> why does INTEL_TDX_HOST select KVM_SW_PROTECTED_VM?
> I wanted KVM_GENERIC_PRIVATE_MEM.  Ah, we should do
>
>          select KKVM_GENERIC_PRIVATE_MEM if INTEL_TDX_HOST
>
>
>>> +	select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST
>>> 	help
>>> 	.vcpu_precreate = vmx_vcpu_precreate,
>>> 	.vcpu_create = vmx_vcpu_create,
>>
[...]
Binbin Wu March 29, 2024, 7:25 a.m. UTC | #19
On 3/29/2024 4:39 AM, Isaku Yamahata wrote:

[...]
>>>>> How about this?
>>>>>
>>>>> /*
>>>>>    * We need three SEAMCALLs, TDH.MNG.VPFLUSHDONE(), TDH.PHYMEM.CACHE.WB(), and
>>>>>    * TDH.MNG.KEY.FREEID() to free the HKID.
>>>>>    * Other threads can remove pages from TD.  When the HKID is assigned, we need
>>>>>    * to use TDH.MEM.SEPT.REMOVE() or TDH.MEM.PAGE.REMOVE().
>>>>>    * TDH.PHYMEM.PAGE.RECLAIM() is needed when the HKID is free.  Get lock to not
>>>>>    * present transient state of HKID.
>>>>>    */
>>>> Could you elaborate why it is still possible to have other thread removing
>>>> pages from TD?
>>>>
>>>> I am probably missing something, but the thing I don't understand is why
>>>> this function is triggered by MMU release?  All the things done in this
>>>> function don't seem to be related to MMU at all.
>>> The KVM releases EPT pages on MMU notifier release.  kvm_mmu_zap_all() does. If
>>> we follow that way, kvm_mmu_zap_all() zaps all the Secure-EPTs by
>>> TDH.MEM.SEPT.REMOVE() or TDH.MEM.PAGE.REMOVE().  Because
>>> TDH.MEM.{SEPT, PAGE}.REMOVE() is slow, we can free HKID before kvm_mmu_zap_all()
>>> to use TDH.PHYMEM.PAGE.RECLAIM().
>> Can you elaborate why TDH.MEM.{SEPT,PAGE}.REMOVE is slower than
>> TDH.PHYMEM.PAGE.RECLAIM()?
>>
>> And does the difference matter in practice, i.e. did you see using the former
>> having noticeable performance downgrade?
> Yes. With HKID alive, we have to assume that vcpu can run still. It means TLB
> shootdown. The difference is 2 extra SEAMCALL + IPI synchronization for each
> guest private page.  If the guest has hundreds of GB, the difference can be
> tens of minutes.
>
> With HKID alive, we need to assume vcpu is alive.
> - TDH.MEM.PAGE.REMOVE()
> - TDH.PHYMEM.PAGE_WBINVD()
> - TLB shoot down
>    - TDH.MEM.TRACK()
>    - IPI to other vcpus
>    - wait for other vcpu to exit

Do we have a way to batch the TLB shootdowns?
IIUC, in the current implementation, a TLB shootdown needs to be done for
each page removal, right?


>
> After freeing HKID
> - TDH.PHYMEM.PAGE.RECLAIM()
>    We already flushed TLBs and memory cache.
>
>
>>>> Freeing vcpus is done in
>>>> kvm_arch_destroy_vm(), which is _after_ mmu_notifier->release(), in which
>>>> this tdx_mmu_release_keyid() is called?
>>> guest memfd complicates things.  The race is between guest memfd release and mmu
>>> notifier release.  kvm_arch_destroy_vm() is called after closing all kvm fds
>>> including guest memfd.
>>>
>>> Here is the example.  Let's say, we have fds for vhost, guest_memfd, kvm vcpu,
>>> and kvm vm.  The process is exiting.  Please notice vhost increments the
>>> reference of the mmu to access guest (shared) memory.
>>>
>>> exit_mmap():
>>>    Usually mmu notifier release is fired. But not yet because of vhost.
>>>
>>> exit_files()
>>>    close vhost fd. vhost starts timer to issue mmput().
>> Why does it need to start a timer to issue mmput(), but not call mmput()
>> directly?
> That's how vhost implements it.  It's out of KVM control.  Other component or
> user space as other thread can get reference to mmu or FDs.  They can keep/free
> them as they like.
>
>
>>>    close guest_memfd.  kvm_gmem_release() calls kvm_mmu_unmap_gfn_range().
>>>      kvm_mmu_unmap_gfn_range() eventually this calls TDH.MEM.SEPT.REMOVE()
>>>      and TDH.MEM.PAGE.REMOVE().  This takes time because it processes whole
>>>      guest memory. Call kvm_put_kvm() at last.
>>>
>>>    During unmapping on behalf of guest memfd, the timer of vhost fires to call
>>>    mmput().  It triggers mmu notifier release.
>>>
>>>    Close kvm vcpus/vm. they call kvm_put_kvm().  The last one calls
>>>    kvm_destroy_vm().
>>>
>>> It's ideal to free HKID first for efficiency. But KVM doesn't have control on
>>> the order of fds.
>> Firstly, what kinda performance efficiency gain are we talking about?
> 2 extra SEAMCALL + IPI sync for each guest private page.  If the guest memory
> is hundreds of GB, the difference can be tens of minutes.
>
>
>> We cannot really tell whether it can be justified to use two different methods
>> to tear down SEPT page because of this.
>>
>> Even if it's worth to do, it is an optimization, which can/should be done later
>> after you have put all building blocks together.
>>
>> That being said, you are putting too many logic in this patch, i.e., it just
>> doesn't make sense to release TDX keyID in the MMU code path in _this_ patch.
> I agree that this patch is too huge, and that we should break it into smaller
> patches.
>
>
>>>> But here we are depending vcpus to be freed before tdx_mmu_release_hkid()?
>>> Not necessarily.
>> I am wondering when is TDH.VP.FLUSH done?  Supposedly it should be called when
>> we free vcpus?  But again this means you need to call TDH.MNG.VPFLUSHDONE
>> _after_ freeing vcpus.  And this  looks conflicting if you make
>> tdx_mmu_release_keyid() being called from MMU notifier.
> tdx_mmu_release_keyid() call it explicitly for all vcpus.
Kai Huang April 1, 2024, 10:41 a.m. UTC | #20
[...]

> > 
> > And why do you need a special function just for control page(s)?
> 
> We can revise the code to have common function for reclaiming page.

I interpret this as you will remove tdx_reclaim_control_page(), and have one
function to reclaim _ALL_ TDX private pages.

> 
> 
> > > > > How about this?
> > > > > 
> > > > > /*
> > > > >   * We need three SEAMCALLs, TDH.MNG.VPFLUSHDONE(), TDH.PHYMEM.CACHE.WB(), and
> > > > >   * TDH.MNG.KEY.FREEID() to free the HKID.
> > > > >   * Other threads can remove pages from TD.  When the HKID is assigned, we need
> > > > >   * to use TDH.MEM.SEPT.REMOVE() or TDH.MEM.PAGE.REMOVE().
> > > > >   * TDH.PHYMEM.PAGE.RECLAIM() is needed when the HKID is free.  Get lock to not
> > > > >   * present transient state of HKID.
> > > > >   */
> > > > 
> > > > Could you elaborate why it is still possible to have other thread removing
> > > > pages from TD?
> > > > 
> > > > I am probably missing something, but the thing I don't understand is why
> > > > this function is triggered by MMU release?  All the things done in this
> > > > function don't seem to be related to MMU at all.
> > > 
> > > The KVM releases EPT pages on MMU notifier release.  kvm_mmu_zap_all() does. If
> > > we follow that way, kvm_mmu_zap_all() zaps all the Secure-EPTs by
> > > TDH.MEM.SEPT.REMOVE() or TDH.MEM.PAGE.REMOVE().  Because
> > > TDH.MEM.{SEPT, PAGE}.REMOVE() is slow, we can free HKID before kvm_mmu_zap_all()
> > > to use TDH.PHYMEM.PAGE.RECLAIM().
> > 
> > Can you elaborate why TDH.MEM.{SEPT,PAGE}.REMOVE is slower than
> > TDH.PHYMEM.PAGE.RECLAIM()?
> > 
> > And does the difference matter in practice, i.e. did you see using the former
> > having noticeable performance downgrade?
> 
> Yes. With HKID alive, we have to assume that vcpu can run still. It means TLB
> shootdown. The difference is 2 extra SEAMCALL + IPI synchronization for each
> guest private page.  If the guest has hundreds of GB, the difference can be
> tens of minutes.
> 
> With HKID alive, we need to assume vcpu is alive.
> - TDH.MEM.PAGE.REMOVE()
> - TDH.PHYMEM.PAGE_WBINVD()
> - TLB shoot down
>   - TDH.MEM.TRACK()
>   - IPI to other vcpus
>   - wait for other vcpu to exit
> 
> After freeing HKID
> - TDH.PHYMEM.PAGE.RECLAIM()
>   We already flushed TLBs and memory cache.
> > > > 

[...]

> > 
> > Firstly, what kinda performance efficiency gain are we talking about?
> 
> 2 extra SEAMCALL + IPI sync for each guest private page.  If the guest memory
> is hundreds of GB, the difference can be tens of minutes.

[...]

> 
> 
> > We cannot really tell whether it can be justified to use two different methods
> > to tear down SEPT page because of this.
> > 
> > Even if it's worth to do, it is an optimization, which can/should be done later
> > after you have put all building blocks together.
> > 
> > That being said, you are putting too many logic in this patch, i.e., it just
> > doesn't make sense to release TDX keyID in the MMU code path in _this_ patch.
> 
> I agree that this patch is too huge, and that we should break it into smaller
> patches.

IMHO it's not only about breaking this into smaller pieces; you are also mixing
performance optimization and essential functionality together.

Moving the reclaim of the TDX private KeyID to the MMU notifier (in order to get
better performance when shutting down a TDX guest) depends on a lot of SEPT
details (how to reclaim a private page, TLB flushing, etc.), which haven't yet
been mentioned at all.

It's hard to review code like this.  

I think in this patch we should just put reclaiming the TDX keyID in the
"normal" place.  After you have done all the SEPT (and related) patches, you can
have a patch to improve the performance:

	KVM: TDX: Improve TDX guest shutdown latency

Then you put your performance data there, i.e., "tens of minutes difference for
a TDX guest with hundreds of GB of memory", to justify that patch.
Isaku Yamahata April 1, 2024, 10:55 p.m. UTC | #21
On Fri, Mar 29, 2024 at 02:22:12PM +0800,
Binbin Wu <binbin.wu@linux.intel.com> wrote:

> 
> 
> On 3/21/2024 10:17 PM, Isaku Yamahata wrote:
> > On Wed, Mar 20, 2024 at 01:12:01PM +0800,
> > Chao Gao <chao.gao@intel.com> wrote:
> > 
> > > > config KVM_SW_PROTECTED_VM
> > > > 	bool "Enable support for KVM software-protected VMs"
> > > > -	depends on EXPERT
> 
> This change is not needed, right?
> Since you intended to use KVM_GENERIC_PRIVATE_MEM, not KVM_SW_PROTECTED_VM.

Right. The fix will be something like the following.

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index f2bc78ceaa9a..e912b128bddb 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -77,6 +77,7 @@ config KVM_WERROR
 
 config KVM_SW_PROTECTED_VM
        bool "Enable support for KVM software-protected VMs"
+       depends on EXPERT
        depends on KVM && X86_64
        select KVM_GENERIC_PRIVATE_MEM
        help
@@ -90,7 +91,7 @@ config KVM_SW_PROTECTED_VM
 config KVM_INTEL
        tristate "KVM for Intel (and compatible) processors support"
        depends on KVM && IA32_FEAT_CTL
-       select KVM_SW_PROTECTED_VM if INTEL_TDX_HOST
+       select KVM_GENERIC_PRIVATE_MEM if INTEL_TDX_HOST
        select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST
        help
          Provides support for KVM on processors equipped with Intel's VT
Isaku Yamahata April 2, 2024, 6:16 a.m. UTC | #22
On Fri, Mar 29, 2024 at 03:25:47PM +0800,
Binbin Wu <binbin.wu@linux.intel.com> wrote:

> 
> 
> On 3/29/2024 4:39 AM, Isaku Yamahata wrote:
> 
> [...]
> > > > > > How about this?
> > > > > > 
> > > > > > /*
> > > > > >    * We need three SEAMCALLs, TDH.MNG.VPFLUSHDONE(), TDH.PHYMEM.CACHE.WB(), and
> > > > > >    * TDH.MNG.KEY.FREEID() to free the HKID.
> > > > > >    * Other threads can remove pages from TD.  When the HKID is assigned, we need
> > > > > >    * to use TDH.MEM.SEPT.REMOVE() or TDH.MEM.PAGE.REMOVE().
> > > > > >    * TDH.PHYMEM.PAGE.RECLAIM() is needed when the HKID is free.  Get lock to not
> > > > > >    * present transient state of HKID.
> > > > > >    */
> > > > > Could you elaborate why it is still possible to have other thread removing
> > > > > pages from TD?
> > > > > 
> > > > > I am probably missing something, but the thing I don't understand is why
> > > > > this function is triggered by MMU release?  All the things done in this
> > > > > function don't seem to be related to MMU at all.
> > > > The KVM releases EPT pages on MMU notifier release.  kvm_mmu_zap_all() does. If
> > > > we follow that way, kvm_mmu_zap_all() zaps all the Secure-EPTs by
> > > > TDH.MEM.SEPT.REMOVE() or TDH.MEM.PAGE.REMOVE().  Because
> > > > TDH.MEM.{SEPT, PAGE}.REMOVE() is slow, we can free HKID before kvm_mmu_zap_all()
> > > > to use TDH.PHYMEM.PAGE.RECLAIM().
> > > Can you elaborate why TDH.MEM.{SEPT,PAGE}.REMOVE is slower than
> > > TDH.PHYMEM.PAGE.RECLAIM()?
> > > 
> > > And does the difference matter in practice, i.e. did you see using the former
> > > having noticeable performance downgrade?
> > Yes. With HKID alive, we have to assume that vcpu can run still. It means TLB
> > shootdown. The difference is 2 extra SEAMCALL + IPI synchronization for each
> > guest private page.  If the guest has hundreds of GB, the difference can be
> > tens of minutes.
> > 
> > With HKID alive, we need to assume vcpu is alive.
> > - TDH.MEM.PAGE.REMOVE()
> > - TDH.PHYMEM.PAGE_WBINVD()
> > - TLB shoot down
> >    - TDH.MEM.TRACK()
> >    - IPI to other vcpus
> >    - wait for other vcpu to exit
> 
> Do we have a way to batch the TLB shoot down.
> IIUC, in current implementation, TLB shoot down needs to be done for each
> page remove, right?

That's right, because the TDP MMU allows multiple vcpus to operate on the EPT
concurrently.  Batching would make the logic more complex.  It's straightforward
to use the mmu notifier to know that we have started destroying the guest.
Isaku Yamahata April 3, 2024, 5:24 p.m. UTC | #23
On Thu, Mar 28, 2024 at 12:33:35PM +1300,
"Huang, Kai" <kai.huang@intel.com> wrote:

> > +	kvm_tdx->tdr_pa = tdr_pa;
> > +
> > +	for_each_online_cpu(i) {
> > +		int pkg = topology_physical_package_id(i);
> > +
> > +		if (cpumask_test_and_set_cpu(pkg, packages))
> > +			continue;
> > +
> > +		/*
> > +		 * Program the memory controller in the package with an
> > +		 * encryption key associated to a TDX private host key id
> > +		 * assigned to this TDR.  Concurrent operations on same memory
> > +		 * controller results in TDX_OPERAND_BUSY.  Avoid this race by
> > +		 * mutex.
> > +		 */
> 
> IIUC the race can only happen when you are creating multiple TDX guests
> simulatenously?  Please clarify this in the comment.
> 
> And I even don't think you need all these TDX module details:
> 
> 		/*
> 		 * Concurrent run of TDH.MNG.KEY.CONFIG on the same
> 		 * package resluts in TDX_OPERAND_BUSY.  When creating
> 		 * multiple TDX guests simultaneously this can run
> 		 * concurrently.  Take the per-package lock to
> 		 * serialize.
> 		 */

As pointed out by Chao, those mutexes will be dropped.
https://lore.kernel.org/kvm/ZfpwIespKy8qxWWE@chao-email/
Also, we would somehow simplify the cpu masks to track which packages are
online/offline and which cpu to use for each package.


> > +		mutex_lock(&tdx_mng_key_config_lock[pkg]);
> > +		ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
> > +				      &kvm_tdx->tdr_pa, true);
> > +		mutex_unlock(&tdx_mng_key_config_lock[pkg]);
> > +		if (ret)
> > +			break;
> > +	}
> > +	cpus_read_unlock();
> > +	free_cpumask_var(packages);
> > +	if (ret) {
> > +		i = 0;
> > +		goto teardown;
> > +	}
> > +
> > +	kvm_tdx->tdcs_pa = tdcs_pa;
> > +	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
> > +		err = tdh_mng_addcx(kvm_tdx->tdr_pa, tdcs_pa[i]);
> > +		if (err == TDX_RND_NO_ENTROPY) {
> > +			/* Here it's hard to allow userspace to retry. */
> > +			ret = -EBUSY;
> > +			goto teardown;
> > +		}
> > +		if (WARN_ON_ONCE(err)) {
> > +			pr_tdx_error(TDH_MNG_ADDCX, err, NULL);
> > +			ret = -EIO;
> > +			goto teardown;
> > +		}
> > +	}
> > +
> > +	/*
> > +	 * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires a dedicated
> > +	 * ioctl() to define the configure CPUID values for the TD.
> > +	 */
> 
> Then, how about renaming this function to __tdx_td_create()?

So do we also want to rename the ioctl for consistency?
i.e. KVM_TDX_INIT_VM => KVM_TDX_CREATE_VM.

I don't have a strong opinion on those names. Maybe
KVM_TDX_{INIT, CREATE, or CONFIG}_VM?
And we can rename the function to match it.

> > +	return 0;
> > +
> > +	/*
> > +	 * The sequence for freeing resources from a partially initialized TD
> > +	 * varies based on where in the initialization flow failure occurred.
> > +	 * Simply use the full teardown and destroy, which naturally play nice
> > +	 * with partial initialization.
> > +	 */
> > +teardown:
> > +	for (; i < tdx_info->nr_tdcs_pages; i++) {
> > +		if (tdcs_pa[i]) {
> > +			free_page((unsigned long)__va(tdcs_pa[i]));
> > +			tdcs_pa[i] = 0;
> > +		}
> > +	}
> > +	if (!kvm_tdx->tdcs_pa)
> > +		kfree(tdcs_pa);
> 
> The code to "free TDCS pages in a loop and free the array" is done below
> with duplicated code.  I am wondering whether we have way to eliminate one.
> 
> But I have lost track here, so perhaps we can review again after we split
> the patch to smaller pieces.

Surely we can simplify it.  Originally we had a spin lock, so I had to separate
the blocking memory allocation from its usage, hence this error cleanup path.
Now that it's a mutex, we can mix the page allocation with its usage.


> > +	tdx_mmu_release_hkid(kvm);
> > +	tdx_vm_free(kvm);
> > +	return ret;
> > +
> > +free_packages:
> > +	cpus_read_unlock();
> > +	free_cpumask_var(packages);
> > +free_tdcs:
> > +	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
> > +		if (tdcs_pa[i])
> > +			free_page((unsigned long)__va(tdcs_pa[i]));
> > +	}
> > +	kfree(tdcs_pa);
> > +	kvm_tdx->tdcs_pa = NULL;
> > +
> > +free_tdr:
> > +	if (tdr_pa)
> > +		free_page((unsigned long)__va(tdr_pa));
> > +	kvm_tdx->tdr_pa = 0;
> > +free_hkid:
> > +	if (is_hkid_assigned(kvm_tdx))
> > +		tdx_hkid_free(kvm_tdx);
> > +	return ret;
> > +}
> > +
> >   int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
> >   {
> >   	struct kvm_tdx_cmd tdx_cmd;
> > @@ -215,12 +664,13 @@ static int tdx_md_read(struct tdx_md_map *maps, int nr_maps)
> >   static int __init tdx_module_setup(void)
> >   {
> > -	u16 num_cpuid_config;
> > +	u16 num_cpuid_config, tdcs_base_size;
> >   	int ret;
> >   	u32 i;
> >   	struct tdx_md_map mds[] = {
> >   		TDX_MD_MAP(NUM_CPUID_CONFIG, &num_cpuid_config),
> > +		TDX_MD_MAP(TDCS_BASE_SIZE, &tdcs_base_size),
> >   	};
> >   	struct tdx_metadata_field_mapping fields[] = {
> > @@ -273,6 +723,8 @@ static int __init tdx_module_setup(void)
> >   		c->edx = ecx_edx >> 32;
> >   	}
> > +	tdx_info->nr_tdcs_pages = tdcs_base_size / PAGE_SIZE;
> > +
> 
> Round up the 'tdcs_base_size' to make sure you have enough room, or put a
> WARN() here if not page aligned?

Ok, will add a round up. Same for tdvps_base_size.
I can't find anything about the relation between those sizes and the page size
in the TDX spec.  Although TDH.MNG.ADDCX() and TDH.VP.ADDCX() imply that those
sizes are multiples of PAGE_SIZE, the spec doesn't guarantee it.  I think a
silent round up is better than a WARN() because we can do nothing about the
values the TDX module provides.
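
Something like this, just as a sketch of the silent round up mentioned above:

-	tdx_info->nr_tdcs_pages = tdcs_base_size / PAGE_SIZE;
+	/* TDCS_BASE_SIZE is not guaranteed to be a multiple of PAGE_SIZE. */
+	tdx_info->nr_tdcs_pages = DIV_ROUND_UP(tdcs_base_size, PAGE_SIZE);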



> >   	return 0;
> >   error_out:
> > @@ -319,13 +771,27 @@ int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
> >   	struct tdx_enabled enable = {
> >   		.err = ATOMIC_INIT(0),
> >   	};
> > +	int max_pkgs;
> >   	int r = 0;
> > +	int i;
> 
> Nit: you can put the 3 into one line.
> 
> > +	if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
> > +		pr_warn("MOVDIR64B is reqiured for TDX\n");
> 
> It's better to make it more clear:
> 
> "Disable TDX: MOVDIR64B is not supported or disabled by the kernel."
> 
> Or, to match below:
> 
> "Cannot enable TDX w/o MOVDIR64B".

Ok.


> > +		return -EOPNOTSUPP;
> > +	}
> >   	if (!enable_ept) {
> >   		pr_warn("Cannot enable TDX with EPT disabled\n");
> >   		return -EINVAL;
> >   	}
> > +	max_pkgs = topology_max_packages();
> > +	tdx_mng_key_config_lock = kcalloc(max_pkgs, sizeof(*tdx_mng_key_config_lock),
> > +				   GFP_KERNEL);
> > +	if (!tdx_mng_key_config_lock)
> > +		return -ENOMEM;
> > +	for (i = 0; i < max_pkgs; i++)
> > +		mutex_init(&tdx_mng_key_config_lock[i]);
> > +
> 
> Using a per-socket lock looks a little bit overkill to me.  I don't know
> whether we need to do in the initial version.  Will leave to others.
> 
> Please at least add a comment to explain this is for better performance when
> creating multiple TDX guests IIUC?

Will delete the mutex and simplify the related logic.
Kai Huang April 3, 2024, 10:13 p.m. UTC | #24
On 22/03/2024 3:17 am, Yamahata, Isaku wrote:
>>> +
>>> +	for_each_online_cpu(i) {
>>> +		int pkg = topology_physical_package_id(i);
>>> +
>>> +		if (cpumask_test_and_set_cpu(pkg, packages))
>>> +			continue;
>>> +
>>> +		/*
>>> +		 * Program the memory controller in the package with an
>>> +		 * encryption key associated to a TDX private host key id
>>> +		 * assigned to this TDR.  Concurrent operations on same memory
>>> +		 * controller results in TDX_OPERAND_BUSY.  Avoid this race by
>>> +		 * mutex.
>>> +		 */
>>> +		mutex_lock(&tdx_mng_key_config_lock[pkg]);
>> the lock is superfluous to me. with cpu lock held, even if multiple CPUs try to
>> create TDs, the same set of CPUs (the first online CPU of each package) will be
>> selected to configure the key because of the cpumask_test_and_set_cpu() above.
>> it means, we never have two CPUs in the same socket trying to program the key,
>> i.e., no concurrent calls.
> Makes sense. Will drop the lock.

Hmm.. Skipping in cpumask_test_and_set_cpu() would result in the second 
TDH.MNG.KEY.CONFIG not being done for the second VM.  No?
Kai Huang April 3, 2024, 10:26 p.m. UTC | #25
On 4/04/2024 6:24 am, Yamahata, Isaku wrote:
> On Thu, Mar 28, 2024 at 12:33:35PM +1300,
> "Huang, Kai" <kai.huang@intel.com> wrote:
> 
>>> +	kvm_tdx->tdr_pa = tdr_pa;
>>> +
>>> +	for_each_online_cpu(i) {
>>> +		int pkg = topology_physical_package_id(i);
>>> +
>>> +		if (cpumask_test_and_set_cpu(pkg, packages))
>>> +			continue;
>>> +
>>> +		/*
>>> +		 * Program the memory controller in the package with an
>>> +		 * encryption key associated to a TDX private host key id
>>> +		 * assigned to this TDR.  Concurrent operations on same memory
>>> +		 * controller results in TDX_OPERAND_BUSY.  Avoid this race by
>>> +		 * mutex.
>>> +		 */
>>
>> IIUC the race can only happen when you are creating multiple TDX guests
>> simulatenously?  Please clarify this in the comment.
>>
>> And I even don't think you need all these TDX module details:
>>
>> 		/*
>> 		 * Concurrent run of TDH.MNG.KEY.CONFIG on the same
>> 		 * package resluts in TDX_OPERAND_BUSY.  When creating
>> 		 * multiple TDX guests simultaneously this can run
>> 		 * concurrently.  Take the per-package lock to
>> 		 * serialize.
>> 		 */
> 
> As pointed by Chao, those mutex will be dropped.
> https://lore.kernel.org/kvm/ZfpwIespKy8qxWWE@chao-email/
> Also we would simplify cpu masks to track which package is online/offline,
> which cpu to use for each package somehow.

Please see my reply there.  I might be missing something, though.

> 
> 
>>> +		mutex_lock(&tdx_mng_key_config_lock[pkg]);
>>> +		ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
>>> +				      &kvm_tdx->tdr_pa, true);
>>> +		mutex_unlock(&tdx_mng_key_config_lock[pkg]);
>>> +		if (ret)
>>> +			break;
>>> +	}
>>> +	cpus_read_unlock();
>>> +	free_cpumask_var(packages);
>>> +	if (ret) {
>>> +		i = 0;
>>> +		goto teardown;
>>> +	}
>>> +
>>> +	kvm_tdx->tdcs_pa = tdcs_pa;
>>> +	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
>>> +		err = tdh_mng_addcx(kvm_tdx->tdr_pa, tdcs_pa[i]);
>>> +		if (err == TDX_RND_NO_ENTROPY) {
>>> +			/* Here it's hard to allow userspace to retry. */
>>> +			ret = -EBUSY;
>>> +			goto teardown;
>>> +		}
>>> +		if (WARN_ON_ONCE(err)) {
>>> +			pr_tdx_error(TDH_MNG_ADDCX, err, NULL);
>>> +			ret = -EIO;
>>> +			goto teardown;
>>> +		}
>>> +	}
>>> +
>>> +	/*
>>> +	 * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires a dedicated
>>> +	 * ioctl() to define the configure CPUID values for the TD.
>>> +	 */
>>
>> Then, how about renaming this function to __tdx_td_create()?
> 
> So do we want to rename also ioctl name for consistency?
> i.e. KVM_TDX_INIT_VM => KVM_TDX_CREATE_VM.

Hmm.. but this __tdx_td_create() (the __tdx_td_init() in this patch) is
called via kvm_x86_ops->vm_init(), not via an IOCTL()?

If I read correctly, only TDH.MNG.INIT is called via IOCTL(); in that
sense it makes more sense to name the IOCTL() KVM_TDX_INIT_VM.

> 
> I don't have strong opinion those names. Maybe
> KVM_TDX_{INIT, CREATE, or CONFIG}_VM?
> And we can rename the function name to match it.
>
Chao Gao April 4, 2024, 1:03 a.m. UTC | #26
On Thu, Apr 04, 2024 at 11:13:49AM +1300, Huang, Kai wrote:
>
>
>On 22/03/2024 3:17 am, Yamahata, Isaku wrote:
>> > > +
>> > > +	for_each_online_cpu(i) {
>> > > +		int pkg = topology_physical_package_id(i);
>> > > +
>> > > +		if (cpumask_test_and_set_cpu(pkg, packages))
>> > > +			continue;
>> > > +
>> > > +		/*
>> > > +		 * Program the memory controller in the package with an
>> > > +		 * encryption key associated to a TDX private host key id
>> > > +		 * assigned to this TDR.  Concurrent operations on same memory
>> > > +		 * controller results in TDX_OPERAND_BUSY.  Avoid this race by
>> > > +		 * mutex.
>> > > +		 */
>> > > +		mutex_lock(&tdx_mng_key_config_lock[pkg]);
>> > the lock is superfluous to me. with cpu lock held, even if multiple CPUs try to
>> > create TDs, the same set of CPUs (the first online CPU of each package) will be
>> > selected to configure the key because of the cpumask_test_and_set_cpu() above.
>> > it means, we never have two CPUs in the same socket trying to program the key,
>> > i.e., no concurrent calls.
>> Makes sense. Will drop the lock.
>
>Hmm.. Skipping in cpumask_test_and_set_cpu() would result in the second
>TDH.MNG.KEY.CONFIG not being done for the second VM.  No?

No. Because @packages isn't shared between VMs.
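
To spell it out with the (trimmed) code from __tdx_td_init() in this patch:
@packages is a local cpumask allocated per call, so every VM's key-config loop
starts from an empty mask and still issues TDH.MNG.KEY.CONFIG once per package:

	zalloc_cpumask_var(&packages, GFP_KERNEL);
	for_each_online_cpu(i) {
		int pkg = topology_physical_package_id(i);

		/* Only skips packages already configured for *this* TD. */
		if (cpumask_test_and_set_cpu(pkg, packages))
			continue;
		ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
				      &kvm_tdx->tdr_pa, true);
	}
	free_cpumask_var(packages);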
Kai Huang April 4, 2024, 1:24 a.m. UTC | #27
On 4/04/2024 2:03 pm, Chao Gao wrote:
> On Thu, Apr 04, 2024 at 11:13:49AM +1300, Huang, Kai wrote:
>>
>>
>> On 22/03/2024 3:17 am, Yamahata, Isaku wrote:
>>>>> +
>>>>> +	for_each_online_cpu(i) {
>>>>> +		int pkg = topology_physical_package_id(i);
>>>>> +
>>>>> +		if (cpumask_test_and_set_cpu(pkg, packages))
>>>>> +			continue;
>>>>> +
>>>>> +		/*
>>>>> +		 * Program the memory controller in the package with an
>>>>> +		 * encryption key associated to a TDX private host key id
>>>>> +		 * assigned to this TDR.  Concurrent operations on same memory
>>>>> +		 * controller results in TDX_OPERAND_BUSY.  Avoid this race by
>>>>> +		 * mutex.
>>>>> +		 */
>>>>> +		mutex_lock(&tdx_mng_key_config_lock[pkg]);
>>>> the lock is superfluous to me. with cpu lock held, even if multiple CPUs try to
>>>> create TDs, the same set of CPUs (the first online CPU of each package) will be
>>>> selected to configure the key because of the cpumask_test_and_set_cpu() above.
>>>> it means, we never have two CPUs in the same socket trying to program the key,
>>>> i.e., no concurrent calls.
>>> Makes sense. Will drop the lock.
>>
>> Hmm.. Skipping in cpumask_test_and_set_cpu() would result in the second
>> TDH.MNG.KEY.CONFIG not being done for the second VM.  No?
> 
> No. Because @packages isn't shared between VMs.

I see.  Thanks.
Xiaoyao Li April 15, 2024, 8:17 a.m. UTC | #28
On 2/26/2024 4:25 PM, isaku.yamahata@intel.com wrote:

...

> +
> +	kvm_tdx->tdcs_pa = tdcs_pa;
> +	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
> +		err = tdh_mng_addcx(kvm_tdx->tdr_pa, tdcs_pa[i]);
> +		if (err == TDX_RND_NO_ENTROPY) {
> +			/* Here it's hard to allow userspace to retry. */
> +			ret = -EBUSY;

So userspace is expected to stop creating the TD and quit on this?

If so, it exposes a DoS attack surface: malicious users elsewhere can drain
the entropy with a busy-loop on RDSEED.

Can you clarify why it's hard to allow userspace to retry? To me, it's
OK to retry: "teardown" cleans everything up, and userspace can then
issue the KVM_TDX_INIT_VM again.

> +			goto teardown;
> +		}
> +		if (WARN_ON_ONCE(err)) {
> +			pr_tdx_error(TDH_MNG_ADDCX, err, NULL);
> +			ret = -EIO;
> +			goto teardown;
> +		}
> +	}
> +
> +	/*
> +	 * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires a dedicated
> +	 * ioctl() to define the configure CPUID values for the TD.
> +	 */
> +	return 0;
> +
> +	/*
> +	 * The sequence for freeing resources from a partially initialized TD
> +	 * varies based on where in the initialization flow failure occurred.
> +	 * Simply use the full teardown and destroy, which naturally play nice
> +	 * with partial initialization.
> +	 */
> +teardown:
> +	for (; i < tdx_info->nr_tdcs_pages; i++) {
> +		if (tdcs_pa[i]) {
> +			free_page((unsigned long)__va(tdcs_pa[i]));
> +			tdcs_pa[i] = 0;
> +		}
> +	}
> +	if (!kvm_tdx->tdcs_pa)
> +		kfree(tdcs_pa);
> +	tdx_mmu_release_hkid(kvm);
> +	tdx_vm_free(kvm);
> +	return ret;
> +
> +free_packages:
> +	cpus_read_unlock();
> +	free_cpumask_var(packages);
> +free_tdcs:
> +	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
> +		if (tdcs_pa[i])
> +			free_page((unsigned long)__va(tdcs_pa[i]));
> +	}
> +	kfree(tdcs_pa);
> +	kvm_tdx->tdcs_pa = NULL;
> +
> +free_tdr:
> +	if (tdr_pa)
> +		free_page((unsigned long)__va(tdr_pa));
> +	kvm_tdx->tdr_pa = 0;
> +free_hkid:
> +	if (is_hkid_assigned(kvm_tdx))
> +		tdx_hkid_free(kvm_tdx);
> +	return ret;
> +}
> +
Isaku Yamahata April 16, 2024, 4:40 p.m. UTC | #29
On Mon, Apr 15, 2024 at 04:17:35PM +0800,
Xiaoyao Li <xiaoyao.li@intel.com> wrote:

> On 2/26/2024 4:25 PM, isaku.yamahata@intel.com wrote:
> 
> ...
> 
> > +
> > +	kvm_tdx->tdcs_pa = tdcs_pa;
> > +	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
> > +		err = tdh_mng_addcx(kvm_tdx->tdr_pa, tdcs_pa[i]);
> > +		if (err == TDX_RND_NO_ENTROPY) {
> > +			/* Here it's hard to allow userspace to retry. */
> > +			ret = -EBUSY;
> 
> So userspace is expected to stop creating TD and quit on this?
> 
> If so, it exposes an DOS attack surface that malicious users in another can
> drain the entropy with busy-loop on RDSEED.
> 
> Can you clarify why it's hard to allow userspace to retry? To me, it's OK to
> retry that "teardown" cleans everything up, and userspace and issue the
> KVM_TDX_INIT_VM again.

The current patch has a complicated error recovery path.  After simplifying
the code, it would be possible to return -EAGAIN in this patch.

For the retry case, we need to avoid re-running TDH.MNG.CREATE() and
TDH.MNG.KEY.CONFIG().
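
A minimal sketch of a retry-safe flow (illustrative only, not from this patch;
it assumes the already-allocated resources and kvm_tdx state are kept across
the failed attempt, and tdx_td_key_config() is a hypothetical helper wrapping
the per-package TDH.MNG.KEY.CONFIG loop):

	/* Skip the one-shot steps when userspace retries after -EAGAIN. */
	if (!kvm_tdx->tdr_pa) {
		err = tdh_mng_create(tdr_pa, kvm_tdx->hkid);
		if (err == TDX_RND_NO_ENTROPY)
			return -EAGAIN;
		if (WARN_ON_ONCE(err))
			return -EIO;
		kvm_tdx->tdr_pa = tdr_pa;

		/* Key configuration only after a successful create. */
		ret = tdx_td_key_config(kvm_tdx);
		if (ret)
			return ret;
	}
	/* TDH.MNG.ADDCX of any not-yet-added TDCS pages can simply be retried. */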
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 1b0dacc6b6c0..97f0a681e02c 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -24,7 +24,9 @@  KVM_X86_OP(is_vm_type_supported)
 KVM_X86_OP_OPTIONAL(max_vcpus);
 KVM_X86_OP_OPTIONAL(vm_enable_cap)
 KVM_X86_OP(vm_init)
+KVM_X86_OP_OPTIONAL(flush_shadow_all_private)
 KVM_X86_OP_OPTIONAL(vm_destroy)
+KVM_X86_OP_OPTIONAL(vm_free)
 KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate)
 KVM_X86_OP(vcpu_create)
 KVM_X86_OP(vcpu_free)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index cf8eb46b3a20..6dedb5cb71ef 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1608,7 +1608,9 @@  struct kvm_x86_ops {
 	unsigned int vm_size;
 	int (*vm_enable_cap)(struct kvm *kvm, struct kvm_enable_cap *cap);
 	int (*vm_init)(struct kvm *kvm);
+	void (*flush_shadow_all_private)(struct kvm *kvm);
 	void (*vm_destroy)(struct kvm *kvm);
+	void (*vm_free)(struct kvm *kvm);
 
 	/* Create, but do not attach this VCPU */
 	int (*vcpu_precreate)(struct kvm *kvm);
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 87e3da7b0439..bc077d6f4b43 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -76,7 +76,6 @@  config KVM_WERROR
 
 config KVM_SW_PROTECTED_VM
 	bool "Enable support for KVM software-protected VMs"
-	depends on EXPERT
 	depends on KVM && X86_64
 	select KVM_GENERIC_PRIVATE_MEM
 	help
@@ -89,6 +88,8 @@  config KVM_SW_PROTECTED_VM
 config KVM_INTEL
 	tristate "KVM for Intel (and compatible) processors support"
 	depends on KVM && IA32_FEAT_CTL
+	select KVM_SW_PROTECTED_VM if INTEL_TDX_HOST
+	select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST
 	help
 	  Provides support for KVM on processors equipped with Intel's VT
 	  extensions, a.k.a. Virtual Machine Extensions (VMX).
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index c45252ed2ffd..2becc86c71b2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6866,6 +6866,13 @@  static void kvm_mmu_zap_all(struct kvm *kvm)
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
+	/*
+	 * kvm_mmu_zap_all() zaps both private and shared page tables.  Before
+	 * tearing down private page tables, TDX requires some TD resources to
+	 * be destroyed (i.e. keyID must have been reclaimed, etc).  Invoke
+	 * kvm_x86_flush_shadow_all_private() for this.
+	 */
+	static_call_cond(kvm_x86_flush_shadow_all_private)(kvm);
 	kvm_mmu_zap_all(kvm);
 }
 
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index e8a1a7533eea..437c6d5e802e 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -62,11 +62,31 @@  static int vt_vm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 static int vt_vm_init(struct kvm *kvm)
 {
 	if (is_td(kvm))
-		return -EOPNOTSUPP;	/* Not ready to create guest TD yet. */
+		return tdx_vm_init(kvm);
 
 	return vmx_vm_init(kvm);
 }
 
+static void vt_flush_shadow_all_private(struct kvm *kvm)
+{
+	if (is_td(kvm))
+		tdx_mmu_release_hkid(kvm);
+}
+
+static void vt_vm_destroy(struct kvm *kvm)
+{
+	if (is_td(kvm))
+		return;
+
+	vmx_vm_destroy(kvm);
+}
+
+static void vt_vm_free(struct kvm *kvm)
+{
+	if (is_td(kvm))
+		tdx_vm_free(kvm);
+}
+
 static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
 {
 	if (!is_td(kvm))
@@ -101,7 +121,9 @@  struct kvm_x86_ops vt_x86_ops __initdata = {
 	.vm_size = sizeof(struct kvm_vmx),
 	.vm_enable_cap = vt_vm_enable_cap,
 	.vm_init = vt_vm_init,
-	.vm_destroy = vmx_vm_destroy,
+	.flush_shadow_all_private = vt_flush_shadow_all_private,
+	.vm_destroy = vt_vm_destroy,
+	.vm_free = vt_vm_free,
 
 	.vcpu_precreate = vmx_vcpu_precreate,
 	.vcpu_create = vmx_vcpu_create,
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index ee015f3ce2c9..1cf2b15da257 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -5,10 +5,11 @@ 
 
 #include "capabilities.h"
 #include "x86_ops.h"
-#include "x86.h"
 #include "mmu.h"
 #include "tdx_arch.h"
 #include "tdx.h"
+#include "tdx_ops.h"
+#include "x86.h"
 
 #undef pr_fmt
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -22,7 +23,7 @@ 
 /* TDX KeyID pool */
 static DEFINE_IDA(tdx_guest_keyid_pool);
 
-static int __used tdx_guest_keyid_alloc(void)
+static int tdx_guest_keyid_alloc(void)
 {
 	if (WARN_ON_ONCE(!tdx_guest_keyid_start || !tdx_nr_guest_keyids))
 		return -EINVAL;
@@ -32,7 +33,7 @@  static int __used tdx_guest_keyid_alloc(void)
 			       GFP_KERNEL);
 }
 
-static void __used tdx_guest_keyid_free(int keyid)
+static void tdx_guest_keyid_free(int keyid)
 {
 	if (WARN_ON_ONCE(keyid < tdx_guest_keyid_start ||
 			 keyid > tdx_guest_keyid_start + tdx_nr_guest_keyids - 1))
@@ -48,6 +49,8 @@  struct tdx_info {
 	u64 xfam_fixed0;
 	u64 xfam_fixed1;
 
+	u8 nr_tdcs_pages;
+
 	u16 num_cpuid_config;
 	/* This must the last member. */
 	DECLARE_FLEX_ARRAY(struct kvm_tdx_cpuid_config, cpuid_configs);
@@ -85,6 +88,282 @@  int tdx_vm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 	return r;
 }
 
+/*
+ * Some TDX SEAMCALLs (TDH.MNG.CREATE, TDH.PHYMEM.CACHE.WB,
+ * TDH.MNG.KEY.RECLAIMID, TDH.MNG.KEY.FREEID etc) tries to acquire a global lock
+ * internally in TDX module.  If failed, TDX_OPERAND_BUSY is returned without
+ * spinning or waiting due to a constraint on execution time.  It's caller's
+ * responsibility to avoid race (or retry on TDX_OPERAND_BUSY).  Use this mutex
+ * to avoid race in TDX module because the kernel knows better about scheduling.
+ */
+static DEFINE_MUTEX(tdx_lock);
+static struct mutex *tdx_mng_key_config_lock;
+
+static __always_inline hpa_t set_hkid_to_hpa(hpa_t pa, u16 hkid)
+{
+	return pa | ((hpa_t)hkid << boot_cpu_data.x86_phys_bits);
+}
+
+static inline bool is_td_created(struct kvm_tdx *kvm_tdx)
+{
+	return kvm_tdx->tdr_pa;
+}
+
+static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
+{
+	tdx_guest_keyid_free(kvm_tdx->hkid);
+	kvm_tdx->hkid = -1;
+}
+
+static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
+{
+	return kvm_tdx->hkid > 0;
+}
+
+static void tdx_clear_page(unsigned long page_pa)
+{
+	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
+	void *page = __va(page_pa);
+	unsigned long i;
+
+	/*
+	 * When re-assign one page from old keyid to a new keyid, MOVDIR64B is
+	 * required to clear/write the page with new keyid to prevent integrity
+	 * error when read on the page with new keyid.
+	 *
+	 * clflush doesn't flush cache with HKID set.  The cache line could be
+	 * poisoned (even without MKTME-i), clear the poison bit.
+	 */
+	for (i = 0; i < PAGE_SIZE; i += 64)
+		movdir64b(page + i, zero_page);
+	/*
+	 * MOVDIR64B store uses WC buffer.  Prevent following memory reads
+	 * from seeing potentially poisoned cache.
+	 */
+	__mb();
+}
+
+static int __tdx_reclaim_page(hpa_t pa)
+{
+	struct tdx_module_args out;
+	u64 err;
+
+	do {
+		err = tdh_phymem_page_reclaim(pa, &out);
+		/*
+		 * TDH.PHYMEM.PAGE.RECLAIM is allowed only when the TD is in
+		 * shutdown state, i.e. the TD is being destroyed.
+		 * TDH.PHYMEM.PAGE.RECLAIM requires TDR and target page.
+		 * Because we're destructing TD, it's rare to contend with TDR.
+		 */
+	} while (unlikely(err == (TDX_OPERAND_BUSY | TDX_OPERAND_ID_RCX) ||
+			  err == (TDX_OPERAND_BUSY | TDX_OPERAND_ID_TDR)));
+	if (WARN_ON_ONCE(err)) {
+		pr_tdx_error(TDH_PHYMEM_PAGE_RECLAIM, err, &out);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int tdx_reclaim_page(hpa_t pa)
+{
+	int r;
+
+	r = __tdx_reclaim_page(pa);
+	if (!r)
+		tdx_clear_page(pa);
+	return r;
+}
+
+static void tdx_reclaim_control_page(unsigned long td_page_pa)
+{
+	WARN_ON_ONCE(!td_page_pa);
+
+	/*
+	 * TDCX are being reclaimed.  TDX module maps TDCX with HKID
+	 * assigned to the TD.  Here the cache associated to the TD
+	 * was already flushed by TDH.PHYMEM.CACHE.WB before here, So
+	 * cache doesn't need to be flushed again.
+	 */
+	if (tdx_reclaim_page(td_page_pa))
+		/*
+		 * Leak the page on failure:
+		 * tdx_reclaim_page() returns an error if and only if there's an
+		 * unexpected, fatal error, e.g. a SEAMCALL with bad params,
+		 * incorrect concurrency in KVM, a TDX Module bug, etc.
+		 * Retrying at a later point is highly unlikely to be
+		 * successful.
+		 * No log here as tdx_reclaim_page() already did.
+		 */
+		return;
+	free_page((unsigned long)__va(td_page_pa));
+}
+
+static void tdx_do_tdh_phymem_cache_wb(void *unused)
+{
+	u64 err = 0;
+
+	do {
+		err = tdh_phymem_cache_wb(!!err);
+	} while (err == TDX_INTERRUPTED_RESUMABLE);
+
+	/* Other thread may have done for us. */
+	if (err == TDX_NO_HKID_READY_TO_WBCACHE)
+		err = TDX_SUCCESS;
+	if (WARN_ON_ONCE(err))
+		pr_tdx_error(TDH_PHYMEM_CACHE_WB, err, NULL);
+}
+
+void tdx_mmu_release_hkid(struct kvm *kvm)
+{
+	bool packages_allocated, targets_allocated;
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	cpumask_var_t packages, targets;
+	u64 err;
+	int i;
+
+	if (!is_hkid_assigned(kvm_tdx))
+		return;
+
+	if (!is_td_created(kvm_tdx)) {
+		tdx_hkid_free(kvm_tdx);
+		return;
+	}
+
+	packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
+	targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
+	cpus_read_lock();
+
+	/*
+	 * We can destroy multiple guest TDs simultaneously.  Prevent
+	 * tdh_phymem_cache_wb from returning TDX_BUSY by serialization.
+	 */
+	mutex_lock(&tdx_lock);
+
+	/*
+	 * Go through multiple TDX HKID state transitions with three SEAMCALLs
+	 * to make TDH.PHYMEM.PAGE.RECLAIM() usable.  Make the transition atomic
+	 * to other functions to operate private pages and Secure-EPT pages.
+	 *
+	 * Avoid race for kvm_gmem_release() to call kvm_mmu_unmap_gfn_range().
+	 * This function is called via mmu notifier, mmu_release().
+	 * kvm_gmem_release() is called via fput() on process exit.
+	 */
+	write_lock(&kvm->mmu_lock);
+
+	for_each_online_cpu(i) {
+		if (packages_allocated &&
+		    cpumask_test_and_set_cpu(topology_physical_package_id(i),
+					     packages))
+			continue;
+		if (targets_allocated)
+			cpumask_set_cpu(i, targets);
+	}
+	if (targets_allocated)
+		on_each_cpu_mask(targets, tdx_do_tdh_phymem_cache_wb, NULL, true);
+	else
+		on_each_cpu(tdx_do_tdh_phymem_cache_wb, NULL, true);
+	/*
+	 * In the case of error in tdx_do_tdh_phymem_cache_wb(), the following
+	 * tdh_mng_key_freeid() will fail.
+	 */
+	err = tdh_mng_key_freeid(kvm_tdx->tdr_pa);
+	if (WARN_ON_ONCE(err)) {
+		pr_tdx_error(TDH_MNG_KEY_FREEID, err, NULL);
+		pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
+		       kvm_tdx->hkid);
+	} else
+		tdx_hkid_free(kvm_tdx);
+
+	write_unlock(&kvm->mmu_lock);
+	mutex_unlock(&tdx_lock);
+	cpus_read_unlock();
+	free_cpumask_var(targets);
+	free_cpumask_var(packages);
+}
+
+void tdx_vm_free(struct kvm *kvm)
+{
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	u64 err;
+	int i;
+
+	/*
+	 * tdx_mmu_release_hkid() failed to reclaim HKID.  Something went wrong
+	 * heavily with TDX module.  Give up freeing TD pages.  As the function
+	 * already warned, don't warn it again.
+	 */
+	if (is_hkid_assigned(kvm_tdx))
+		return;
+
+	if (kvm_tdx->tdcs_pa) {
+		for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
+			if (kvm_tdx->tdcs_pa[i])
+				tdx_reclaim_control_page(kvm_tdx->tdcs_pa[i]);
+		}
+		kfree(kvm_tdx->tdcs_pa);
+		kvm_tdx->tdcs_pa = NULL;
+	}
+
+	if (!kvm_tdx->tdr_pa)
+		return;
+	if (__tdx_reclaim_page(kvm_tdx->tdr_pa))
+		return;
+	/*
+	 * TDX module maps TDR with TDX global HKID.  TDX module may access TDR
+	 * while operating on TD (Especially reclaiming TDCS).  Cache flush with
+	 * TDX global HKID is needed.
+	 */
+	err = tdh_phymem_page_wbinvd(set_hkid_to_hpa(kvm_tdx->tdr_pa,
+						     tdx_global_keyid));
+	if (WARN_ON_ONCE(err)) {
+		pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err, NULL);
+		return;
+	}
+	tdx_clear_page(kvm_tdx->tdr_pa);
+
+	free_page((unsigned long)__va(kvm_tdx->tdr_pa));
+	kvm_tdx->tdr_pa = 0;
+}
+
+static int tdx_do_tdh_mng_key_config(void *param)
+{
+	hpa_t *tdr_p = param;
+	u64 err;
+
+	do {
+		err = tdh_mng_key_config(*tdr_p);
+
+		/*
+		 * If it failed to generate a random key, retry it because this
+		 * is typically caused by an entropy error of the CPU's random
+		 * number generator.
+		 */
+	} while (err == TDX_KEY_GENERATION_FAILED);
+
+	if (WARN_ON_ONCE(err)) {
+		pr_tdx_error(TDH_MNG_KEY_CONFIG, err, NULL);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int __tdx_td_init(struct kvm *kvm);
+
+int tdx_vm_init(struct kvm *kvm)
+{
+	/*
+	 * TDX has its own limit of the number of vcpus in addition to
+	 * KVM_MAX_VCPUS.
+	 */
+	kvm->max_vcpus = min(kvm->max_vcpus, TDX_MAX_VCPUS);
+
+	/* Place holder for TDX specific logic. */
+	return __tdx_td_init(kvm);
+}
+
 static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
 {
 	struct kvm_tdx_capabilities __user *user_caps;
@@ -137,6 +416,176 @@  static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
 	return ret;
 }
 
+static int __tdx_td_init(struct kvm *kvm)
+{
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	cpumask_var_t packages;
+	unsigned long *tdcs_pa = NULL;
+	unsigned long tdr_pa = 0;
+	unsigned long va;
+	int ret, i;
+	u64 err;
+
+	ret = tdx_guest_keyid_alloc();
+	if (ret < 0)
+		return ret;
+	kvm_tdx->hkid = ret;
+
+	va = __get_free_page(GFP_KERNEL_ACCOUNT);
+	if (!va)
+		goto free_hkid;
+	tdr_pa = __pa(va);
+
+	tdcs_pa = kcalloc(tdx_info->nr_tdcs_pages, sizeof(*kvm_tdx->tdcs_pa),
+			  GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (!tdcs_pa)
+		goto free_tdr;
+	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
+		va = __get_free_page(GFP_KERNEL_ACCOUNT);
+		if (!va)
+			goto free_tdcs;
+		tdcs_pa[i] = __pa(va);
+	}
+
+	if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) {
+		ret = -ENOMEM;
+		goto free_tdcs;
+	}
+	cpus_read_lock();
+	/*
+	 * Need at least one CPU of the package to be online in order to
+	 * program all packages for host key id.  Check it.
+	 */
+	for_each_present_cpu(i)
+		cpumask_set_cpu(topology_physical_package_id(i), packages);
+	for_each_online_cpu(i)
+		cpumask_clear_cpu(topology_physical_package_id(i), packages);
+	if (!cpumask_empty(packages)) {
+		ret = -EIO;
+		/*
+		 * Because it's hard for human operator to figure out the
+		 * reason, warn it.
+		 */
+#define MSG_ALLPKG	"All packages need to have online CPU to create TD. Online CPU and retry.\n"
+		pr_warn_ratelimited(MSG_ALLPKG);
+		goto free_packages;
+	}
+
+	/*
+	 * Acquire global lock to avoid TDX_OPERAND_BUSY:
+	 * TDH.MNG.CREATE and other APIs try to lock the global Key Owner
+	 * Table (KOT) to track the assigned TDX private HKID.  It doesn't spin
+	 * to acquire the lock, returns TDX_OPERAND_BUSY instead, and let the
+	 * caller to handle the contention.  This is because of time limitation
+	 * usable inside the TDX module and OS/VMM knows better about process
+	 * scheduling.
+	 *
+	 * APIs to acquire the lock of KOT:
+	 * TDH.MNG.CREATE, TDH.MNG.KEY.FREEID, TDH.MNG.VPFLUSHDONE, and
+	 * TDH.PHYMEM.CACHE.WB.
+	 */
+	mutex_lock(&tdx_lock);
+	err = tdh_mng_create(tdr_pa, kvm_tdx->hkid);
+	mutex_unlock(&tdx_lock);
+	if (err == TDX_RND_NO_ENTROPY) {
+		ret = -EAGAIN;
+		goto free_packages;
+	}
+	if (WARN_ON_ONCE(err)) {
+		pr_tdx_error(TDH_MNG_CREATE, err, NULL);
+		ret = -EIO;
+		goto free_packages;
+	}
+	kvm_tdx->tdr_pa = tdr_pa;
+
+	for_each_online_cpu(i) {
+		int pkg = topology_physical_package_id(i);
+
+		if (cpumask_test_and_set_cpu(pkg, packages))
+			continue;
+
+		/*
+		 * Program the memory controller in the package with an
+		 * encryption key associated to a TDX private host key id
+		 * assigned to this TDR.  Concurrent operations on same memory
+		 * controller results in TDX_OPERAND_BUSY.  Avoid this race by
+		 * mutex.
+		 */
+		mutex_lock(&tdx_mng_key_config_lock[pkg]);
+		ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
+				      &kvm_tdx->tdr_pa, true);
+		mutex_unlock(&tdx_mng_key_config_lock[pkg]);
+		if (ret)
+			break;
+	}
+	cpus_read_unlock();
+	free_cpumask_var(packages);
+	if (ret) {
+		i = 0;
+		goto teardown;
+	}
+
+	kvm_tdx->tdcs_pa = tdcs_pa;
+	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
+		err = tdh_mng_addcx(kvm_tdx->tdr_pa, tdcs_pa[i]);
+		if (err == TDX_RND_NO_ENTROPY) {
+			/* Here it's hard to allow userspace to retry. */
+			ret = -EBUSY;
+			goto teardown;
+		}
+		if (WARN_ON_ONCE(err)) {
+			pr_tdx_error(TDH_MNG_ADDCX, err, NULL);
+			ret = -EIO;
+			goto teardown;
+		}
+	}
+
+	/*
+	 * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires a dedicated
+	 * ioctl() to define the configure CPUID values for the TD.
+	 */
+	return 0;
+
+	/*
+	 * The sequence for freeing resources from a partially initialized TD
+	 * varies based on where in the initialization flow failure occurred.
+	 * Simply use the full teardown and destroy, which naturally play nice
+	 * with partial initialization.
+	 */
+teardown:
+	for (; i < tdx_info->nr_tdcs_pages; i++) {
+		if (tdcs_pa[i]) {
+			free_page((unsigned long)__va(tdcs_pa[i]));
+			tdcs_pa[i] = 0;
+		}
+	}
+	if (!kvm_tdx->tdcs_pa)
+		kfree(tdcs_pa);
+	tdx_mmu_release_hkid(kvm);
+	tdx_vm_free(kvm);
+	return ret;
+
+free_packages:
+	cpus_read_unlock();
+	free_cpumask_var(packages);
+free_tdcs:
+	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
+		if (tdcs_pa[i])
+			free_page((unsigned long)__va(tdcs_pa[i]));
+	}
+	kfree(tdcs_pa);
+	kvm_tdx->tdcs_pa = NULL;
+
+free_tdr:
+	if (tdr_pa)
+		free_page((unsigned long)__va(tdr_pa));
+	kvm_tdx->tdr_pa = 0;
+free_hkid:
+	if (is_hkid_assigned(kvm_tdx))
+		tdx_hkid_free(kvm_tdx);
+	return ret;
+}
+
 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_tdx_cmd tdx_cmd;
@@ -215,12 +664,13 @@  static int tdx_md_read(struct tdx_md_map *maps, int nr_maps)
 
 static int __init tdx_module_setup(void)
 {
-	u16 num_cpuid_config;
+	u16 num_cpuid_config, tdcs_base_size;
 	int ret;
 	u32 i;
 
 	struct tdx_md_map mds[] = {
 		TDX_MD_MAP(NUM_CPUID_CONFIG, &num_cpuid_config),
+		TDX_MD_MAP(TDCS_BASE_SIZE, &tdcs_base_size),
 	};
 
 	struct tdx_metadata_field_mapping fields[] = {
@@ -273,6 +723,8 @@  static int __init tdx_module_setup(void)
 		c->edx = ecx_edx >> 32;
 	}
 
+	tdx_info->nr_tdcs_pages = tdcs_base_size / PAGE_SIZE;
+
 	return 0;
 
 error_out:
@@ -319,13 +771,27 @@  int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
 	struct tdx_enabled enable = {
 		.err = ATOMIC_INIT(0),
 	};
+	int max_pkgs;
 	int r = 0;
+	int i;
 
+	if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
+		pr_warn("MOVDIR64B is required for TDX\n");
+		return -EOPNOTSUPP;
+	}
 	if (!enable_ept) {
 		pr_warn("Cannot enable TDX with EPT disabled\n");
 		return -EINVAL;
 	}
 
+	max_pkgs = topology_max_packages();
+	tdx_mng_key_config_lock = kcalloc(max_pkgs, sizeof(*tdx_mng_key_config_lock),
+				   GFP_KERNEL);
+	if (!tdx_mng_key_config_lock)
+		return -ENOMEM;
+	for (i = 0; i < max_pkgs; i++)
+		mutex_init(&tdx_mng_key_config_lock[i]);
+
 	if (!zalloc_cpumask_var(&enable.enabled, GFP_KERNEL)) {
 		r = -ENOMEM;
 		goto out;
@@ -350,4 +816,5 @@  int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
 void tdx_hardware_unsetup(void)
 {
 	kfree(tdx_info);
+	kfree(tdx_mng_key_config_lock);
 }
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 22c0b57f69ca..ae117f864cfb 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -8,7 +8,11 @@ 
 
 struct kvm_tdx {
 	struct kvm kvm;
-	/* TDX specific members follow. */
+
+	unsigned long tdr_pa;
+	unsigned long *tdcs_pa;
+
+	int hkid;
 };
 
 struct vcpu_tdx {
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index 0031a8d61589..7f123ffe4d42 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -140,6 +140,9 @@  void tdx_hardware_unsetup(void);
 bool tdx_is_vm_type_supported(unsigned long type);
 
 int tdx_vm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap);
+int tdx_vm_init(struct kvm *kvm);
+void tdx_mmu_release_hkid(struct kvm *kvm);
+void tdx_vm_free(struct kvm *kvm);
 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
 #else
 static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return -EOPNOTSUPP; }
@@ -150,6 +153,9 @@  static inline int tdx_vm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 {
 	return -EINVAL;
 };
+static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; }
+static inline void tdx_mmu_release_hkid(struct kvm *kvm) {}
+static inline void tdx_vm_free(struct kvm *kvm) {}
 static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; }
 #endif
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3f16e9450d2f..1f01c7f91652 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12743,6 +12743,7 @@  void kvm_arch_destroy_vm(struct kvm *kvm)
 	kvm_page_track_cleanup(kvm);
 	kvm_xen_destroy_vm(kvm);
 	kvm_hv_destroy_vm(kvm);
+	static_call_cond(kvm_x86_vm_free)(kvm);
 }
 
 static void memslot_rmap_free(struct kvm_memory_slot *slot)