diff mbox series

[18/25] KVM: TDX: Do TDX specific vcpu initialization

Message ID 20240812224820.34826-19-rick.p.edgecombe@intel.com (mailing list archive)
State New, archived
Headers show
Series TDX vCPU/VM creation | expand

Commit Message

Edgecombe, Rick P Aug. 12, 2024, 10:48 p.m. UTC
From: Isaku Yamahata <isaku.yamahata@intel.com>

TD guest vcpu needs TDX specific initialization before running.  Repurpose
KVM_MEMORY_ENCRYPT_OP to vcpu-scope, add a new sub-command
KVM_TDX_INIT_VCPU, and implement the callback for it.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
---
uAPI breakout v1:
 - Support FEATURES0_TOPOLOGY_ENUM
 - Update for the wrapper functions for SEAMCALLs. (Sean)
 - Remove WARN_ON_ONCE() in tdx_vcpu_free().
   WARN_ON_ONCE(vcpu->cpu != -1), WARN_ON_ONCE(tdx->tdvpx_pa),
   WARN_ON_ONCE(tdx->tdvpr_pa)
 - Remove KVM_BUG_ON() in tdx_vcpu_reset().
 - Remove duplicate "tdx->tdvpr_pa=" lines
 - Rename tdvpx to tdcx as it is confusing, follow spec change for same
   reason (Isaku)
 - Updates from seamcall overhaul (Kai)
 - Rename error->hw_error
 - Change using tdx_info to using exported 'tdx_sysinfo' pointer in
   tdx_td_vcpu_init().
 - Remove code to the old (non-existing) tdx_module_setup().
 - Use a new wrapper tdx_sysinfo_nr_tdcx_pages() to replace
   tdx_info->nr_tdcx_pages.
 - Combine the two for loops in tdx_td_vcpu_init() (Chao)
 - Add more line breaks into tdx_td_vcpu_init() for readability (Tony)
 - Drop Drop local tdcx_pa in tdx_td_vcpu_init() (Rick)
 - Drop Drop local tdvpr_pa in tdx_td_vcpu_init() (Rick)

v18:
 - Use tdh_sys_rd() instead of struct tdsysinfo_struct.
 - Rename tdx_reclaim_td_page() => tdx_reclaim_control_page()
 - Remove the change of tools/arch/x86/include/uapi/asm/kvm.h.
---
 arch/x86/include/asm/kvm-x86-ops.h |   1 +
 arch/x86/include/asm/kvm_host.h    |   1 +
 arch/x86/include/uapi/asm/kvm.h    |   1 +
 arch/x86/kvm/vmx/main.c            |   9 ++
 arch/x86/kvm/vmx/tdx.c             | 193 ++++++++++++++++++++++++++++-
 arch/x86/kvm/vmx/tdx.h             |   6 +
 arch/x86/kvm/vmx/tdx_arch.h        |   2 +
 arch/x86/kvm/vmx/x86_ops.h         |   4 +
 arch/x86/kvm/x86.c                 |   6 +
 9 files changed, 221 insertions(+), 2 deletions(-)

Comments

Yuan Yao Aug. 13, 2024, 8 a.m. UTC | #1
On Mon, Aug 12, 2024 at 03:48:13PM -0700, Rick Edgecombe wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
>
> TD guest vcpu needs TDX specific initialization before running.  Repurpose
> KVM_MEMORY_ENCRYPT_OP to vcpu-scope, add a new sub-command
> KVM_TDX_INIT_VCPU, and implement the callback for it.
>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> ---
> uAPI breakout v1:
>  - Support FEATURES0_TOPOLOGY_ENUM
>  - Update for the wrapper functions for SEAMCALLs. (Sean)
>  - Remove WARN_ON_ONCE() in tdx_vcpu_free().
>    WARN_ON_ONCE(vcpu->cpu != -1), WARN_ON_ONCE(tdx->tdvpx_pa),
>    WARN_ON_ONCE(tdx->tdvpr_pa)
>  - Remove KVM_BUG_ON() in tdx_vcpu_reset().
>  - Remove duplicate "tdx->tdvpr_pa=" lines
>  - Rename tdvpx to tdcx as it is confusing, follow spec change for same
>    reason (Isaku)
>  - Updates from seamcall overhaul (Kai)
>  - Rename error->hw_error
>  - Change using tdx_info to using exported 'tdx_sysinfo' pointer in
>    tdx_td_vcpu_init().
>  - Remove code to the old (non-existing) tdx_module_setup().
>  - Use a new wrapper tdx_sysinfo_nr_tdcx_pages() to replace
>    tdx_info->nr_tdcx_pages.
>  - Combine the two for loops in tdx_td_vcpu_init() (Chao)
>  - Add more line breaks into tdx_td_vcpu_init() for readability (Tony)
>  - Drop Drop local tdcx_pa in tdx_td_vcpu_init() (Rick)
>  - Drop Drop local tdvpr_pa in tdx_td_vcpu_init() (Rick)
>
> v18:
>  - Use tdh_sys_rd() instead of struct tdsysinfo_struct.
>  - Rename tdx_reclaim_td_page() => tdx_reclaim_control_page()
>  - Remove the change of tools/arch/x86/include/uapi/asm/kvm.h.
> ---
>  arch/x86/include/asm/kvm-x86-ops.h |   1 +
>  arch/x86/include/asm/kvm_host.h    |   1 +
>  arch/x86/include/uapi/asm/kvm.h    |   1 +
>  arch/x86/kvm/vmx/main.c            |   9 ++
>  arch/x86/kvm/vmx/tdx.c             | 193 ++++++++++++++++++++++++++++-
>  arch/x86/kvm/vmx/tdx.h             |   6 +
>  arch/x86/kvm/vmx/tdx_arch.h        |   2 +
>  arch/x86/kvm/vmx/x86_ops.h         |   4 +
>  arch/x86/kvm/x86.c                 |   6 +
>  9 files changed, 221 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> index 12ee66bc9026..5dd7955376e3 100644
> --- a/arch/x86/include/asm/kvm-x86-ops.h
> +++ b/arch/x86/include/asm/kvm-x86-ops.h
> @@ -126,6 +126,7 @@ KVM_X86_OP(enable_smi_window)
>  #endif
>  KVM_X86_OP_OPTIONAL(dev_get_attr)
>  KVM_X86_OP(mem_enc_ioctl)
> +KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl)
>  KVM_X86_OP_OPTIONAL(mem_enc_register_region)
>  KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
>  KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 188cd684bffb..e3094c843556 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1829,6 +1829,7 @@ struct kvm_x86_ops {
>
>  	int (*dev_get_attr)(u32 group, u64 attr, u64 *val);
>  	int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
> +	int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp);
>  	int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
>  	int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
>  	int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index 95ae2d4a4697..b4f12997052d 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -930,6 +930,7 @@ struct kvm_hyperv_eventfd {
>  enum kvm_tdx_cmd_id {
>  	KVM_TDX_CAPABILITIES = 0,
>  	KVM_TDX_INIT_VM,
> +	KVM_TDX_INIT_VCPU,
>
>  	KVM_TDX_CMD_NR_MAX,
>  };
> diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> index d40de73d2bd3..e34cb476cc78 100644
> --- a/arch/x86/kvm/vmx/main.c
> +++ b/arch/x86/kvm/vmx/main.c
> @@ -116,6 +116,14 @@ static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
>  	return tdx_vm_ioctl(kvm, argp);
>  }
>
> +static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
> +{
> +	if (!is_td_vcpu(vcpu))
> +		return -EINVAL;
> +
> +	return tdx_vcpu_ioctl(vcpu, argp);
> +}
> +
>  #define VMX_REQUIRED_APICV_INHIBITS				\
>  	(BIT(APICV_INHIBIT_REASON_DISABLED) |			\
>  	 BIT(APICV_INHIBIT_REASON_ABSENT) |			\
> @@ -268,6 +276,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
>  	.get_untagged_addr = vmx_get_untagged_addr,
>
>  	.mem_enc_ioctl = vt_mem_enc_ioctl,
> +	.vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl,
>  };
>
>  struct kvm_x86_init_ops vt_init_ops __initdata = {
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 18738cacbc87..ba7b436fae86 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -89,6 +89,11 @@ static __always_inline hpa_t set_hkid_to_hpa(hpa_t pa, u16 hkid)
>  	return pa | ((hpa_t)hkid << boot_cpu_data.x86_phys_bits);
>  }
>
> +static inline bool is_td_vcpu_created(struct vcpu_tdx *tdx)
> +{
> +	return tdx->td_vcpu_created;
> +}
> +
>  static inline bool is_td_created(struct kvm_tdx *kvm_tdx)
>  {
>  	return kvm_tdx->tdr_pa;
> @@ -105,6 +110,11 @@ static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
>  	return kvm_tdx->hkid > 0;
>  }
>
> +static inline bool is_td_finalized(struct kvm_tdx *kvm_tdx)
> +{
> +	return kvm_tdx->finalized;
> +}
> +
>  static void tdx_clear_page(unsigned long page_pa)
>  {
>  	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
> @@ -293,6 +303,15 @@ static inline u8 tdx_sysinfo_nr_tdcs_pages(void)
>  	return tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
>  }
>
> +static inline u8 tdx_sysinfo_nr_tdcx_pages(void)

tdx_sysinfo_nr_tdcx_pages() is very similar to
tdx_sysinfo_nr_tdcs_pages() which is introduced in patch 13.

It's easy to use either of them in wrong place and hard to
review, these 2 functions have same signature so compiler
has no way to prevent us from using them incorrectly.
TDX 1.5 spec defines these additional pages for TD and vCPU to
"TDCX" pages, so how about we name them like:

u8 tdx_sysinfo_nr_td_tdcx_pages(void);
u8 tdx_sysinfo_nr_vcpu_tdcx_pages(void);

Above name matchs spec more, and easy to distinguish and review.

> +{
> +	/*
> +	 * TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages).
> +	 * -1 for TDVPR.
> +	 */
> +	return tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1;
> +}
> +
>  void tdx_vm_free(struct kvm *kvm)
>  {
>  	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> @@ -405,7 +424,29 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
>
>  void tdx_vcpu_free(struct kvm_vcpu *vcpu)
>  {
> -	/* This is stub for now.  More logic will come. */
> +	struct vcpu_tdx *tdx = to_tdx(vcpu);
> +	int i;
> +
> +	/*
> +	 * This methods can be called when vcpu allocation/initialization
> +	 * failed. So it's possible that hkid, tdvpx and tdvpr are not assigned
> +	 * yet.
> +	 */

IIUC leaking tdcx_pa/tdvpr_pa shuold happen only when
failure of freeing hkid. How about change above to real
reason or just remove them ?

> +	if (is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
> +		return;
> +
> +	if (tdx->tdcx_pa) {
> +		for (i = 0; i < tdx_sysinfo_nr_tdcx_pages(); i++) {
> +			if (tdx->tdcx_pa[i])
> +				tdx_reclaim_control_page(tdx->tdcx_pa[i]);
> +		}
> +		kfree(tdx->tdcx_pa);
> +		tdx->tdcx_pa = NULL;
> +	}
> +	if (tdx->tdvpr_pa) {
> +		tdx_reclaim_control_page(tdx->tdvpr_pa);
> +		tdx->tdvpr_pa = 0;
> +	}
>  }
>
>  void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> @@ -414,8 +455,13 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
>  	/* Ignore INIT silently because TDX doesn't support INIT event. */
>  	if (init_event)
>  		return;
> +	if (is_td_vcpu_created(to_tdx(vcpu)))
> +		return;
>
> -	/* This is stub for now. More logic will come here. */
> +	/*
> +	 * Don't update mp_state to runnable because more initialization
> +	 * is needed by TDX_VCPU_INIT.
> +	 */
>  }
>
>  static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
> @@ -884,6 +930,149 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
>  	return r;
>  }
>
> +/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */
> +static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
> +{
> +	const struct tdx_sysinfo_module_info *modinfo = &tdx_sysinfo->module_info;
> +	struct vcpu_tdx *tdx = to_tdx(vcpu);
> +	unsigned long va;
> +	int ret, i;
> +	u64 err;
> +
> +	if (is_td_vcpu_created(tdx))
> +		return -EINVAL;
> +
> +	/*
> +	 * vcpu_free method frees allocated pages.  Avoid partial setup so
> +	 * that the method can't handle it.
> +	 */

This looks not that clear, why vcpu_free can't handle it is not explained.

Looking the whole function, page already added into TD by
SEAMCALL should be cleared before free back to kernel,
tdx_vcpu_free() can handle them. Other pages can be freed
directly and can't be handled by tdx_vcpu_free() because
they're not added into TD. Is this right understanding ?

> +	va = __get_free_page(GFP_KERNEL_ACCOUNT);
> +	if (!va)
> +		return -ENOMEM;
> +	tdx->tdvpr_pa = __pa(va);
> +
> +	tdx->tdcx_pa = kcalloc(tdx_sysinfo_nr_tdcx_pages(), sizeof(*tdx->tdcx_pa),
> +			   GFP_KERNEL_ACCOUNT);
> +	if (!tdx->tdcx_pa) {
> +		ret = -ENOMEM;
> +		goto free_tdvpr;
> +	}
> +
> +	err = tdh_vp_create(tdx);
> +	if (KVM_BUG_ON(err, vcpu->kvm)) {
> +		tdx->tdvpr_pa = 0;

This leaks the tdx->tdvpr_pa in case of no VP is created.
Any reason for this ?

> +		ret = -EIO;
> +		pr_tdx_error(TDH_VP_CREATE, err);
> +		goto free_tdvpx;
> +	}
> +
> +	for (i = 0; i < tdx_sysinfo_nr_tdcx_pages(); i++) {
> +		va = __get_free_page(GFP_KERNEL_ACCOUNT);
> +		if (!va) {
> +			ret = -ENOMEM;
> +			goto free_tdvpx;

It's possible that some pages already added into TD by
tdh_vp_addcx() below and they won't be handled by
tdx_vcpu_free() if goto free_tdvpx here;

> +		}
> +		tdx->tdcx_pa[i] = __pa(va);
> +
> +		err = tdh_vp_addcx(tdx, tdx->tdcx_pa[i]);
> +		if (KVM_BUG_ON(err, vcpu->kvm)) {
> +			pr_tdx_error(TDH_VP_ADDCX, err);
> +			/* vcpu_free method frees TDCX and TDR donated to TDX */
> +			return -EIO;
> +		}
> +	}
> +
> +	if (modinfo->tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)
> +		err = tdh_vp_init_apicid(tdx, vcpu_rcx, vcpu->vcpu_id);

This can cause incorrect topology information to guest
silently:

A user space VMM uses "-smp 8,threads=4,cores=2" but doesn't
pass any 0x1f leaf data to KVM, means no 0x1f value to TDX
module for this TD. The topology TD guest observed is:

Thread(s) per core:                 2
Core(s) per socket:                 4

I suggest to use tdh_vp_init_apicid() only when 0x1f is
valid. This will disable the 0x1f/0xb topology feature per
the spec, but leaf 0x1/0x4 still are available to present
right topology in this example. It presents correct topology
information to guest if user space VMM doesn't use 0x1f for
simple topology and run on TDX module w/ FEATURES0_TOPOLOGY.

> +	else
> +		err = tdh_vp_init(tdx, vcpu_rcx);
> +
> +	if (KVM_BUG_ON(err, vcpu->kvm)) {
> +		pr_tdx_error(TDH_VP_INIT, err);
> +		return -EIO;
> +	}
> +
> +	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> +	tdx->td_vcpu_created = true;
> +
> +	return 0;
> +
> +free_tdvpx:

How about s/free_tdvpx/free_tdcx

In 1.5 TDX spec these pages are all called TDCX pages, and
the function context already indicates that we're talking about
vcpu's TDCX pages.

> +	for (i = 0; i < tdx_sysinfo_nr_tdcx_pages(); i++) {
> +		if (tdx->tdcx_pa[i])
> +			free_page((unsigned long)__va(tdx->tdcx_pa[i]));
> +		tdx->tdcx_pa[i] = 0;
> +	}
> +	kfree(tdx->tdcx_pa);
> +	tdx->tdcx_pa = NULL;
> +
> +free_tdvpr:
> +	if (tdx->tdvpr_pa)
> +		free_page((unsigned long)__va(tdx->tdvpr_pa));
> +	tdx->tdvpr_pa = 0;
> +
> +	return ret;
> +}
> +
> +static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
> +{
> +	struct msr_data apic_base_msr;
> +	struct vcpu_tdx *tdx = to_tdx(vcpu);
> +	int ret;
> +
> +	if (cmd->flags)
> +		return -EINVAL;
> +	if (tdx->initialized)
> +		return -EINVAL;
> +
> +	/*
> +	 * As TDX requires X2APIC, set local apic mode to X2APIC.  User space
> +	 * VMM, e.g. qemu, is required to set CPUID[0x1].ecx.X2APIC=1 by
> +	 * KVM_SET_CPUID2.  Otherwise kvm_set_apic_base() will fail.
> +	 */
> +	apic_base_msr = (struct msr_data) {
> +		.host_initiated = true,
> +		.data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
> +		(kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0),
> +	};
> +	if (kvm_set_apic_base(vcpu, &apic_base_msr))
> +		return -EINVAL;
> +
> +	ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);
> +	if (ret)
> +		return ret;
> +
> +	tdx->initialized = true;
> +	return 0;
> +}
> +
> +int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
> +{
> +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
> +	struct kvm_tdx_cmd cmd;
> +	int ret;
> +
> +	if (!is_hkid_assigned(kvm_tdx) || is_td_finalized(kvm_tdx))
> +		return -EINVAL;
> +
> +	if (copy_from_user(&cmd, argp, sizeof(cmd)))
> +		return -EFAULT;
> +
> +	if (cmd.hw_error)
> +		return -EINVAL;
> +
> +	switch (cmd.id) {
> +	case KVM_TDX_INIT_VCPU:
> +		ret = tdx_vcpu_init(vcpu, &cmd);
> +		break;
> +	default:
> +		ret = -EINVAL;
> +		break;
> +	}
> +
> +	return ret;
> +}
> +
>  #define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE)
>
>  static int __init setup_kvm_tdx_caps(void)
> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index ca948f26b755..8349b542836e 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -22,6 +22,8 @@ struct kvm_tdx {
>  	u64 xfam;
>  	int hkid;
>
> +	bool finalized;
> +
>  	u64 tsc_offset;
>  };
>
> @@ -29,6 +31,10 @@ struct vcpu_tdx {
>  	struct kvm_vcpu	vcpu;
>
>  	unsigned long tdvpr_pa;
> +	unsigned long *tdcx_pa;
> +	bool td_vcpu_created;
> +
> +	bool initialized;
>
>  	/*
>  	 * Dummy to make pmu_intel not corrupt memory.
> diff --git a/arch/x86/kvm/vmx/tdx_arch.h b/arch/x86/kvm/vmx/tdx_arch.h
> index 413619dd92ef..d2d7f9cab740 100644
> --- a/arch/x86/kvm/vmx/tdx_arch.h
> +++ b/arch/x86/kvm/vmx/tdx_arch.h
> @@ -155,4 +155,6 @@ struct td_params {
>  #define TDX_MIN_TSC_FREQUENCY_KHZ		(100 * 1000)
>  #define TDX_MAX_TSC_FREQUENCY_KHZ		(10 * 1000 * 1000)
>
> +#define MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM	BIT_ULL(20)
> +
>  #endif /* __KVM_X86_TDX_ARCH_H */
> diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
> index e1d3276b0f60..55fd17fbfd19 100644
> --- a/arch/x86/kvm/vmx/x86_ops.h
> +++ b/arch/x86/kvm/vmx/x86_ops.h
> @@ -129,6 +129,8 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
>  int tdx_vcpu_create(struct kvm_vcpu *vcpu);
>  void tdx_vcpu_free(struct kvm_vcpu *vcpu);
>  void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
> +
> +int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
>  #else
>  static inline int tdx_vm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
>  {
> @@ -143,6 +145,8 @@ static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOP
>  static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; }
>  static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {}
>  static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {}
> +
> +static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; }
>  #endif
>
>  #endif /* __KVM_X86_VMX_X86_OPS_H */
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 9cee326f5e7a..3d43fa84c2b4 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -6314,6 +6314,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>  	case KVM_SET_DEVICE_ATTR:
>  		r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
>  		break;
> +	case KVM_MEMORY_ENCRYPT_OP:
> +		r = -ENOTTY;
> +		if (!kvm_x86_ops.vcpu_mem_enc_ioctl)
> +			goto out;
> +		r = kvm_x86_ops.vcpu_mem_enc_ioctl(vcpu, argp);
> +		break;
>  	default:
>  		r = -EINVAL;
>  	}
> --
> 2.34.1
>
>
Isaku Yamahata Aug. 13, 2024, 5:21 p.m. UTC | #2
On Tue, Aug 13, 2024 at 04:00:09PM +0800,
Yuan Yao <yuan.yao@linux.intel.com> wrote:

> > +/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */
> > +static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
> > +{
> > +	const struct tdx_sysinfo_module_info *modinfo = &tdx_sysinfo->module_info;
> > +	struct vcpu_tdx *tdx = to_tdx(vcpu);
> > +	unsigned long va;
> > +	int ret, i;
> > +	u64 err;
> > +
> > +	if (is_td_vcpu_created(tdx))
> > +		return -EINVAL;
> > +
> > +	/*
> > +	 * vcpu_free method frees allocated pages.  Avoid partial setup so
> > +	 * that the method can't handle it.
> > +	 */
> 
> This looks not that clear, why vcpu_free can't handle it is not explained.
> 
> Looking the whole function, page already added into TD by
> SEAMCALL should be cleared before free back to kernel,
> tdx_vcpu_free() can handle them. Other pages can be freed
> directly and can't be handled by tdx_vcpu_free() because
> they're not added into TD. Is this right understanding ?

Yes.  If we result in error in the middle of TDX vCPU initialization,
TDH.MEM.PAGE.RECLAIM() result in error due to TDX module state check.
TDX module seems to assume that we don't fail in the middle of TDX vCPU
initialization.  Maybe we can add WARN_ON_ONCE() for such cases.


> > +		ret = -EIO;
> > +		pr_tdx_error(TDH_VP_CREATE, err);
> > +		goto free_tdvpx;
> > +	}
> > +
> > +	for (i = 0; i < tdx_sysinfo_nr_tdcx_pages(); i++) {
> > +		va = __get_free_page(GFP_KERNEL_ACCOUNT);
> > +		if (!va) {
> > +			ret = -ENOMEM;
> > +			goto free_tdvpx;
> 
> It's possible that some pages already added into TD by
> tdh_vp_addcx() below and they won't be handled by
> tdx_vcpu_free() if goto free_tdvpx here;

Due to TDX TD state check, we can't free partially assigned TDCS pages.
TDX module seems to assume that TDH.VP.ADDCX() won't fail in the middle.


> > +	else
> > +		err = tdh_vp_init(tdx, vcpu_rcx);
> > +
> > +	if (KVM_BUG_ON(err, vcpu->kvm)) {
> > +		pr_tdx_error(TDH_VP_INIT, err);
> > +		return -EIO;
> > +	}
> > +
> > +	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> > +	tdx->td_vcpu_created = true;
> > +
> > +	return 0;
> > +
> > +free_tdvpx:
> 
> How about s/free_tdvpx/free_tdcx
> 
> In 1.5 TDX spec these pages are all called TDCX pages, and
> the function context already indicates that we're talking about
> vcpu's TDCX pages.

Oops, this is left over when tdvpx was converted to tdcs.


> > +static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
> > +{
> > +	struct msr_data apic_base_msr;
> > +	struct vcpu_tdx *tdx = to_tdx(vcpu);
> > +	int ret;
> > +
> > +	if (cmd->flags)
> > +		return -EINVAL;
> > +	if (tdx->initialized)
> > +		return -EINVAL;
> > +
> > +	/*
> > +	 * As TDX requires X2APIC, set local apic mode to X2APIC.  User space
> > +	 * VMM, e.g. qemu, is required to set CPUID[0x1].ecx.X2APIC=1 by
> > +	 * KVM_SET_CPUID2.  Otherwise kvm_set_apic_base() will fail.
> > +	 */
> > +	apic_base_msr = (struct msr_data) {
> > +		.host_initiated = true,
> > +		.data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
> > +		(kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0),
> > +	};
> > +	if (kvm_set_apic_base(vcpu, &apic_base_msr))
> > +		return -EINVAL;
> > +
> > +	ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);

Because we set guest rcx only, we use cmd->data.  Can we add reserved area for
future use similar to struct kvm_tdx_init_vm?
i.e. introduce something like
struct kvm_tdx_init_vcpu {u64 rcx; u64 reserved[]; }
Yuan Yao Aug. 14, 2024, 1:20 a.m. UTC | #3
On Tue, Aug 13, 2024 at 10:21:08AM -0700, Isaku Yamahata wrote:
> On Tue, Aug 13, 2024 at 04:00:09PM +0800,
> Yuan Yao <yuan.yao@linux.intel.com> wrote:
>
> > > +/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */
> > > +static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
> > > +{
> > > +	const struct tdx_sysinfo_module_info *modinfo = &tdx_sysinfo->module_info;
> > > +	struct vcpu_tdx *tdx = to_tdx(vcpu);
> > > +	unsigned long va;
> > > +	int ret, i;
> > > +	u64 err;
> > > +
> > > +	if (is_td_vcpu_created(tdx))
> > > +		return -EINVAL;
> > > +
> > > +	/*
> > > +	 * vcpu_free method frees allocated pages.  Avoid partial setup so
> > > +	 * that the method can't handle it.
> > > +	 */
> >
> > This looks not that clear, why vcpu_free can't handle it is not explained.
> >
> > Looking the whole function, page already added into TD by
> > SEAMCALL should be cleared before free back to kernel,
> > tdx_vcpu_free() can handle them. Other pages can be freed
> > directly and can't be handled by tdx_vcpu_free() because
> > they're not added into TD. Is this right understanding ?
>
> Yes.  If we result in error in the middle of TDX vCPU initialization,
> TDH.MEM.PAGE.RECLAIM() result in error due to TDX module state check.
> TDX module seems to assume that we don't fail in the middle of TDX vCPU
> initialization.  Maybe we can add WARN_ON_ONCE() for such cases.
>
>
> > > +		ret = -EIO;
> > > +		pr_tdx_error(TDH_VP_CREATE, err);
> > > +		goto free_tdvpx;
> > > +	}
> > > +
> > > +	for (i = 0; i < tdx_sysinfo_nr_tdcx_pages(); i++) {
> > > +		va = __get_free_page(GFP_KERNEL_ACCOUNT);
> > > +		if (!va) {
> > > +			ret = -ENOMEM;
> > > +			goto free_tdvpx;
> >
> > It's possible that some pages already added into TD by
> > tdh_vp_addcx() below and they won't be handled by
> > tdx_vcpu_free() if goto free_tdvpx here;
>
> Due to TDX TD state check, we can't free partially assigned TDCS pages.
> TDX module seems to assume that TDH.VP.ADDCX() won't fail in the middle.

The already partially added TDCX pages are initialized by
MOVDIR64 with the TD's private HKID in TDX module, the above
'goto free_tdvpx' frees them back to kernel directly w/o
take back the ownership with shared HKID. This violates the
rule that a page's ownership should be taken back with shared
HKID before release to kernel if they were initialized by any
private HKID before.

How about do tdh_vp_addcx() afer allocated all TDCX pages
and give WARN_ON_ONCE() to the return value of
tdh_vp_addcx() if the tdh_vp_addcx() won't fail except some
BUG inside TDX module in our current usage ?

>
>
> > > +	else
> > > +		err = tdh_vp_init(tdx, vcpu_rcx);
> > > +
> > > +	if (KVM_BUG_ON(err, vcpu->kvm)) {
> > > +		pr_tdx_error(TDH_VP_INIT, err);
> > > +		return -EIO;
> > > +	}
> > > +
> > > +	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> > > +	tdx->td_vcpu_created = true;
> > > +
> > > +	return 0;
> > > +
> > > +free_tdvpx:
> >
> > How about s/free_tdvpx/free_tdcx
> >
> > In 1.5 TDX spec these pages are all called TDCX pages, and
> > the function context already indicates that we're talking about
> > vcpu's TDCX pages.
>
> Oops, this is left over when tdvpx was converted to tdcs.
>
>
> > > +static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
> > > +{
> > > +	struct msr_data apic_base_msr;
> > > +	struct vcpu_tdx *tdx = to_tdx(vcpu);
> > > +	int ret;
> > > +
> > > +	if (cmd->flags)
> > > +		return -EINVAL;
> > > +	if (tdx->initialized)
> > > +		return -EINVAL;
> > > +
> > > +	/*
> > > +	 * As TDX requires X2APIC, set local apic mode to X2APIC.  User space
> > > +	 * VMM, e.g. qemu, is required to set CPUID[0x1].ecx.X2APIC=1 by
> > > +	 * KVM_SET_CPUID2.  Otherwise kvm_set_apic_base() will fail.
> > > +	 */
> > > +	apic_base_msr = (struct msr_data) {
> > > +		.host_initiated = true,
> > > +		.data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
> > > +		(kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0),
> > > +	};
> > > +	if (kvm_set_apic_base(vcpu, &apic_base_msr))
> > > +		return -EINVAL;
> > > +
> > > +	ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);
>
> Because we set guest rcx only, we use cmd->data.  Can we add reserved area for
> future use similar to struct kvm_tdx_init_vm?
> i.e. introduce something like
> struct kvm_tdx_init_vcpu {u64 rcx; u64 reserved[]; }
> --
> Isaku Yamahata <isaku.yamahata@intel.com>
Isaku Yamahata Aug. 15, 2024, 12:47 a.m. UTC | #4
On Wed, Aug 14, 2024 at 09:20:06AM +0800,
Yuan Yao <yuan.yao@linux.intel.com> wrote:

> > > > +		ret = -EIO;
> > > > +		pr_tdx_error(TDH_VP_CREATE, err);
> > > > +		goto free_tdvpx;
> > > > +	}
> > > > +
> > > > +	for (i = 0; i < tdx_sysinfo_nr_tdcx_pages(); i++) {
> > > > +		va = __get_free_page(GFP_KERNEL_ACCOUNT);
> > > > +		if (!va) {
> > > > +			ret = -ENOMEM;
> > > > +			goto free_tdvpx;
> > >
> > > It's possible that some pages already added into TD by
> > > tdh_vp_addcx() below and they won't be handled by
> > > tdx_vcpu_free() if goto free_tdvpx here;
> >
> > Due to TDX TD state check, we can't free partially assigned TDCS pages.
> > TDX module seems to assume that TDH.VP.ADDCX() won't fail in the middle.
> 
> The already partially added TDCX pages are initialized by
> MOVDIR64 with the TD's private HKID in TDX module, the above
> 'goto free_tdvpx' frees them back to kernel directly w/o
> take back the ownership with shared HKID. This violates the
> rule that a page's ownership should be taken back with shared
> HKID before release to kernel if they were initialized by any
> private HKID before.
> 
> How about do tdh_vp_addcx() afer allocated all TDCX pages
> and give WARN_ON_ONCE() to the return value of
> tdh_vp_addcx() if the tdh_vp_addcx() won't fail except some
> BUG inside TDX module in our current usage ?

Yes, that makes sense.  Those error recovery paths need to be simplified.
Edgecombe, Rick P Aug. 28, 2024, 2:34 p.m. UTC | #5
On Mon, 2024-08-12 at 15:48 -0700, Rick Edgecombe wrote:
> +static inline bool is_td_vcpu_created(struct vcpu_tdx *tdx)
> +{
> +       return tdx->td_vcpu_created;
> +}

This and is_td_finalized() seem like unneeded helpers. The field name is clear
enough.

> +
>  static inline bool is_td_created(struct kvm_tdx *kvm_tdx)
>  {
>         return kvm_tdx->tdr_pa;

Not this one though, the helper makes the caller code clearer.

> @@ -105,6 +110,11 @@ static inline bool is_hkid_assigned(struct kvm_tdx
> *kvm_tdx)
>         return kvm_tdx->hkid > 0;
>  }
>  
> +static inline bool is_td_finalized(struct kvm_tdx *kvm_tdx)
> +{
> +       return kvm_tdx->finalized;
> +}
> +
Tony Lindgren Sept. 3, 2024, 5:23 a.m. UTC | #6
On Tue, Aug 13, 2024 at 04:00:09PM +0800, Yuan Yao wrote:
> On Mon, Aug 12, 2024 at 03:48:13PM -0700, Rick Edgecombe wrote:
> > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > @@ -293,6 +303,15 @@ static inline u8 tdx_sysinfo_nr_tdcs_pages(void)
> >  	return tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
> >  }
> >
> > +static inline u8 tdx_sysinfo_nr_tdcx_pages(void)
> 
> tdx_sysinfo_nr_tdcx_pages() is very similar to
> tdx_sysinfo_nr_tdcs_pages() which is introduced in patch 13.
> 
> It's easy to use either of them in wrong place and hard to
> review, these 2 functions have same signature so compiler
> has no way to prevent us from using them incorrectly.
> TDX 1.5 spec defines these additional pages for TD and vCPU to
> "TDCX" pages, so how about we name them like:
> 
> u8 tdx_sysinfo_nr_td_tdcx_pages(void);
> u8 tdx_sysinfo_nr_vcpu_tdcx_pages(void);
> 
> Above name matchs spec more, and easy to distinguish and review.

Good idea to clarify the naming. For patch 13/25, Nikolay suggested
precalculating the values and dropping the helpers. So we could have
kvm_tdx->nr_td_tdcx_pages and kvm_tdx->nr_vcpu_tdcx_pages following
your naming suggestion.

Regards,

Tony
Tony Lindgren Sept. 3, 2024, 5:34 a.m. UTC | #7
On Wed, Aug 28, 2024 at 02:34:21PM +0000, Edgecombe, Rick P wrote:
> On Mon, 2024-08-12 at 15:48 -0700, Rick Edgecombe wrote:
> > +static inline bool is_td_vcpu_created(struct vcpu_tdx *tdx)
> > +{
> > +       return tdx->td_vcpu_created;
> > +}
> 
> This and is_td_finalized() seem like unneeded helpers. The field name is clear
> enough.

I'll do a patch for this.

> >  static inline bool is_td_created(struct kvm_tdx *kvm_tdx)
> >  {
> >         return kvm_tdx->tdr_pa;
> 
> Not this one though, the helper makes the caller code clearer.

Yes this makes things more readable.

Regards,

Tony
Adrian Hunter Oct. 9, 2024, 3:01 p.m. UTC | #8
On 13/08/24 11:00, Yuan Yao wrote:
> On Mon, Aug 12, 2024 at 03:48:13PM -0700, Rick Edgecombe wrote:
>> From: Isaku Yamahata <isaku.yamahata@intel.com>
>>
>> TD guest vcpu needs TDX specific initialization before running.  Repurpose
>> KVM_MEMORY_ENCRYPT_OP to vcpu-scope, add a new sub-command
>> KVM_TDX_INIT_VCPU, and implement the callback for it.
>>
>> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
>> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
>> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
>> ---

<SNIP>

>> @@ -884,6 +930,149 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
>>  	return r;
>>  }
>>
>> +/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */
>> +static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
>> +{

<SNIP>

>> +	if (modinfo->tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)
>> +		err = tdh_vp_init_apicid(tdx, vcpu_rcx, vcpu->vcpu_id);
>> +	else
>> +		err = tdh_vp_init(tdx, vcpu_rcx);
> 
> This can cause incorrect topology information to guest
> silently:
> 
> A user space VMM uses "-smp 8,threads=4,cores=2" but doesn't
> pass any 0x1f leaf data to KVM, means no 0x1f value to TDX
> module for this TD. The topology TD guest observed is:
> 
> Thread(s) per core:                 2
> Core(s) per socket:                 4
> 
> I suggest to use tdh_vp_init_apicid() only when 0x1f is
> valid. This will disable the 0x1f/0xb topology feature per
> the spec, but leaf 0x1/0x4 still are available to present
> right topology in this example. It presents correct topology
> information to guest if user space VMM doesn't use 0x1f for
> simple topology and run on TDX module w/ FEATURES0_TOPOLOGY.

tdh_vp_init_apicid() passes x2APIC ID to TDH.VP.INIT which
is one of the steps for the TDX Module to support topology
information for the guest i.e. CPUID leaf 0xB and CPUID leaf 0x1F.

If the host VMM does not provide CPUID leaf 0x1F values
(i.e. the values are 0), then the TDX Module will use native
values for both CPUID leaf 0x1F and CPUID leaf 0xB.

To get 0x1F/0xB the guest must also opt-in by setting
TDCS.TD_CTLS.ENUM_TOPOLOGY to 1.  AFAICT currently Linux
does not do that.

In the tdh_vp_init() case, topology information will not be
supported.

If topology information is not supported CPUID leaf 0xB and
CPUID leaf 0x1F will #VE, and a Linux guest will return zeros.

So, yes, it seems like tdh_vp_init_apicid() should only
be called if there is non-zero CPUID leaf 0x1F values provided
by host VMM. e.g. add a helper function

bool tdx_td_enum_topology(struct kvm_cpuid2 *cpuid)
{
	const struct tdx_sys_info_features *modinfo = &tdx_sysinfo->features;
	const struct kvm_cpuid_entry2 *entry;

	if (!(modinfo->tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
		return false;

	entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x1f, 0);
	if (!entry)
		return false;

	return entry->eax || entry->ebx || entry->ecx || entry->edx;
}
Edgecombe, Rick P Oct. 16, 2024, 5:42 p.m. UTC | #9
On Wed, 2024-10-09 at 18:01 +0300, Adrian Hunter wrote:
> tdh_vp_init_apicid() passes x2APIC ID to TDH.VP.INIT which
> is one of the steps for the TDX Module to support topology
> information for the guest i.e. CPUID leaf 0xB and CPUID leaf 0x1F.
> 
> If the host VMM does not provide CPUID leaf 0x1F values
> (i.e. the values are 0), then the TDX Module will use native
> values for both CPUID leaf 0x1F and CPUID leaf 0xB.
> 
> To get 0x1F/0xB the guest must also opt-in by setting
> TDCS.TD_CTLS.ENUM_TOPOLOGY to 1.  AFAICT currently Linux
> does not do that.
> 
> In the tdh_vp_init() case, topology information will not be
> supported.
> 
> If topology information is not supported CPUID leaf 0xB and
> CPUID leaf 0x1F will #VE, and a Linux guest will return zeros.
> 
> So, yes, it seems like tdh_vp_init_apicid() should only
> be called if there is non-zero CPUID leaf 0x1F values provided
> by host VMM. e.g. add a helper function
> 
> bool tdx_td_enum_topology(struct kvm_cpuid2 *cpuid)
> {
> 	const struct tdx_sys_info_features *modinfo = &tdx_sysinfo->features;
> 	const struct kvm_cpuid_entry2 *entry;
> 
> 	if (!(modinfo->tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
> 		return false;
> 
> 	entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x1f, 0);
> 	if (!entry)
> 		return false;
> 
> 	return entry->eax || entry->ebx || entry->ecx || entry->edx;
> }

KVM usually leaves it up to userspace to not create nonsensical VMs. So I think
we can skip the check in KVM.

In that case, do you see a need for the vanilla tdh_vp_init() SEAMCALL wrapper?

The TDX module version we need already supports enum_topology, so the code:
	if (modinfo->tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)
		err = tdh_vp_init_apicid(tdx, vcpu_rcx, vcpu->vcpu_id);
	else
		err = tdh_vp_init(tdx, vcpu_rcx);

The tdh_vp_init() branch shouldn't be hit.
Xiaoyao Li Oct. 18, 2024, 2:21 a.m. UTC | #10
On 10/17/2024 1:42 AM, Edgecombe, Rick P wrote:
> On Wed, 2024-10-09 at 18:01 +0300, Adrian Hunter wrote:
>> tdh_vp_init_apicid() passes x2APIC ID to TDH.VP.INIT which
>> is one of the steps for the TDX Module to support topology
>> information for the guest i.e. CPUID leaf 0xB and CPUID leaf 0x1F.
>>
>> If the host VMM does not provide CPUID leaf 0x1F values
>> (i.e. the values are 0), then the TDX Module will use native
>> values for both CPUID leaf 0x1F and CPUID leaf 0xB.
>>
>> To get 0x1F/0xB the guest must also opt-in by setting
>> TDCS.TD_CTLS.ENUM_TOPOLOGY to 1.  AFAICT currently Linux
>> does not do that.
>>
>> In the tdh_vp_init() case, topology information will not be
>> supported.
>>
>> If topology information is not supported CPUID leaf 0xB and
>> CPUID leaf 0x1F will #VE, and a Linux guest will return zeros.
>>
>> So, yes, it seems like tdh_vp_init_apicid() should only
>> be called if there is non-zero CPUID leaf 0x1F values provided
>> by host VMM. e.g. add a helper function
>>
>> bool tdx_td_enum_topology(struct kvm_cpuid2 *cpuid)
>> {
>> 	const struct tdx_sys_info_features *modinfo = &tdx_sysinfo->features;
>> 	const struct kvm_cpuid_entry2 *entry;
>>
>> 	if (!(modinfo->tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
>> 		return false;
>>
>> 	entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x1f, 0);
>> 	if (!entry)
>> 		return false;
>>
>> 	return entry->eax || entry->ebx || entry->ecx || entry->edx;
>> }
> 
> KVM usually leaves it up to userspace to not create nonsensical VMs. So I think
> we can skip the check in KVM.

It's not nonsensical unless KVM announces its own requirement for TD 
guest that userspace VMM must provide valid CPUID leaf 0x1f value for 
topology.

It's architectural valid that userspace VMM creates a TD with legacy 
topology, i.e., topology enumerated via CPUID 0x1 and 0x4.

> In that case, do you see a need for the vanilla tdh_vp_init() SEAMCALL wrapper?
> 
> The TDX module version we need already supports enum_topology, so the code:
> 	if (modinfo->tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)
> 		err = tdh_vp_init_apicid(tdx, vcpu_rcx, vcpu->vcpu_id);
> 	else
> 		err = tdh_vp_init(tdx, vcpu_rcx);
> 
> The tdh_vp_init() branch shouldn't be hit.

We cannot know what version of TDX module user might use thus we cannot 
assume enum_topology is always there unless we make it a hard 
requirement in KVM that TDX fails being enabled when

   !(modinfo->tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)
Edgecombe, Rick P Oct. 18, 2024, 2:20 p.m. UTC | #11
On Fri, 2024-10-18 at 10:21 +0800, Xiaoyao Li wrote:
> > KVM usually leaves it up to userspace to not create nonsensical VMs. So I
> > think
> > we can skip the check in KVM.
> 
> It's not nonsensical unless KVM announces its own requirement for TD 
> guest that userspace VMM must provide valid CPUID leaf 0x1f value for 
> topology.

How about adding it to the docs?

> 
> It's architectural valid that userspace VMM creates a TD with legacy 
> topology, i.e., topology enumerated via CPUID 0x1 and 0x4.
> 
> > In that case, do you see a need for the vanilla tdh_vp_init() SEAMCALL
> > wrapper?
> > 
> > The TDX module version we need already supports enum_topology, so the code:
> >  	if (modinfo->tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)
> >  		err = tdh_vp_init_apicid(tdx, vcpu_rcx, vcpu->vcpu_id);
> >  	else
> >  		err = tdh_vp_init(tdx, vcpu_rcx);
> > 
> > The tdh_vp_init() branch shouldn't be hit.
> 
> We cannot know what version of TDX module user might use thus we cannot 
> assume enum_topology is always there unless we make it a hard 
> requirement in KVM that TDX fails being enabled when
> 
>    !(modinfo->tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)

We will depend on bugs that are fixed in TDX Modules after enum topology, so it
shouldn't be required in the normal case. So I think it would be simpler to add
this tdx_features0 conditional. We can then export one less SEAMCALL and will
have less configurations flows to worry about on the KVM side.
Xiaoyao Li Oct. 21, 2024, 8:35 a.m. UTC | #12
On 10/18/2024 10:20 PM, Edgecombe, Rick P wrote:
> On Fri, 2024-10-18 at 10:21 +0800, Xiaoyao Li wrote:
>>> KVM usually leaves it up to userspace to not create nonsensical VMs. So I
>>> think
>>> we can skip the check in KVM.
>>
>> It's not nonsensical unless KVM announces its own requirement for TD
>> guest that userspace VMM must provide valid CPUID leaf 0x1f value for
>> topology.
> 
> How about adding it to the docs?

OK for me.

>>
>> It's architectural valid that userspace VMM creates a TD with legacy
>> topology, i.e., topology enumerated via CPUID 0x1 and 0x4.
>>
>>> In that case, do you see a need for the vanilla tdh_vp_init() SEAMCALL
>>> wrapper?
>>>
>>> The TDX module version we need already supports enum_topology, so the code:
>>>   	if (modinfo->tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)
>>>   		err = tdh_vp_init_apicid(tdx, vcpu_rcx, vcpu->vcpu_id);
>>>   	else
>>>   		err = tdh_vp_init(tdx, vcpu_rcx);
>>>
>>> The tdh_vp_init() branch shouldn't be hit.
>>
>> We cannot know what version of TDX module user might use thus we cannot
>> assume enum_topology is always there unless we make it a hard
>> requirement in KVM that TDX fails being enabled when
>>
>>     !(modinfo->tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)
> 
> We will depend on bugs that are fixed in TDX Modules after enum topology, so it
> shouldn't be required in the normal case. So I think it would be simpler to add
> this tdx_features0 conditional. We can then export one less SEAMCALL and will
> have less configurations flows to worry about on the KVM side.

I'm a little bit confused. what does "add this tdx_feature0 conditional" 
mean?
Edgecombe, Rick P Oct. 26, 2024, 1:12 a.m. UTC | #13
On Mon, 2024-10-21 at 16:35 +0800, Xiaoyao Li wrote:
> > 
> > How about adding it to the docs?
> 
> OK for me.

Can you propose something?

> 
> > > 
> > > It's architectural valid that userspace VMM creates a TD with legacy
> > > topology, i.e., topology enumerated via CPUID 0x1 and 0x4.
> > > 
> > > > In that case, do you see a need for the vanilla tdh_vp_init() SEAMCALL
> > > > wrapper?
> > > > 
> > > > The TDX module version we need already supports enum_topology, so the
> > > > code:
> > > >    	if (modinfo->tdx_features0 &
> > > > MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)
> > > >    		err = tdh_vp_init_apicid(tdx, vcpu_rcx, vcpu->vcpu_id);
> > > >    	else
> > > >    		err = tdh_vp_init(tdx, vcpu_rcx);
> > > > 
> > > > The tdh_vp_init() branch shouldn't be hit.
> > > 
> > > We cannot know what version of TDX module user might use thus we cannot
> > > assume enum_topology is always there unless we make it a hard
> > > requirement in KVM that TDX fails being enabled when
> > > 
> > >      !(modinfo->tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)
> > 
> > We will depend on bugs that are fixed in TDX Modules after enum topology, so
> > it
> > shouldn't be required in the normal case. So I think it would be simpler to
> > add
> > this tdx_features0 conditional. We can then export one less SEAMCALL and
> > will
> > have less configurations flows to worry about on the KVM side.
> 
> I'm a little bit confused. what does "add this tdx_feature0 conditional" 
> mean?

I was talking about your suggestion to check for
MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 12ee66bc9026..5dd7955376e3 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -126,6 +126,7 @@  KVM_X86_OP(enable_smi_window)
 #endif
 KVM_X86_OP_OPTIONAL(dev_get_attr)
 KVM_X86_OP(mem_enc_ioctl)
+KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl)
 KVM_X86_OP_OPTIONAL(mem_enc_register_region)
 KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
 KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 188cd684bffb..e3094c843556 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1829,6 +1829,7 @@  struct kvm_x86_ops {
 
 	int (*dev_get_attr)(u32 group, u64 attr, u64 *val);
 	int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
+	int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp);
 	int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
 	int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
 	int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 95ae2d4a4697..b4f12997052d 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -930,6 +930,7 @@  struct kvm_hyperv_eventfd {
 enum kvm_tdx_cmd_id {
 	KVM_TDX_CAPABILITIES = 0,
 	KVM_TDX_INIT_VM,
+	KVM_TDX_INIT_VCPU,
 
 	KVM_TDX_CMD_NR_MAX,
 };
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index d40de73d2bd3..e34cb476cc78 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -116,6 +116,14 @@  static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
 	return tdx_vm_ioctl(kvm, argp);
 }
 
+static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+{
+	if (!is_td_vcpu(vcpu))
+		return -EINVAL;
+
+	return tdx_vcpu_ioctl(vcpu, argp);
+}
+
 #define VMX_REQUIRED_APICV_INHIBITS				\
 	(BIT(APICV_INHIBIT_REASON_DISABLED) |			\
 	 BIT(APICV_INHIBIT_REASON_ABSENT) |			\
@@ -268,6 +276,7 @@  struct kvm_x86_ops vt_x86_ops __initdata = {
 	.get_untagged_addr = vmx_get_untagged_addr,
 
 	.mem_enc_ioctl = vt_mem_enc_ioctl,
+	.vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl,
 };
 
 struct kvm_x86_init_ops vt_init_ops __initdata = {
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 18738cacbc87..ba7b436fae86 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -89,6 +89,11 @@  static __always_inline hpa_t set_hkid_to_hpa(hpa_t pa, u16 hkid)
 	return pa | ((hpa_t)hkid << boot_cpu_data.x86_phys_bits);
 }
 
+static inline bool is_td_vcpu_created(struct vcpu_tdx *tdx)
+{
+	return tdx->td_vcpu_created;
+}
+
 static inline bool is_td_created(struct kvm_tdx *kvm_tdx)
 {
 	return kvm_tdx->tdr_pa;
@@ -105,6 +110,11 @@  static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
 	return kvm_tdx->hkid > 0;
 }
 
+static inline bool is_td_finalized(struct kvm_tdx *kvm_tdx)
+{
+	return kvm_tdx->finalized;
+}
+
 static void tdx_clear_page(unsigned long page_pa)
 {
 	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
@@ -293,6 +303,15 @@  static inline u8 tdx_sysinfo_nr_tdcs_pages(void)
 	return tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
 }
 
+static inline u8 tdx_sysinfo_nr_tdcx_pages(void)
+{
+	/*
+	 * TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages).
+	 * -1 for TDVPR.
+	 */
+	return tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1;
+}
+
 void tdx_vm_free(struct kvm *kvm)
 {
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
@@ -405,7 +424,29 @@  int tdx_vcpu_create(struct kvm_vcpu *vcpu)
 
 void tdx_vcpu_free(struct kvm_vcpu *vcpu)
 {
-	/* This is stub for now.  More logic will come. */
+	struct vcpu_tdx *tdx = to_tdx(vcpu);
+	int i;
+
+	/*
+	 * This methods can be called when vcpu allocation/initialization
+	 * failed. So it's possible that hkid, tdvpx and tdvpr are not assigned
+	 * yet.
+	 */
+	if (is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
+		return;
+
+	if (tdx->tdcx_pa) {
+		for (i = 0; i < tdx_sysinfo_nr_tdcx_pages(); i++) {
+			if (tdx->tdcx_pa[i])
+				tdx_reclaim_control_page(tdx->tdcx_pa[i]);
+		}
+		kfree(tdx->tdcx_pa);
+		tdx->tdcx_pa = NULL;
+	}
+	if (tdx->tdvpr_pa) {
+		tdx_reclaim_control_page(tdx->tdvpr_pa);
+		tdx->tdvpr_pa = 0;
+	}
 }
 
 void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -414,8 +455,13 @@  void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 	/* Ignore INIT silently because TDX doesn't support INIT event. */
 	if (init_event)
 		return;
+	if (is_td_vcpu_created(to_tdx(vcpu)))
+		return;
 
-	/* This is stub for now. More logic will come here. */
+	/*
+	 * Don't update mp_state to runnable because more initialization
+	 * is needed by TDX_VCPU_INIT.
+	 */
 }
 
 static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
@@ -884,6 +930,149 @@  int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
 	return r;
 }
 
+/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */
+static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
+{
+	const struct tdx_sysinfo_module_info *modinfo = &tdx_sysinfo->module_info;
+	struct vcpu_tdx *tdx = to_tdx(vcpu);
+	unsigned long va;
+	int ret, i;
+	u64 err;
+
+	if (is_td_vcpu_created(tdx))
+		return -EINVAL;
+
+	/*
+	 * vcpu_free method frees allocated pages.  Avoid partial setup so
+	 * that the method can't handle it.
+	 */
+	va = __get_free_page(GFP_KERNEL_ACCOUNT);
+	if (!va)
+		return -ENOMEM;
+	tdx->tdvpr_pa = __pa(va);
+
+	tdx->tdcx_pa = kcalloc(tdx_sysinfo_nr_tdcx_pages(), sizeof(*tdx->tdcx_pa),
+			   GFP_KERNEL_ACCOUNT);
+	if (!tdx->tdcx_pa) {
+		ret = -ENOMEM;
+		goto free_tdvpr;
+	}
+
+	err = tdh_vp_create(tdx);
+	if (KVM_BUG_ON(err, vcpu->kvm)) {
+		tdx->tdvpr_pa = 0;
+		ret = -EIO;
+		pr_tdx_error(TDH_VP_CREATE, err);
+		goto free_tdvpx;
+	}
+
+	for (i = 0; i < tdx_sysinfo_nr_tdcx_pages(); i++) {
+		va = __get_free_page(GFP_KERNEL_ACCOUNT);
+		if (!va) {
+			ret = -ENOMEM;
+			goto free_tdvpx;
+		}
+		tdx->tdcx_pa[i] = __pa(va);
+
+		err = tdh_vp_addcx(tdx, tdx->tdcx_pa[i]);
+		if (KVM_BUG_ON(err, vcpu->kvm)) {
+			pr_tdx_error(TDH_VP_ADDCX, err);
+			/* vcpu_free method frees TDCX and TDR donated to TDX */
+			return -EIO;
+		}
+	}
+
+	if (modinfo->tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)
+		err = tdh_vp_init_apicid(tdx, vcpu_rcx, vcpu->vcpu_id);
+	else
+		err = tdh_vp_init(tdx, vcpu_rcx);
+
+	if (KVM_BUG_ON(err, vcpu->kvm)) {
+		pr_tdx_error(TDH_VP_INIT, err);
+		return -EIO;
+	}
+
+	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+	tdx->td_vcpu_created = true;
+
+	return 0;
+
+free_tdvpx:
+	for (i = 0; i < tdx_sysinfo_nr_tdcx_pages(); i++) {
+		if (tdx->tdcx_pa[i])
+			free_page((unsigned long)__va(tdx->tdcx_pa[i]));
+		tdx->tdcx_pa[i] = 0;
+	}
+	kfree(tdx->tdcx_pa);
+	tdx->tdcx_pa = NULL;
+
+free_tdvpr:
+	if (tdx->tdvpr_pa)
+		free_page((unsigned long)__va(tdx->tdvpr_pa));
+	tdx->tdvpr_pa = 0;
+
+	return ret;
+}
+
+static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
+{
+	struct msr_data apic_base_msr;
+	struct vcpu_tdx *tdx = to_tdx(vcpu);
+	int ret;
+
+	if (cmd->flags)
+		return -EINVAL;
+	if (tdx->initialized)
+		return -EINVAL;
+
+	/*
+	 * As TDX requires X2APIC, set local apic mode to X2APIC.  User space
+	 * VMM, e.g. qemu, is required to set CPUID[0x1].ecx.X2APIC=1 by
+	 * KVM_SET_CPUID2.  Otherwise kvm_set_apic_base() will fail.
+	 */
+	apic_base_msr = (struct msr_data) {
+		.host_initiated = true,
+		.data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
+		(kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0),
+	};
+	if (kvm_set_apic_base(vcpu, &apic_base_msr))
+		return -EINVAL;
+
+	ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);
+	if (ret)
+		return ret;
+
+	tdx->initialized = true;
+	return 0;
+}
+
+int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+{
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+	struct kvm_tdx_cmd cmd;
+	int ret;
+
+	if (!is_hkid_assigned(kvm_tdx) || is_td_finalized(kvm_tdx))
+		return -EINVAL;
+
+	if (copy_from_user(&cmd, argp, sizeof(cmd)))
+		return -EFAULT;
+
+	if (cmd.hw_error)
+		return -EINVAL;
+
+	switch (cmd.id) {
+	case KVM_TDX_INIT_VCPU:
+		ret = tdx_vcpu_init(vcpu, &cmd);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
 #define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE)
 
 static int __init setup_kvm_tdx_caps(void)
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index ca948f26b755..8349b542836e 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -22,6 +22,8 @@  struct kvm_tdx {
 	u64 xfam;
 	int hkid;
 
+	bool finalized;
+
 	u64 tsc_offset;
 };
 
@@ -29,6 +31,10 @@  struct vcpu_tdx {
 	struct kvm_vcpu	vcpu;
 
 	unsigned long tdvpr_pa;
+	unsigned long *tdcx_pa;
+	bool td_vcpu_created;
+
+	bool initialized;
 
 	/*
 	 * Dummy to make pmu_intel not corrupt memory.
diff --git a/arch/x86/kvm/vmx/tdx_arch.h b/arch/x86/kvm/vmx/tdx_arch.h
index 413619dd92ef..d2d7f9cab740 100644
--- a/arch/x86/kvm/vmx/tdx_arch.h
+++ b/arch/x86/kvm/vmx/tdx_arch.h
@@ -155,4 +155,6 @@  struct td_params {
 #define TDX_MIN_TSC_FREQUENCY_KHZ		(100 * 1000)
 #define TDX_MAX_TSC_FREQUENCY_KHZ		(10 * 1000 * 1000)
 
+#define MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM	BIT_ULL(20)
+
 #endif /* __KVM_X86_TDX_ARCH_H */
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index e1d3276b0f60..55fd17fbfd19 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -129,6 +129,8 @@  int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
 int tdx_vcpu_create(struct kvm_vcpu *vcpu);
 void tdx_vcpu_free(struct kvm_vcpu *vcpu);
 void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
+
+int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
 #else
 static inline int tdx_vm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 {
@@ -143,6 +145,8 @@  static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOP
 static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; }
 static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {}
 static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {}
+
+static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; }
 #endif
 
 #endif /* __KVM_X86_VMX_X86_OPS_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9cee326f5e7a..3d43fa84c2b4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6314,6 +6314,12 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 	case KVM_SET_DEVICE_ATTR:
 		r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
 		break;
+	case KVM_MEMORY_ENCRYPT_OP:
+		r = -ENOTTY;
+		if (!kvm_x86_ops.vcpu_mem_enc_ioctl)
+			goto out;
+		r = kvm_x86_ops.vcpu_mem_enc_ioctl(vcpu, argp);
+		break;
 	default:
 		r = -EINVAL;
 	}