diff mbox series

[14/25] KVM: TDX: initialize VM with TDX specific parameters

Message ID 20240812224820.34826-15-rick.p.edgecombe@intel.com (mailing list archive)
State New, archived
Headers show
Series TDX vCPU/VM creation | expand

Commit Message

Edgecombe, Rick P Aug. 12, 2024, 10:48 p.m. UTC
From: Isaku Yamahata <isaku.yamahata@intel.com>

After the crypto-protection key has been configured, TDX requires a
VM-scope initialization as a step of creating the TDX guest.  This
"per-VM" TDX initialization does the global configurations/features that
the TDX guest can support, such as guest's CPUIDs (emulated by the TDX
module), the maximum number of vcpus etc.

This "per-VM" TDX initialization must be done before any "vcpu-scope" TDX
initialization.  To match this better, require the KVM_TDX_INIT_VM IOCTL()
to be done before KVM creates any vcpus.

Co-developed-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
---
uAPI breakout v1:
 - Drop TDX_TD_XFAM_CET and use XFEATURE_MASK_CET_{USER, KERNEL}.
 - Update for the wrapper functions for SEAMCALLs. (Sean)
 - Move gfn_shared_mask settings into this patch due to MMU section move
 - Fix bisectability issues in headers (Kai)
 - Updates from seamcall overhaul (Kai)
 - Allow userspace configure xfam directly
 - Check if user sets non-configurable bits in CPUIDs
 - Rename error->hw_error
 - Move the code change made in tdx_module_setup() to __tdx_bringup(),
   because initialization is now done in post hardware_setup() and
   tdx_module_setup() has been removed.  Remove the code that used the API
   to read global metadata and use the exported 'struct tdx_sysinfo'
   pointer instead.
 - Replace 'tdx_info->nr_tdcs_pages' with a wrapper
   tdx_sysinfo_nr_tdcs_pages() because the 'struct tdx_sysinfo' doesn't
   have nr_tdcs_pages directly.
 - Replace tdx_info->max_vcpus_per_td with the new exported pointer in
   tdx_vm_init().
 - Decrease the reserved space for struct kvm_tdx_init_vm (Kai)
 - Use sizeof_field() for struct kvm_tdx_init_vm cpuids (Tony)
 - No need to init init_vm, it gets copied over in tdx_td_init() (Chao)
 - Use kmalloc() instead of kzalloc() for init_vm in tdx_td_init() (Chao)
 - Add more line breaks to tdx_td_init() to make code easier to read (Tony)
 - Clarify patch description (Kai)

v19:
 - Check NO_RBP_MOD of feature0 and set it
 - Update the comment for PT and CET

v18:
 - remove the change of tools/arch/x86/include/uapi/asm/kvm.h
 - typo in comment. sha348 => sha384
 - updated comment in setup_tdparams_xfam()
 - fix setup_tdparams_xfam() to use init_vm instead of td_params

v16:
 - Removed AMX check as the KVM upstream supports AMX.
 - Added CET flag to guest supported xss
---
 arch/x86/include/uapi/asm/kvm.h |  24 ++++
 arch/x86/kvm/cpuid.c            |   7 +
 arch/x86/kvm/cpuid.h            |   2 +
 arch/x86/kvm/vmx/tdx.c          | 237 ++++++++++++++++++++++++++++++--
 arch/x86/kvm/vmx/tdx.h          |   4 +
 arch/x86/kvm/vmx/tdx_ops.h      |  12 ++
 6 files changed, 276 insertions(+), 10 deletions(-)

Comments

Nikolay Borisov Aug. 19, 2024, 3:35 p.m. UTC | #1
On 13.08.24 г. 1:48 ч., Rick Edgecombe wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> After the crypto-protection key has been configured, TDX requires a
> VM-scope initialization as a step of creating the TDX guest.  This
> "per-VM" TDX initialization does the global configurations/features that
> the TDX guest can support, such as guest's CPUIDs (emulated by the TDX
> module), the maximum number of vcpus etc.
> 
> This "per-VM" TDX initialization must be done before any "vcpu-scope" TDX
> initialization.  To match this better, require the KVM_TDX_INIT_VM IOCTL()
> to be done before KVM creates any vcpus.
> 
> Co-developed-by: Xiaoyao Li <xiaoyao.li@intel.com>
> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> ---
> uAPI breakout v1:
>   - Drop TDX_TD_XFAM_CET and use XFEATURE_MASK_CET_{USER, KERNEL}.
>   - Update for the wrapper functions for SEAMCALLs. (Sean)
>   - Move gfn_shared_mask settings into this patch due to MMU section move
>   - Fix bisectability issues in headers (Kai)
>   - Updates from seamcall overhaul (Kai)
>   - Allow userspace configure xfam directly
>   - Check if user sets non-configurable bits in CPUIDs
>   - Rename error->hw_error
>   - Move code change to tdx_module_setup() to __tdx_bringup() due to
>     initializing is done in post hardware_setup() now and
>     tdx_module_setup() is removed.  Remove the code to use API to read
>     global metadata but use exported 'struct tdx_sysinfo' pointer.
>   - Replace 'tdx_info->nr_tdcs_pages' with a wrapper
>     tdx_sysinfo_nr_tdcs_pages() because the 'struct tdx_sysinfo' doesn't
>     have nr_tdcs_pages directly.
>   - Replace tdx_info->max_vcpus_per_td with the new exported pointer in
>     tdx_vm_init().
>   - Decrease the reserved space for struct kvm_tdx_init_vm (Kai)
>   - Use sizeof_field() for struct kvm_tdx_init_vm cpuids (Tony)
>   - No need to init init_vm, it gets copied over in tdx_td_init() (Chao)
>   - Use kmalloc() instead of () kzalloc for init_vm in tdx_td_init() (Chao)
>   - Add more line breaks to tdx_td_init() to make code easier to read (Tony)
>   - Clarify patch description (Kai)
> 
> v19:
>   - Check NO_RBP_MOD of feature0 and set it
>   - Update the comment for PT and CET
> 
> v18:
>   - remove the change of tools/arch/x86/include/uapi/asm/kvm.h
>   - typo in comment. sha348 => sha384
>   - updated comment in setup_tdparams_xfam()
>   - fix setup_tdparams_xfam() to use init_vm instead of td_params
> 
> v16:
>   - Removed AMX check as the KVM upstream supports AMX.
>   - Added CET flag to guest supported xss
> ---
>   arch/x86/include/uapi/asm/kvm.h |  24 ++++
>   arch/x86/kvm/cpuid.c            |   7 +
>   arch/x86/kvm/cpuid.h            |   2 +
>   arch/x86/kvm/vmx/tdx.c          | 237 ++++++++++++++++++++++++++++++--
>   arch/x86/kvm/vmx/tdx.h          |   4 +
>   arch/x86/kvm/vmx/tdx_ops.h      |  12 ++
>   6 files changed, 276 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index 2e3caa5a58fd..95ae2d4a4697 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -929,6 +929,7 @@ struct kvm_hyperv_eventfd {
>   /* Trust Domain eXtension sub-ioctl() commands. */
>   enum kvm_tdx_cmd_id {
>   	KVM_TDX_CAPABILITIES = 0,
> +	KVM_TDX_INIT_VM,
>   
>   	KVM_TDX_CMD_NR_MAX,
>   };
> @@ -970,4 +971,27 @@ struct kvm_tdx_capabilities {
>   	struct kvm_tdx_cpuid_config cpuid_configs[];
>   };
>   
> +struct kvm_tdx_init_vm {
> +	__u64 attributes;
> +	__u64 xfam;
> +	__u64 mrconfigid[6];	/* sha384 digest */
> +	__u64 mrowner[6];	/* sha384 digest */
> +	__u64 mrownerconfig[6];	/* sha384 digest */
> +
> +	/* The total space for TD_PARAMS before the CPUIDs is 256 bytes */
> +	__u64 reserved[12];
> +
> +	/*
> +	 * Call KVM_TDX_INIT_VM before vcpu creation, thus before
> +	 * KVM_SET_CPUID2.
> +	 * This configuration supersedes KVM_SET_CPUID2s for VCPUs because the
> +	 * TDX module directly virtualizes those CPUIDs without VMM.  The user
> +	 * space VMM, e.g. qemu, should make KVM_SET_CPUID2 consistent with
> +	 * those values.  If it doesn't, KVM may have wrong idea of vCPUIDs of
> +	 * the guest, and KVM may wrongly emulate CPUIDs or MSRs that the TDX
> +	 * module doesn't virtualize.
> +	 */
> +	struct kvm_cpuid2 cpuid;
> +};
> +
>   #endif /* _ASM_X86_KVM_H */
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index 2617be544480..7310d8a8a503 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -1487,6 +1487,13 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
>   	return r;
>   }
>   
> +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(
> +	struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index)
> +{
> +	return cpuid_entry2_find(entries, nent, function, index);
> +}
> +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry2);
> +
>   struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
>   						    u32 function, u32 index)
>   {
> diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
> index 41697cca354e..00570227e2ae 100644
> --- a/arch/x86/kvm/cpuid.h
> +++ b/arch/x86/kvm/cpuid.h
> @@ -13,6 +13,8 @@ void kvm_set_cpu_caps(void);
>   
>   void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
>   void kvm_update_pv_runtime(struct kvm_vcpu *vcpu);
> +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(struct kvm_cpuid_entry2 *entries,
> +					       int nent, u32 function, u64 index);
>   struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
>   						    u32 function, u32 index);
>   struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index a0954c3928e2..a6c711715a4a 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -7,6 +7,7 @@
>   #include "tdx.h"
>   #include "tdx_ops.h"
>   
> +
>   #undef pr_fmt
>   #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>   
> @@ -356,12 +357,16 @@ static int tdx_do_tdh_mng_key_config(void *param)
>   	return 0;
>   }
>   
> -static int __tdx_td_init(struct kvm *kvm);
> -
>   int tdx_vm_init(struct kvm *kvm)
>   {
>   	kvm->arch.has_private_mem = true;
>   
> +	/*
> +	 * This function initializes only KVM software construct.  It doesn't
> +	 * initialize TDX stuff, e.g. TDCS, TDR, TDCX, HKID etc.
> +	 * It is handled by KVM_TDX_INIT_VM, __tdx_td_init().
> +	 */

If you need to put a comment like that it means the function has the 
wrong name.

> +
>   	/*
>   	 * TDX has its own limit of the number of vcpus in addition to
>   	 * KVM_MAX_VCPUS.

<snip>

> +
> +static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
> +			 u64 *seamcall_err)

What criteria did you use to split __tdx_td_init from tdx_td_init? Seems
somewhat arbitrary. I think it's best if the TD VM init code is in a
single function; yes, it will be rather large, but the code should be
self-explanatory and fairly linear. Additionally, I think some of the
code can be factored out into more specific helpers, i.e. the key
programming bits can be a separate helper.

>   {
>   	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
>   	cpumask_var_t packages;
> @@ -427,8 +547,9 @@ static int __tdx_td_init(struct kvm *kvm)
>   	unsigned long tdr_pa = 0;
>   	unsigned long va;
>   	int ret, i;
> -	u64 err;
> +	u64 err, rcx;
>   
> +	*seamcall_err = 0;
>   	ret = tdx_guest_keyid_alloc();
>   	if (ret < 0)
>   		return ret;
> @@ -543,10 +664,23 @@ static int __tdx_td_init(struct kvm *kvm)
>   		}
>   	}
>   
> -	/*
> -	 * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires a dedicated
> -	 * ioctl() to define the configure CPUID values for the TD.
> -	 */
> +	err = tdh_mng_init(kvm_tdx, __pa(td_params), &rcx);
> +	if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
> +		/*
> +		 * Because a user gives operands, don't warn.
> +		 * Return a hint to the user because it's sometimes hard for the
> +		 * user to figure out which operand is invalid.  SEAMCALL status
> +		 * code includes which operand caused invalid operand error.
> +		 */
> +		*seamcall_err = err;
> +		ret = -EINVAL;
> +		goto teardown;
> +	} else if (WARN_ON_ONCE(err)) {
> +		pr_tdx_error_1(TDH_MNG_INIT, err, rcx);
> +		ret = -EIO;
> +		goto teardown;
> +	}
> +
>   	return 0;
>   
>   	/*
> @@ -592,6 +726,86 @@ static int __tdx_td_init(struct kvm *kvm)
>   	return ret;
>   }
>   
> +static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
> +{
> +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> +	struct kvm_tdx_init_vm *init_vm;
> +	struct td_params *td_params = NULL;
> +	int ret;
> +
> +	BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
> +	BUILD_BUG_ON(sizeof(struct td_params) != 1024);
> +
> +	if (is_hkid_assigned(kvm_tdx))
> +		return -EINVAL;
> +
> +	if (cmd->flags)
> +		return -EINVAL;
> +
> +	init_vm = kmalloc(sizeof(*init_vm) +
> +			  sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
> +			  GFP_KERNEL);
> +	if (!init_vm)
> +		return -ENOMEM;
> +
> +	if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +
> +	if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
> +		ret = -E2BIG;
> +		goto out;
> +	}
> +
> +	if (copy_from_user(init_vm->cpuid.entries,
> +			   u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
> +			   flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +
> +	if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (init_vm->cpuid.padding) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
> +	if (!td_params) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	ret = setup_tdparams(kvm, td_params, init_vm);
> +	if (ret)
> +		goto out;
> +
> +	ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
> +	if (ret)
> +		goto out;
> +
> +	kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
> +	kvm_tdx->attributes = td_params->attributes;
> +	kvm_tdx->xfam = td_params->xfam;
> +
> +	if (td_params->exec_controls & TDX_EXEC_CONTROL_MAX_GPAW)
> +		kvm->arch.gfn_direct_bits = gpa_to_gfn(BIT_ULL(51));
> +	else
> +		kvm->arch.gfn_direct_bits = gpa_to_gfn(BIT_ULL(47));
> +
> +out:
> +	/* kfree() accepts NULL. */
> +	kfree(init_vm);
> +	kfree(td_params);
> +
> +	return ret;
> +}
> +
>   int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
>   {
>   	struct kvm_tdx_cmd tdx_cmd;
> @@ -613,6 +827,9 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
>   	case KVM_TDX_CAPABILITIES:
>   		r = tdx_get_capabilities(&tdx_cmd);
>   		break;
> +	case KVM_TDX_INIT_VM:
> +		r = tdx_td_init(kvm, &tdx_cmd);
> +		break;
>   	default:
>   		r = -EINVAL;
>   		goto out;
> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index 268959d0f74f..8912cb6d5bc2 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -16,7 +16,11 @@ struct kvm_tdx {
>   	unsigned long tdr_pa;
>   	unsigned long *tdcs_pa;
>   
> +	u64 attributes;
> +	u64 xfam;
>   	int hkid;
> +
> +	u64 tsc_offset;
>   };
>   
>   struct vcpu_tdx {
> diff --git a/arch/x86/kvm/vmx/tdx_ops.h b/arch/x86/kvm/vmx/tdx_ops.h
> index 3f64c871a3f2..0363d8544f42 100644
> --- a/arch/x86/kvm/vmx/tdx_ops.h
> +++ b/arch/x86/kvm/vmx/tdx_ops.h
> @@ -399,4 +399,16 @@ static inline u64 tdh_vp_wr(struct vcpu_tdx *tdx, u64 field, u64 val, u64 mask)
>   	return seamcall(TDH_VP_WR, &in);
>   }
>   
> +static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 field)
> +{
> +	u64 err, data;
> +
> +	err = tdh_mng_rd(kvm_tdx, TDCS_EXEC(field), &data);
> +	if (unlikely(err)) {
> +		pr_err("TDH_MNG_RD[EXEC.0x%x] failed: 0x%llx\n", field, err);
> +		return 0;
> +	}
> +	return data;
> +}
> +
>   #endif /* __KVM_X86_TDX_OPS_H */
Edgecombe, Rick P Aug. 21, 2024, 12:01 a.m. UTC | #2
On Mon, 2024-08-19 at 18:35 +0300, Nikolay Borisov wrote:
> > +       /*
> > +        * This function initializes only KVM software construct.  It
> > doesn't
> > +        * initialize TDX stuff, e.g. TDCS, TDR, TDCX, HKID etc.
> > +        * It is handled by KVM_TDX_INIT_VM, __tdx_td_init().
> > +        */
> 
> If you need to put a comment like that it means the function has the 
> wrong name.

This comment is pretty weird. The name seems to come from the pattern of the tdx
specific x86_ops callbacks. As in:

vcpu_create()
	vt_vcpu_create()
		vmx_vcpu_create()
		tdx_vcpu_create()

..matches to:

vm_init()
	vt_vm_init()
		tdx_vm_init()
		vmx_vm_init()

Maybe we should try to come up with some other prefix that makes it clearer that
these are x86_ops callbacks.

> 
> > +
> >         /*
> >          * TDX has its own limit of the number of vcpus in addition to
> >          * KVM_MAX_VCPUS.
> 
> <snip>
> 
> > +
> > +static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
> > +                        u64 *seamcall_err)
> 
> What criteria did you use to split __tdx_td_init from tdx_td_init? Seems 
> somewhar arbitrary, I think it's best if the TD VM init code is in a 
> single function, yet it will be rather large but the code should be 
> self-explanatory and fairly linear. Additionally I think some of the 
> code can be factored out in more specific helpers i.e the key 
> programming bits can be a separate helper.

It looks like it has been like that since 2022. I couldn't find any reasoning.

I agree this could be organized better.
Yan Zhao Aug. 29, 2024, 6:27 a.m. UTC | #3
On Mon, Aug 12, 2024 at 03:48:09PM -0700, Rick Edgecombe wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
...
> +static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
> +{
> +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> +	struct kvm_tdx_init_vm *init_vm;
> +	struct td_params *td_params = NULL;
> +	int ret;
> +
> +	BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
> +	BUILD_BUG_ON(sizeof(struct td_params) != 1024);
> +
> +	if (is_hkid_assigned(kvm_tdx))
> +		return -EINVAL;
> +
> +	if (cmd->flags)
> +		return -EINVAL;
> +
> +	init_vm = kmalloc(sizeof(*init_vm) +
> +			  sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
> +			  GFP_KERNEL);
> +	if (!init_vm)
> +		return -ENOMEM;
> +
> +	if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +
> +	if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
> +		ret = -E2BIG;
> +		goto out;
> +	}
> +
> +	if (copy_from_user(init_vm->cpuid.entries,
> +			   u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
> +			   flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +
> +	if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (init_vm->cpuid.padding) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
> +	if (!td_params) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	ret = setup_tdparams(kvm, td_params, init_vm);
> +	if (ret)
> +		goto out;
> +
> +	ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
> +	if (ret)
> +		goto out;
> +
> +	kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
> +	kvm_tdx->attributes = td_params->attributes;
> +	kvm_tdx->xfam = td_params->xfam;
> +
> +	if (td_params->exec_controls & TDX_EXEC_CONTROL_MAX_GPAW)
> +		kvm->arch.gfn_direct_bits = gpa_to_gfn(BIT_ULL(51));
> +	else
> +		kvm->arch.gfn_direct_bits = gpa_to_gfn(BIT_ULL(47));
> +
Could we introduce an initialized field in struct kvm_tdx and set it to true
here? e.g.
+       kvm_tdx->initialized = true;

Then reject vCPU creation in tdx_vcpu_create() before KVM_TDX_INIT_VM is
executed successfully? e.g.

@@ -584,6 +589,9 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
        struct vcpu_tdx *tdx = to_tdx(vcpu);

+       if (!kvm_tdx->initialized)
+               return -EIO;
+
        /* TDX only supports x2APIC, which requires an in-kernel local APIC. */
        if (!vcpu->arch.apic)
                return -EINVAL;

Allowing vCPU creation only after TD is initialized can prevent unexpected
userspace access to uninitialized TD primitives.
See details in the next comment.

> +out:
> +	/* kfree() accepts NULL. */
> +	kfree(init_vm);
> +	kfree(td_params);
> +
> +	return ret;
> +}
> +
>  int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
>  {
>  	struct kvm_tdx_cmd tdx_cmd;
> @@ -613,6 +827,9 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
>  	case KVM_TDX_CAPABILITIES:
>  		r = tdx_get_capabilities(&tdx_cmd);
>  		break;
> +	case KVM_TDX_INIT_VM:
> +		r = tdx_td_init(kvm, &tdx_cmd);
> +		break;
>  	default:
>  		r = -EINVAL;
>  		goto out;


QEMU should invoke VM ioctl KVM_TDX_INIT_VM in tdx_pre_create_vcpu() before
creating vCPUs via VM ioctl KVM_CREATE_VCPU, but KVM should not count on
userspace always doing the right thing.
E.g. running the selftest below produces a warning in KVM due to a
td_vmcs_write64() error in tdx_load_mmu_pgd().

/*
 * Negative selftest: create a TD, add a vCPU and run it.
 *
 * NOTE(review): per the surrounding discussion, the point is that
 * KVM_TDX_INIT_VM is never issued before the vCPU is created; this presumes
 * td_create() does not issue it itself -- confirm against the selftest
 * library.
 */
void verify_td_negative_test(void)
{
        struct kvm_vm *vm;
        struct kvm_vcpu *vcpu;

        vm = td_create();
        vm_enable_cap(vm, KVM_CAP_SPLIT_IRQCHIP, 24);
        /* The vCPU is created directly after the capability is enabled. */
        vcpu = __vm_vcpu_add(vm, 0);
        /* Reported to trigger the tdx_load_mmu_pgd() warning in the log below. */
        vcpu_run(vcpu);
        kvm_vm_free(vm);
}


[ 5600.721996] WARNING: CPU: 116 PID: 7914 at arch/x86/kvm/vmx/tdx.h:237 tdx_load_mmu_pgd+0x55/0xa0 [kvm_intel] 
[ 5600.735999] Modules linked in: kvm_intel kvm idxd i2c_i801 nls_iso8859_1 i2c_smbus i2c_ismt nls_cp437 squashfs hid_generic crct10dif_pclmul crc32_pclmul ghash_clmulni_intel aesni_intel crypto_simd cryptd [last unloaded: kvm]
[ 5600.762904] CPU: 116 PID: 7914 Comm: tdx_vm_tests Not tainted 6.10.0-rc7-upstream+ #278 5e882f76313c2b130a0f7525b7eda06f47d8ea02
[ 5600.779772] Hardware name: Intel Corporation ArcherCity/ArcherCity, BIOS EGSDCRB1.SYS.0101.D29.2303301937 03/30/2023
[ 5600.795940] RIP: 0010:tdx_load_mmu_pgd+0x55/0xa0 [kvm_intel]                  
[ 5600.805013] Code: 00 e8 8f b4 ff ff 48 85 c0 74 52 49 89 c5 48 8b 03 44 0f b6 b0 89 a3 00 00 41 80 fe 01 0f 87 ae 74 00 00 41 83 e6 01 75 1d 90 <0f> 0b 90 48 8b 3b b8 01 01 00 00 be 01 03 00 00 66 89 87 89 a3 00
[ 5600.833286] RSP: 0018:ff3550cf49297c78 EFLAGS: 00010246                       
[ 5600.842233] RAX: ff3550cf4dfd9000 RBX: ff2c5edc10600000 RCX: 0000000000000000 
[ 5600.853400] RDX: 0000000000000000 RSI: ff3550cf49297be8 RDI: 000000000000002b 
[ 5600.864609] RBP: ff3550cf49297c98 R08: 0000000000000000 R09: ffffffffffffffff 
[ 5600.875915] R10: 0000000000000000 R11: 0000000000000000 R12: 000000048d10c000 
[ 5600.887255] R13: c000030000000001 R14: 0000000000000000 R15: 0000000000000000 
[ 5600.898584] FS:  00007f9597799740(0000) GS:ff2c5ee7ad700000(0000) knlGS:0000000000000000
[ 5600.911113] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033                 
[ 5600.921064] CR2: 00007f959759b8c0 CR3: 000000010b83e005 CR4: 0000000000773ef0 
[ 5600.932675] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 
[ 5600.944319] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 
[ 5600.955987] PKRU: 55555554                                                    
[ 5600.962665] Call Trace:                                                       
[ 5600.969084]  <TASK>                                                           
[ 5600.975079]  ? show_regs+0x64/0x70                                            
[ 5600.982536]  ? __warn+0x8a/0x100                                              
[ 5600.989840]  ? tdx_load_mmu_pgd+0x55/0xa0 [kvm_intel b63d7b2e0213930160302a21a156d5f897483840]
[ 5601.006321]  ? report_bug+0x1b6/0x220                                         
[ 5601.014351]  ? handle_bug+0x43/0x80                                           
[ 5601.022248]  ? exc_invalid_op+0x18/0x70                                       
[ 5601.030554]  ? asm_exc_invalid_op+0x1b/0x20                                   
[ 5601.039297]  ? tdx_load_mmu_pgd+0x55/0xa0 [kvm_intel b63d7b2e0213930160302a21a156d5f897483840]
[ 5601.056276]  ? tdx_load_mmu_pgd+0x31/0xa0 [kvm_intel b63d7b2e0213930160302a21a156d5f897483840]
[ 5601.073270]  vt_load_mmu_pgd+0x57/0x70 [kvm_intel b63d7b2e0213930160302a21a156d5f897483840]
[ 5601.089991]  kvm_mmu_load+0xa4/0xc0 [kvm 2979fa2240d2f299e1c4576243100dec1104b4cd]
[ 5601.102708]  vcpu_enter_guest+0xbe2/0x1140 [kvm 2979fa2240d2f299e1c4576243100dec1104b4cd]
[ 5601.116042]  ? __this_cpu_preempt_check+0x13/0x20                             
[ 5601.125373]  ? debug_smp_processor_id+0x17/0x20                               
[ 5601.134400]  vcpu_run+0x4d/0x280 [kvm 2979fa2240d2f299e1c4576243100dec1104b4cd]
[ 5601.146657]  ? vcpu_run+0x4d/0x280 [kvm 2979fa2240d2f299e1c4576243100dec1104b4cd]
[ 5601.159108]  kvm_arch_vcpu_ioctl_run+0x224/0x680 [kvm 2979fa2240d2f299e1c4576243100dec1104b4cd]
[ 5601.175943]  kvm_vcpu_ioctl+0x238/0x750 [kvm 2979fa2240d2f299e1c4576243100dec1104b4cd]
[ 5601.188912]  ? __ct_user_exit+0xd1/0x120 
[ 5601.197305]  ? __lock_release.isra.0+0x61/0x160
[ 5601.206432]  ? __ct_user_exit+0xd1/0x120
[ 5601.214791]  __x64_sys_ioctl+0x98/0xd0
[ 5601.222980]  x64_sys_call+0x1222/0x2040
[ 5601.231268]  do_syscall_64+0xc3/0x220
[ 5601.239321]  entry_SYSCALL_64_after_hwframe+0x77/0x7f
[ 5601.248913] RIP: 0033:0x7f9597524ded
[ 5601.256781] Code: 04 25 28 00 00 00 48 89 45 c8 31 c0 48 8d 45 10 c7 45 b0 10 00 00 00 48 89 45 b8 48 8d 45 d0 48 89 45 c0 b8 10 00 00 00 0f 05 <89> c2 3d 00 f0 ff ff 77 1a 48 8b 45 c8 64 48 2b 04 25 28 00 00 00
[ 5601.288181] RSP: 002b:00007ffd117315c0 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[ 5601.300843] RAX: ffffffffffffffda RBX: 00000000108a32a0 RCX: 00007f9597524ded
[ 5601.313108] RDX: 0000000000000000 RSI: 000000000000ae80 RDI: 0000000000000005
[ 5601.325411] RBP: 00007ffd11731610 R08: 0000000000422078 R09: 0000000000428e48
[ 5601.337721] R10: 0000000000000001 R11: 0000000000000246 R12: 00000000108a54a0
[ 5601.349965] R13: 0000000000000000 R14: 0000000000434e00 R15: 00007f95977eb000
[ 5601.362131]  </TASK>


> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index 268959d0f74f..8912cb6d5bc2 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -16,7 +16,11 @@ struct kvm_tdx {
>  	unsigned long tdr_pa;
>  	unsigned long *tdcs_pa;
>  
> +	u64 attributes;
> +	u64 xfam;
>  	int hkid;
> +
> +	u64 tsc_offset;
>  };

> 
>
Tony Lindgren Sept. 2, 2024, 10:31 a.m. UTC | #4
On Thu, Aug 29, 2024 at 02:27:56PM +0800, Yan Zhao wrote:
> On Mon, Aug 12, 2024 at 03:48:09PM -0700, Rick Edgecombe wrote:
> > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > 
> ...
> > +static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
> > +{
...

> > +	kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
> > +	kvm_tdx->attributes = td_params->attributes;
> > +	kvm_tdx->xfam = td_params->xfam;
> > +
> > +	if (td_params->exec_controls & TDX_EXEC_CONTROL_MAX_GPAW)
> > +		kvm->arch.gfn_direct_bits = gpa_to_gfn(BIT_ULL(51));
> > +	else
> > +		kvm->arch.gfn_direct_bits = gpa_to_gfn(BIT_ULL(47));
> > +
> Could we introduce a initialized field in struct kvm_tdx and set it true
> here? e.g
> +       kvm_tdx->initialized = true;
> 
> Then reject vCPU creation in tdx_vcpu_create() before KVM_TDX_INIT_VM is
> executed successfully? e.g.
> 
> @@ -584,6 +589,9 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
>         struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
>         struct vcpu_tdx *tdx = to_tdx(vcpu);
> 
> +       if (!kvm_tdx->initialized)
> +               return -EIO;
> +
>         /* TDX only supports x2APIC, which requires an in-kernel local APIC. */
>         if (!vcpu->arch.apic)
>                 return -EINVAL;
> 
> Allowing vCPU creation only after TD is initialized can prevent unexpected
> userspace access to uninitialized TD primitives.

Makes sense to check for an initialized TD before allowing other calls. Maybe
the check is needed in other places too, in addition to tdx_vcpu_create().

How about just a function to check for one or more of the already existing
initialized struct kvm_tdx values?

Regards,

Tony
Chenyi Qiang Sept. 3, 2024, 2:58 a.m. UTC | #5
On 8/13/2024 6:48 AM, Rick Edgecombe wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> After the crypto-protection key has been configured, TDX requires a
> VM-scope initialization as a step of creating the TDX guest.  This
> "per-VM" TDX initialization does the global configurations/features that
> the TDX guest can support, such as guest's CPUIDs (emulated by the TDX
> module), the maximum number of vcpus etc.
> 
> This "per-VM" TDX initialization must be done before any "vcpu-scope" TDX
> initialization.  To match this better, require the KVM_TDX_INIT_VM IOCTL()
> to be done before KVM creates any vcpus.
> 
> Co-developed-by: Xiaoyao Li <xiaoyao.li@intel.com>
> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> ---
> uAPI breakout v1:
>  - Drop TDX_TD_XFAM_CET and use XFEATURE_MASK_CET_{USER, KERNEL}.
>  - Update for the wrapper functions for SEAMCALLs. (Sean)
>  - Move gfn_shared_mask settings into this patch due to MMU section move
>  - Fix bisectability issues in headers (Kai)
>  - Updates from seamcall overhaul (Kai)
>  - Allow userspace configure xfam directly
>  - Check if user sets non-configurable bits in CPUIDs
>  - Rename error->hw_error
>  - Move code change to tdx_module_setup() to __tdx_bringup() due to
>    initializing is done in post hardware_setup() now and
>    tdx_module_setup() is removed.  Remove the code to use API to read
>    global metadata but use exported 'struct tdx_sysinfo' pointer.
>  - Replace 'tdx_info->nr_tdcs_pages' with a wrapper
>    tdx_sysinfo_nr_tdcs_pages() because the 'struct tdx_sysinfo' doesn't
>    have nr_tdcs_pages directly.
>  - Replace tdx_info->max_vcpus_per_td with the new exported pointer in
>    tdx_vm_init().
>  - Decrease the reserved space for struct kvm_tdx_init_vm (Kai)
>  - Use sizeof_field() for struct kvm_tdx_init_vm cpuids (Tony)
>  - No need to init init_vm, it gets copied over in tdx_td_init() (Chao)
>  - Use kmalloc() instead of () kzalloc for init_vm in tdx_td_init() (Chao)
>  - Add more line breaks to tdx_td_init() to make code easier to read (Tony)
>  - Clarify patch description (Kai)
> 
> v19:
>  - Check NO_RBP_MOD of feature0 and set it
>  - Update the comment for PT and CET
> 
> v18:
>  - remove the change of tools/arch/x86/include/uapi/asm/kvm.h
>  - typo in comment. sha348 => sha384
>  - updated comment in setup_tdparams_xfam()
>  - fix setup_tdparams_xfam() to use init_vm instead of td_params
> 
> v16:
>  - Removed AMX check as the KVM upstream supports AMX.
>  - Added CET flag to guest supported xss
> ---
>  arch/x86/include/uapi/asm/kvm.h |  24 ++++
>  arch/x86/kvm/cpuid.c            |   7 +
>  arch/x86/kvm/cpuid.h            |   2 +
>  arch/x86/kvm/vmx/tdx.c          | 237 ++++++++++++++++++++++++++++++--
>  arch/x86/kvm/vmx/tdx.h          |   4 +
>  arch/x86/kvm/vmx/tdx_ops.h      |  12 ++
>  6 files changed, 276 insertions(+), 10 deletions(-)
> 

...

> +static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
> +			 u64 *seamcall_err)
>  {
>  	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
>  	cpumask_var_t packages;
> @@ -427,8 +547,9 @@ static int __tdx_td_init(struct kvm *kvm)
>  	unsigned long tdr_pa = 0;
>  	unsigned long va;
>  	int ret, i;
> -	u64 err;
> +	u64 err, rcx;
>  
> +	*seamcall_err = 0;
>  	ret = tdx_guest_keyid_alloc();
>  	if (ret < 0)
>  		return ret;
> @@ -543,10 +664,23 @@ static int __tdx_td_init(struct kvm *kvm)
>  		}
>  	}
>  
> -	/*
> -	 * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires a dedicated
> -	 * ioctl() to define the configure CPUID values for the TD.
> -	 */
> +	err = tdh_mng_init(kvm_tdx, __pa(td_params), &rcx);
> +	if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
> +		/*
> +		 * Because a user gives operands, don't warn.
> +		 * Return a hint to the user because it's sometimes hard for the
> +		 * user to figure out which operand is invalid.  SEAMCALL status
> +		 * code includes which operand caused invalid operand error.
> +		 */
> +		*seamcall_err = err;

I'm wondering if we could return or output more hints (i.e. the value of
rcx) in the case of an invalid operand. For example, if the seamcall returns
with INVALID_OPERAND_CPUID_CONFIG, rcx will contain the CPUID
leaf/sub-leaf info.

> +		ret = -EINVAL;
> +		goto teardown;
> +	} else if (WARN_ON_ONCE(err)) {
> +		pr_tdx_error_1(TDH_MNG_INIT, err, rcx);
> +		ret = -EIO;
> +		goto teardown;
> +	}
> +
>  	return 0;
>  
>  	/*
> @@ -592,6 +726,86 @@ static int __tdx_td_init(struct kvm *kvm)
>  	return ret;
>  }
>  
> +static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
> +{
> +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> +	struct kvm_tdx_init_vm *init_vm;
> +	struct td_params *td_params = NULL;
> +	int ret;
> +
> +	BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
> +	BUILD_BUG_ON(sizeof(struct td_params) != 1024);
> +
> +	if (is_hkid_assigned(kvm_tdx))
> +		return -EINVAL;
> +
> +	if (cmd->flags)
> +		return -EINVAL;
> +
> +	init_vm = kmalloc(sizeof(*init_vm) +
> +			  sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
> +			  GFP_KERNEL);
> +	if (!init_vm)
> +		return -ENOMEM;
> +
> +	if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +
> +	if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
> +		ret = -E2BIG;
> +		goto out;
> +	}
> +
> +	if (copy_from_user(init_vm->cpuid.entries,
> +			   u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
> +			   flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +
> +	if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (init_vm->cpuid.padding) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
> +	if (!td_params) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	ret = setup_tdparams(kvm, td_params, init_vm);
> +	if (ret)
> +		goto out;
> +
> +	ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
> +	if (ret)
> +		goto out;
> +
> +	kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
> +	kvm_tdx->attributes = td_params->attributes;
> +	kvm_tdx->xfam = td_params->xfam;
> +
> +	if (td_params->exec_controls & TDX_EXEC_CONTROL_MAX_GPAW)
> +		kvm->arch.gfn_direct_bits = gpa_to_gfn(BIT_ULL(51));
> +	else
> +		kvm->arch.gfn_direct_bits = gpa_to_gfn(BIT_ULL(47));
> +
> +out:
> +	/* kfree() accepts NULL. */
> +	kfree(init_vm);
> +	kfree(td_params);
> +
> +	return ret;
> +}
> +
>  int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
>  {
>  	struct kvm_tdx_cmd tdx_cmd;
> @@ -613,6 +827,9 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
>  	case KVM_TDX_CAPABILITIES:
>  		r = tdx_get_capabilities(&tdx_cmd);
>  		break;
> +	case KVM_TDX_INIT_VM:
> +		r = tdx_td_init(kvm, &tdx_cmd);
> +		break;
>  	default:
>  		r = -EINVAL;
>  		goto out;
> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index 268959d0f74f..8912cb6d5bc2 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -16,7 +16,11 @@ struct kvm_tdx {
>  	unsigned long tdr_pa;
>  	unsigned long *tdcs_pa;
>  
> +	u64 attributes;
> +	u64 xfam;
>  	int hkid;
> +
> +	u64 tsc_offset;
>  };
>  
>  struct vcpu_tdx {
> diff --git a/arch/x86/kvm/vmx/tdx_ops.h b/arch/x86/kvm/vmx/tdx_ops.h
> index 3f64c871a3f2..0363d8544f42 100644
> --- a/arch/x86/kvm/vmx/tdx_ops.h
> +++ b/arch/x86/kvm/vmx/tdx_ops.h
> @@ -399,4 +399,16 @@ static inline u64 tdh_vp_wr(struct vcpu_tdx *tdx, u64 field, u64 val, u64 mask)
>  	return seamcall(TDH_VP_WR, &in);
>  }
>  
> +static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 field)
> +{
> +	u64 err, data;
> +
> +	err = tdh_mng_rd(kvm_tdx, TDCS_EXEC(field), &data);
> +	if (unlikely(err)) {
> +		pr_err("TDH_MNG_RD[EXEC.0x%x] failed: 0x%llx\n", field, err);
> +		return 0;
> +	}
> +	return data;
> +}
> +
>  #endif /* __KVM_X86_TDX_OPS_H */
Tony Lindgren Sept. 3, 2024, 5:44 a.m. UTC | #6
On Tue, Sep 03, 2024 at 10:58:11AM +0800, Chenyi Qiang wrote:
> On 8/13/2024 6:48 AM, Rick Edgecombe wrote:
> > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > @@ -543,10 +664,23 @@ static int __tdx_td_init(struct kvm *kvm)
> >  		}
> >  	}
> >  
> > -	/*
> > -	 * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires a dedicated
> > -	 * ioctl() to define the configure CPUID values for the TD.
> > -	 */
> > +	err = tdh_mng_init(kvm_tdx, __pa(td_params), &rcx);
> > +	if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
> > +		/*
> > +		 * Because a user gives operands, don't warn.
> > +		 * Return a hint to the user because it's sometimes hard for the
> > +		 * user to figure out which operand is invalid.  SEAMCALL status
> > +		 * code includes which operand caused invalid operand error.
> > +		 */
> > +		*seamcall_err = err;
> 
> I'm wondering if we could return or output more hint (i.e. the value of
> rcx) in the case of invalid operand. For example, if seamcall returns
> with INVALID_OPERAND_CPUID_CONFIG, rcx will contain the CPUID
> leaf/sub-leaf info.

Printing a descriptive error here would be nice when things go wrong.
Probably no need to return that information.

Sounds like you have a patch already in mind though :) Care to post a
patch against the current kvm-coco branch? If not, I can do it after all
the obvious comment changes are out of the way.

Regards,

Tony
Chenyi Qiang Sept. 3, 2024, 8:04 a.m. UTC | #7
On 9/3/2024 1:44 PM, Tony Lindgren wrote:
> On Tue, Sep 03, 2024 at 10:58:11AM +0800, Chenyi Qiang wrote:
>> On 8/13/2024 6:48 AM, Rick Edgecombe wrote:
>>> From: Isaku Yamahata <isaku.yamahata@intel.com>
>>> @@ -543,10 +664,23 @@ static int __tdx_td_init(struct kvm *kvm)
>>>  		}
>>>  	}
>>>  
>>> -	/*
>>> -	 * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires a dedicated
>>> -	 * ioctl() to define the configure CPUID values for the TD.
>>> -	 */
>>> +	err = tdh_mng_init(kvm_tdx, __pa(td_params), &rcx);
>>> +	if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
>>> +		/*
>>> +		 * Because a user gives operands, don't warn.
>>> +		 * Return a hint to the user because it's sometimes hard for the
>>> +		 * user to figure out which operand is invalid.  SEAMCALL status
>>> +		 * code includes which operand caused invalid operand error.
>>> +		 */
>>> +		*seamcall_err = err;
>>
>> I'm wondering if we could return or output more hint (i.e. the value of
>> rcx) in the case of invalid operand. For example, if seamcall returns
>> with INVALID_OPERAND_CPUID_CONFIG, rcx will contain the CPUID
>> leaf/sub-leaf info.
> 
> Printing a decriptive error here would be nice when things go wrong.
> Probably no need to return that information.
> 
> Sounds like you have a patch already in mind though :) Care to post a
> patch against the current kvm-coco branch? If not, I can do it after all
> the obvious comment changes are out of the way.

According to the comment above, this patch wants to return the hint to
user as the user gives operands. I'm still uncertain if we should follow
this to return value in some way or special-case the
INVALID_OPERAND_CPUID_CONFIG like:

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index c00c73b2ad4c..dd6e3149ff5a 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -2476,8 +2476,14 @@ static int __tdx_td_init(struct kvm *kvm, struct
td_params *td_params,
                 * Return a hint to the user because it's sometimes hard
for the
                 * user to figure out which operand is invalid.
SEAMCALL status
                 * code includes which operand caused invalid operand error.
+                *
+                * TDX_OPERAND_INVALID_CPUID_CONFIG contains more info
+                * in rcx (i.e. leaf/sub-leaf), warn it to help figure
+                * out the invalid CPUID config.
                 */
                *seamcall_err = err;
+               if (err == (TDX_OPERAND_INVALID |
TDX_OPERAND_ID_CPUID_CONFIG))
+                       pr_tdx_error_1(TDH_MNG_INIT, err, rcx);
                ret = -EINVAL;
                goto teardown;
        } else if (WARN_ON_ONCE(err)) {
diff --git a/arch/x86/kvm/vmx/tdx_errno.h b/arch/x86/kvm/vmx/tdx_errno.h
index f9dbb3a065cc..311c3f03d398 100644
--- a/arch/x86/kvm/vmx/tdx_errno.h
+++ b/arch/x86/kvm/vmx/tdx_errno.h
@@ -30,6 +30,7 @@
  * detail information
  */
 #define TDX_OPERAND_ID_RCX                     0x01
+#define TDX_OPERAND_ID_CPUID_CONFIG            0x45
 #define TDX_OPERAND_ID_TDR                     0x80
 #define TDX_OPERAND_ID_SEPT                    0x92
 #define TDX_OPERAND_ID_TD_EPOCH                        0xa9

> 
> Regards,
> 
> Tony
Yan Zhao Sept. 5, 2024, 6:59 a.m. UTC | #8
On Mon, Sep 02, 2024 at 01:31:29PM +0300, Tony Lindgren wrote:
> On Thu, Aug 29, 2024 at 02:27:56PM +0800, Yan Zhao wrote:
> > On Mon, Aug 12, 2024 at 03:48:09PM -0700, Rick Edgecombe wrote:
> > > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > > 
> > ...
> > > +static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
> > > +{
> ...
> 
> > > +	kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
> > > +	kvm_tdx->attributes = td_params->attributes;
> > > +	kvm_tdx->xfam = td_params->xfam;
> > > +
> > > +	if (td_params->exec_controls & TDX_EXEC_CONTROL_MAX_GPAW)
> > > +		kvm->arch.gfn_direct_bits = gpa_to_gfn(BIT_ULL(51));
> > > +	else
> > > +		kvm->arch.gfn_direct_bits = gpa_to_gfn(BIT_ULL(47));
> > > +
> > Could we introduce a initialized field in struct kvm_tdx and set it true
> > here? e.g
> > +       kvm_tdx->initialized = true;
> > 
> > Then reject vCPU creation in tdx_vcpu_create() before KVM_TDX_INIT_VM is
> > executed successfully? e.g.
> > 
> > @@ -584,6 +589,9 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
> >         struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
> >         struct vcpu_tdx *tdx = to_tdx(vcpu);
> > 
> > +       if (!kvm_tdx->initialized)
> > +               return -EIO;
> > +
> >         /* TDX only supports x2APIC, which requires an in-kernel local APIC. */
> >         if (!vcpu->arch.apic)
> >                 return -EINVAL;
> > 
> > Allowing vCPU creation only after TD is initialized can prevent unexpected
> > userspace access to uninitialized TD primitives.
> 
> Makes sense to check for initialized TD before allowing other calls. Maybe
> the check is needed in other places too in additoin to the tdx_vcpu_create().
Do you mean in places checking is_hkid_assigned()?
> 
> How about just a function to check for one or more of the already existing
> initialized struct kvm_tdx values?
Instead of checking multiple individual fields in kvm_tdx or vcpu_tdx, could we
introduce a single state field in the two structures and utilize a state machine
for the checks (as Chao Gao pointed out at [1])?

e.g.
Now TD can have 5 states: (1)created, (2)initialized, (3)finalized,
                          (4)destroyed, (5)freed.
Each vCPU has 3 states: (1) created, (2) initialized, (3)freed

All the states are updated by a user operation (e.g. KVM_TDX_INIT_VM,
KVM_TDX_FINALIZE_VM, KVM_TDX_INIT_VCPU) or a x86 op (e.g. vm_init, vm_destroy,
vm_free, vcpu_create, vcpu_free).


     TD                                   vCPU
(1) created(set in op vm_init)
(2) initialized
(indicate tdr_pa != 0 && HKID assigned)

                                          (1) created (set in op vcpu_create)

                                          (2) initialized

                                    (can call INIT_MEM_REGION, GET_CPUID here)


(3) finalized

                                 (tdx_vcpu_run(), tdx_handle_exit() can be here)


(4) destroyed (indicate HKID released)

                                         (3) freed

(5) freed


[1] https://lore.kernel.org/kvm/ZfvI8t7SlfIsxbmT@chao-email/#t
Tony Lindgren Sept. 5, 2024, 9:27 a.m. UTC | #9
On Thu, Sep 05, 2024 at 02:59:25PM +0800, Yan Zhao wrote:
> On Mon, Sep 02, 2024 at 01:31:29PM +0300, Tony Lindgren wrote:
> > On Thu, Aug 29, 2024 at 02:27:56PM +0800, Yan Zhao wrote:
> > > On Mon, Aug 12, 2024 at 03:48:09PM -0700, Rick Edgecombe wrote:
> > > > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > > > 
> > > ...
> > > > +static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
> > > > +{
> > ...
> > 
> > > > +	kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
> > > > +	kvm_tdx->attributes = td_params->attributes;
> > > > +	kvm_tdx->xfam = td_params->xfam;
> > > > +
> > > > +	if (td_params->exec_controls & TDX_EXEC_CONTROL_MAX_GPAW)
> > > > +		kvm->arch.gfn_direct_bits = gpa_to_gfn(BIT_ULL(51));
> > > > +	else
> > > > +		kvm->arch.gfn_direct_bits = gpa_to_gfn(BIT_ULL(47));
> > > > +
> > > Could we introduce a initialized field in struct kvm_tdx and set it true
> > > here? e.g
> > > +       kvm_tdx->initialized = true;
> > > 
> > > Then reject vCPU creation in tdx_vcpu_create() before KVM_TDX_INIT_VM is
> > > executed successfully? e.g.
> > > 
> > > @@ -584,6 +589,9 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
> > >         struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
> > >         struct vcpu_tdx *tdx = to_tdx(vcpu);
> > > 
> > > +       if (!kvm_tdx->initialized)
> > > +               return -EIO;
> > > +
> > >         /* TDX only supports x2APIC, which requires an in-kernel local APIC. */
> > >         if (!vcpu->arch.apic)
> > >                 return -EINVAL;
> > > 
> > > Allowing vCPU creation only after TD is initialized can prevent unexpected
> > > userspace access to uninitialized TD primitives.
> > 
> > Makes sense to check for initialized TD before allowing other calls. Maybe
> > the check is needed in other places too in additoin to the tdx_vcpu_create().
> Do you mean in places checking is_hkid_assigned()?

Sounds like the state needs to be checked in multiple places to handle
out-of-order ioctls, so that's not enough.

> > How about just a function to check for one or more of the already existing
> > initialized struct kvm_tdx values?
> Instead of checking multiple individual fields in kvm_tdx or vcpu_tdx, could we
> introduce a single state field in the two strutures and utilize a state machine
> for check (as Chao Gao pointed out at [1]) ?

OK

> e.g.
> Now TD can have 5 states: (1)created, (2)initialized, (3)finalized,
>                           (4)destroyed, (5)freed.
> Each vCPU has 3 states: (1) created, (2) initialized, (3)freed
> 
> All the states are updated by a user operation (e.g. KVM_TDX_INIT_VM,
> KVM_TDX_FINALIZE_VM, KVM_TDX_INIT_VCPU) or a x86 op (e.g. vm_init, vm_destroy,
> vm_free, vcpu_create, vcpu_free).
> 
> 
>      TD                                   vCPU
> (1) created(set in op vm_init)
> (2) initialized
> (indicate tdr_pa != 0 && HKID assigned)
> 
>                                           (1) created (set in op vcpu_create)
> 
>                                           (2) initialized
> 
>                                     (can call INIT_MEM_REGION, GET_CPUID here)
> 
> 
> (3) finalized
> 
>                                  (tdx_vcpu_run(), tdx_handle_exit() can be here)
> 
> 
> (4) destroyed (indicate HKID released)
> 
>                                          (3) freed
> 
> (5) freed

So an enum for the TD state, and also for the vCPU state?

Regards,

Tony
 
> [1] https://lore.kernel.org/kvm/ZfvI8t7SlfIsxbmT@chao-email/#t
Tony Lindgren Sept. 5, 2024, 9:31 a.m. UTC | #10
On Tue, Sep 03, 2024 at 04:04:47PM +0800, Chenyi Qiang wrote:
> 
> 
> On 9/3/2024 1:44 PM, Tony Lindgren wrote:
> > On Tue, Sep 03, 2024 at 10:58:11AM +0800, Chenyi Qiang wrote:
> >> On 8/13/2024 6:48 AM, Rick Edgecombe wrote:
> >>> From: Isaku Yamahata <isaku.yamahata@intel.com>
> >>> @@ -543,10 +664,23 @@ static int __tdx_td_init(struct kvm *kvm)
> >>>  		}
> >>>  	}
> >>>  
> >>> -	/*
> >>> -	 * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires a dedicated
> >>> -	 * ioctl() to define the configure CPUID values for the TD.
> >>> -	 */
> >>> +	err = tdh_mng_init(kvm_tdx, __pa(td_params), &rcx);
> >>> +	if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
> >>> +		/*
> >>> +		 * Because a user gives operands, don't warn.
> >>> +		 * Return a hint to the user because it's sometimes hard for the
> >>> +		 * user to figure out which operand is invalid.  SEAMCALL status
> >>> +		 * code includes which operand caused invalid operand error.
> >>> +		 */
> >>> +		*seamcall_err = err;
> >>
> >> I'm wondering if we could return or output more hint (i.e. the value of
> >> rcx) in the case of invalid operand. For example, if seamcall returns
> >> with INVALID_OPERAND_CPUID_CONFIG, rcx will contain the CPUID
> >> leaf/sub-leaf info.
> > 
> > Printing a decriptive error here would be nice when things go wrong.
> > Probably no need to return that information.
> > 
> > Sounds like you have a patch already in mind though :) Care to post a
> > patch against the current kvm-coco branch? If not, I can do it after all
> > the obvious comment changes are out of the way.
> 
> According to the comment above, this patch wants to return the hint to
> user as the user gives operands. I'm still uncertain if we should follow
> this to return value in some way or special-case the
> INVALID_OPERAND_CPUID_CONFIG like:
> 
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index c00c73b2ad4c..dd6e3149ff5a 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -2476,8 +2476,14 @@ static int __tdx_td_init(struct kvm *kvm, struct
> td_params *td_params,
>                  * Return a hint to the user because it's sometimes hard
> for the
>                  * user to figure out which operand is invalid.
> SEAMCALL status
>                  * code includes which operand caused invalid operand error.
> +                *
> +                * TDX_OPERAND_INVALID_CPUID_CONFIG contains more info
> +                * in rcx (i.e. leaf/sub-leaf), warn it to help figure
> +                * out the invalid CPUID config.
>                  */
>                 *seamcall_err = err;
> +               if (err == (TDX_OPERAND_INVALID |
> TDX_OPERAND_ID_CPUID_CONFIG))
> +                       pr_tdx_error_1(TDH_MNG_INIT, err, rcx);
>                 ret = -EINVAL;
>                 goto teardown;
>         } else if (WARN_ON_ONCE(err)) {
> diff --git a/arch/x86/kvm/vmx/tdx_errno.h b/arch/x86/kvm/vmx/tdx_errno.h
> index f9dbb3a065cc..311c3f03d398 100644
> --- a/arch/x86/kvm/vmx/tdx_errno.h
> +++ b/arch/x86/kvm/vmx/tdx_errno.h
> @@ -30,6 +30,7 @@
>   * detail information
>   */
>  #define TDX_OPERAND_ID_RCX                     0x01
> +#define TDX_OPERAND_ID_CPUID_CONFIG            0x45
>  #define TDX_OPERAND_ID_TDR                     0x80
>  #define TDX_OPERAND_ID_SEPT                    0x92
>  #define TDX_OPERAND_ID_TD_EPOCH                        0xa9
> 

OK yes, that should take care of the issue. I doubt that this can be
automatically handled by the caller even if a better error code
was returned.

Regards,

Tony
Yan Zhao Sept. 6, 2024, 4:05 a.m. UTC | #11
On Thu, Sep 05, 2024 at 12:27:54PM +0300, Tony Lindgren wrote:
> On Thu, Sep 05, 2024 at 02:59:25PM +0800, Yan Zhao wrote:
> > On Mon, Sep 02, 2024 at 01:31:29PM +0300, Tony Lindgren wrote:
> > > On Thu, Aug 29, 2024 at 02:27:56PM +0800, Yan Zhao wrote:
> > > > On Mon, Aug 12, 2024 at 03:48:09PM -0700, Rick Edgecombe wrote:
> > > > > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > > > > 
> > > > ...
> > > > > +static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
> > > > > +{
> > > ...
> > > 
> > > > > +	kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
> > > > > +	kvm_tdx->attributes = td_params->attributes;
> > > > > +	kvm_tdx->xfam = td_params->xfam;
> > > > > +
> > > > > +	if (td_params->exec_controls & TDX_EXEC_CONTROL_MAX_GPAW)
> > > > > +		kvm->arch.gfn_direct_bits = gpa_to_gfn(BIT_ULL(51));
> > > > > +	else
> > > > > +		kvm->arch.gfn_direct_bits = gpa_to_gfn(BIT_ULL(47));
> > > > > +
> > > > Could we introduce a initialized field in struct kvm_tdx and set it true
> > > > here? e.g
> > > > +       kvm_tdx->initialized = true;
> > > > 
> > > > Then reject vCPU creation in tdx_vcpu_create() before KVM_TDX_INIT_VM is
> > > > executed successfully? e.g.
> > > > 
> > > > @@ -584,6 +589,9 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
> > > >         struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
> > > >         struct vcpu_tdx *tdx = to_tdx(vcpu);
> > > > 
> > > > +       if (!kvm_tdx->initialized)
> > > > +               return -EIO;
> > > > +
> > > >         /* TDX only supports x2APIC, which requires an in-kernel local APIC. */
> > > >         if (!vcpu->arch.apic)
> > > >                 return -EINVAL;
> > > > 
> > > > Allowing vCPU creation only after TD is initialized can prevent unexpected
> > > > userspace access to uninitialized TD primitives.
> > > 
> > > Makes sense to check for initialized TD before allowing other calls. Maybe
> > > the check is needed in other places too in additoin to the tdx_vcpu_create().
> > Do you mean in places checking is_hkid_assigned()?
> 
> Sounds like the state needs to be checked in multiple places to handle
> out-of-order ioctls to that's not enough.
> 
> > > How about just a function to check for one or more of the already existing
> > > initialized struct kvm_tdx values?
> > Instead of checking multiple individual fields in kvm_tdx or vcpu_tdx, could we
> > introduce a single state field in the two strutures and utilize a state machine
> > for check (as Chao Gao pointed out at [1]) ?
> 
> OK
> 
> > e.g.
> > Now TD can have 5 states: (1)created, (2)initialized, (3)finalized,
> >                           (4)destroyed, (5)freed.
> > Each vCPU has 3 states: (1) created, (2) initialized, (3)freed
> > 
> > All the states are updated by a user operation (e.g. KVM_TDX_INIT_VM,
> > KVM_TDX_FINALIZE_VM, KVM_TDX_INIT_VCPU) or a x86 op (e.g. vm_init, vm_destroy,
> > vm_free, vcpu_create, vcpu_free).
> > 
> > 
> >      TD                                   vCPU
> > (1) created(set in op vm_init)
> > (2) initialized
> > (indicate tdr_pa != 0 && HKID assigned)
> > 
> >                                           (1) created (set in op vcpu_create)
> > 
> >                                           (2) initialized
> > 
> >                                     (can call INIT_MEM_REGION, GET_CPUID here)
> > 
> > 
> > (3) finalized
> > 
> >                                  (tdx_vcpu_run(), tdx_handle_exit() can be here)
> > 
> > 
> > (4) destroyed (indicate HKID released)
> > 
> >                                          (3) freed
> > 
> > (5) freed
> 
> So an enum for the TD state, and also for the vCPU state?

A state for TD, and a state for each vCPU.
Each vCPU needs to check TD state and vCPU state of itself for vCPU state
transition.

Does it make sense?

>  
> > [1] https://lore.kernel.org/kvm/ZfvI8t7SlfIsxbmT@chao-email/#t
Tony Lindgren Sept. 6, 2024, 4:32 a.m. UTC | #12
On Fri, Sep 06, 2024 at 12:05:41PM +0800, Yan Zhao wrote:
> On Thu, Sep 05, 2024 at 12:27:54PM +0300, Tony Lindgren wrote:
> > On Thu, Sep 05, 2024 at 02:59:25PM +0800, Yan Zhao wrote:
> > > On Mon, Sep 02, 2024 at 01:31:29PM +0300, Tony Lindgren wrote:
> > > > On Thu, Aug 29, 2024 at 02:27:56PM +0800, Yan Zhao wrote:
> > > > > On Mon, Aug 12, 2024 at 03:48:09PM -0700, Rick Edgecombe wrote:
> > > > > > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > > > > > 
> > > > > ...
> > > > > > +static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
> > > > > > +{
> > > > ...
> > > > 
> > > > > > +	kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
> > > > > > +	kvm_tdx->attributes = td_params->attributes;
> > > > > > +	kvm_tdx->xfam = td_params->xfam;
> > > > > > +
> > > > > > +	if (td_params->exec_controls & TDX_EXEC_CONTROL_MAX_GPAW)
> > > > > > +		kvm->arch.gfn_direct_bits = gpa_to_gfn(BIT_ULL(51));
> > > > > > +	else
> > > > > > +		kvm->arch.gfn_direct_bits = gpa_to_gfn(BIT_ULL(47));
> > > > > > +
> > > > > Could we introduce a initialized field in struct kvm_tdx and set it true
> > > > > here? e.g
> > > > > +       kvm_tdx->initialized = true;
> > > > > 
> > > > > Then reject vCPU creation in tdx_vcpu_create() before KVM_TDX_INIT_VM is
> > > > > executed successfully? e.g.
> > > > > 
> > > > > @@ -584,6 +589,9 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
> > > > >         struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
> > > > >         struct vcpu_tdx *tdx = to_tdx(vcpu);
> > > > > 
> > > > > +       if (!kvm_tdx->initialized)
> > > > > +               return -EIO;
> > > > > +
> > > > >         /* TDX only supports x2APIC, which requires an in-kernel local APIC. */
> > > > >         if (!vcpu->arch.apic)
> > > > >                 return -EINVAL;
> > > > > 
> > > > > Allowing vCPU creation only after TD is initialized can prevent unexpected
> > > > > userspace access to uninitialized TD primitives.
> > > > 
> > > > Makes sense to check for initialized TD before allowing other calls. Maybe
> > > > the check is needed in other places too in additoin to the tdx_vcpu_create().
> > > Do you mean in places checking is_hkid_assigned()?
> > 
> > Sounds like the state needs to be checked in multiple places to handle
> > out-of-order ioctls to that's not enough.
> > 
> > > > How about just a function to check for one or more of the already existing
> > > > initialized struct kvm_tdx values?
> > > Instead of checking multiple individual fields in kvm_tdx or vcpu_tdx, could we
> > > introduce a single state field in the two strutures and utilize a state machine
> > > for check (as Chao Gao pointed out at [1]) ?
> > 
> > OK
> > 
> > > e.g.
> > > Now TD can have 5 states: (1)created, (2)initialized, (3)finalized,
> > >                           (4)destroyed, (5)freed.
> > > Each vCPU has 3 states: (1) created, (2) initialized, (3)freed
> > > 
> > > All the states are updated by a user operation (e.g. KVM_TDX_INIT_VM,
> > > KVM_TDX_FINALIZE_VM, KVM_TDX_INIT_VCPU) or a x86 op (e.g. vm_init, vm_destroy,
> > > vm_free, vcpu_create, vcpu_free).
> > > 
> > > 
> > >      TD                                   vCPU
> > > (1) created(set in op vm_init)
> > > (2) initialized
> > > (indicate tdr_pa != 0 && HKID assigned)
> > > 
> > >                                           (1) created (set in op vcpu_create)
> > > 
> > >                                           (2) initialized
> > > 
> > >                                     (can call INIT_MEM_REGION, GET_CPUID here)
> > > 
> > > 
> > > (3) finalized
> > > 
> > >                                  (tdx_vcpu_run(), tdx_handle_exit() can be here)
> > > 
> > > 
> > > (4) destroyed (indicate HKID released)
> > > 
> > >                                          (3) freed
> > > 
> > > (5) freed
> > 
> > So an enum for the TD state, and also for the vCPU state?
> 
> A state for TD, and a state for each vCPU.
> Each vCPU needs to check TD state and vCPU state of itself for vCPU state
> transition.
> 
> Does it make sense?

That sounds good to me :)

Regards,

Tony
 
> > > [1] https://lore.kernel.org/kvm/ZfvI8t7SlfIsxbmT@chao-email/#t
Wang, Wei W Sept. 6, 2024, 1:52 p.m. UTC | #13
On Friday, September 6, 2024 12:33 PM, Tony Lindgren wrote:
> On Fri, Sep 06, 2024 at 12:05:41PM +0800, Yan Zhao wrote:
> > On Thu, Sep 05, 2024 at 12:27:54PM +0300, Tony Lindgren wrote:
> > > On Thu, Sep 05, 2024 at 02:59:25PM +0800, Yan Zhao wrote:
> > > > On Mon, Sep 02, 2024 at 01:31:29PM +0300, Tony Lindgren wrote:
> > > > > On Thu, Aug 29, 2024 at 02:27:56PM +0800, Yan Zhao wrote:
> > > > > > On Mon, Aug 12, 2024 at 03:48:09PM -0700, Rick Edgecombe wrote:
> > > > > > > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > > > > > >
> > > > > > ...
> > > > > > > +static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd
> > > > > > > +*cmd) {
> > > > > ...
> > > > >
> > > > > > > +	kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx,
> TD_TDCS_EXEC_TSC_OFFSET);
> > > > > > > +	kvm_tdx->attributes = td_params->attributes;
> > > > > > > +	kvm_tdx->xfam = td_params->xfam;
> > > > > > > +
> > > > > > > +	if (td_params->exec_controls &
> TDX_EXEC_CONTROL_MAX_GPAW)
> > > > > > > +		kvm->arch.gfn_direct_bits = gpa_to_gfn(BIT_ULL(51));
> > > > > > > +	else
> > > > > > > +		kvm->arch.gfn_direct_bits = gpa_to_gfn(BIT_ULL(47));
> > > > > > > +
> > > > > > Could we introduce a initialized field in struct kvm_tdx and
> > > > > > set it true here? e.g
> > > > > > +       kvm_tdx->initialized = true;
> > > > > >
> > > > > > Then reject vCPU creation in tdx_vcpu_create() before
> > > > > > KVM_TDX_INIT_VM is executed successfully? e.g.
> > > > > >
> > > > > > @@ -584,6 +589,9 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
> > > > > >         struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
> > > > > >         struct vcpu_tdx *tdx = to_tdx(vcpu);
> > > > > >
> > > > > > +       if (!kvm_tdx->initialized)
> > > > > > +               return -EIO;
> > > > > > +
> > > > > >         /* TDX only supports x2APIC, which requires an in-kernel local
> APIC. */
> > > > > >         if (!vcpu->arch.apic)
> > > > > >                 return -EINVAL;
> > > > > >
> > > > > > Allowing vCPU creation only after TD is initialized can
> > > > > > prevent unexpected userspace access to uninitialized TD primitives.
> > > > >
> > > > > Makes sense to check for initialized TD before allowing other
> > > > > calls. Maybe the check is needed in other places too in additoin to the
> tdx_vcpu_create().
> > > > Do you mean in places checking is_hkid_assigned()?
> > >
> > > Sounds like the state needs to be checked in multiple places to
> > > handle out-of-order ioctls to that's not enough.
> > >
> > > > > How about just a function to check for one or more of the
> > > > > already existing initialized struct kvm_tdx values?
> > > > Instead of checking multiple individual fields in kvm_tdx or
> > > > vcpu_tdx, could we introduce a single state field in the two
> > > > strutures and utilize a state machine for check (as Chao Gao pointed out
> at [1]) ?
> > >
> > > OK
> > >
> > > > e.g.
> > > > Now TD can have 5 states: (1)created, (2)initialized, (3)finalized,
> > > >                           (4)destroyed, (5)freed.
> > > > Each vCPU has 3 states: (1) created, (2) initialized, (3)freed
> > > >
> > > > All the states are updated by a user operation (e.g.
> > > > KVM_TDX_INIT_VM, KVM_TDX_FINALIZE_VM, KVM_TDX_INIT_VCPU) or
> a x86
> > > > op (e.g. vm_init, vm_destroy, vm_free, vcpu_create, vcpu_free).
> > > >
> > > >
> > > >      TD                                   vCPU
> > > > (1) created(set in op vm_init)
> > > > (2) initialized
> > > > (indicate tdr_pa != 0 && HKID assigned)
> > > >
> > > >                                           (1) created (set in op
> > > > vcpu_create)
> > > >
> > > >                                           (2) initialized
> > > >
> > > >                                     (can call INIT_MEM_REGION,
> > > > GET_CPUID here)
> > > >
> > > >
> > > > (3) finalized
> > > >
> > > >                                  (tdx_vcpu_run(),
> > > > tdx_handle_exit() can be here)
> > > >
> > > >
> > > > (4) destroyed (indicate HKID released)
> > > >
> > > >                                          (3) freed
> > > >
> > > > (5) freed
> > >
> > > So an enum for the TD state, and also for the vCPU state?
> >
> > A state for TD, and a state for each vCPU.
> > Each vCPU needs to check TD state and vCPU state of itself for vCPU
> > state transition.
> >
> > Does it make sense?
> 
> That sounds good to me :)

+1 sounds good.
I also thought about this. KVM could create a shadow of the TD and vCPU
states that are already defined and maintained by the TDX module.
This should also be more extensible for adding the TD migration support later
(compared to adding various Booleans).
Edgecombe, Rick P Oct. 1, 2024, 8:45 p.m. UTC | #14
On Thu, 2024-09-05 at 12:31 +0300, Tony Lindgren wrote:
> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index c00c73b2ad4c..dd6e3149ff5a 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -2476,8 +2476,14 @@ static int __tdx_td_init(struct kvm *kvm, struct
> > td_params *td_params,
> >                   * Return a hint to the user because it's sometimes hard
> > for the
> >                   * user to figure out which operand is invalid.
> > SEAMCALL status
> >                   * code includes which operand caused invalid operand
> > error.
> > +                *
> > +                * TDX_OPERAND_INVALID_CPUID_CONFIG contains more info
> > +                * in rcx (i.e. leaf/sub-leaf), warn it to help figure
> > +                * out the invalid CPUID config.
> >                   */
> >                  *seamcall_err = err;
> > +               if (err == (TDX_OPERAND_INVALID |
> > TDX_OPERAND_ID_CPUID_CONFIG))
> > +                       pr_tdx_error_1(TDH_MNG_INIT, err, rcx);
> >                  ret = -EINVAL;
> >                  goto teardown;

Currently we filter by supported CPUID bits. But if we drop that filter and just
allow the TDX module to reject (based on discussion
https://lore.kernel.org/kvm/CABgObfbyd-a_bD-3fKmF3jVgrTiCDa3SHmrmugRji8BB-vs5GA@mail.gmail.com)

...then I guess this could be useful for userspace debugging. I'd say let's
leave this for a follow on patch. It's not critical for now.
Edgecombe, Rick P Oct. 2, 2024, 11:39 p.m. UTC | #15
Xiaoyao,

On Mon, 2024-08-12 at 15:48 -0700, Rick Edgecombe wrote:
> +static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
> +				 struct td_params *td_params)
> +{
> +	const struct tdx_sysinfo_td_conf *td_conf = &tdx_sysinfo->td_conf;
> +	const struct kvm_tdx_cpuid_config *c;
> +	const struct kvm_cpuid_entry2 *entry;
> +	struct tdx_cpuid_value *value;
> +	int i;
> +
> +	/*
> +	 * td_params.cpuid_values: The number and the order of cpuid_value must
> +	 * be same to the one of struct tdsysinfo.{num_cpuid_config, cpuid_configs}
> +	 * It's assumed that td_params was zeroed.
> +	 */
> +	for (i = 0; i < td_conf->num_cpuid_config; i++) {
> +		c = &kvm_tdx_caps->cpuid_configs[i];
> +		entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
> +					      c->leaf, c->sub_leaf);
> +		if (!entry)
> +			continue;
> +
> +		/*
> +		 * Check the user input value doesn't set any non-configurable
> +		 * bits reported by kvm_tdx_caps.
> +		 */
> +		if ((entry->eax & c->eax) != entry->eax ||
> +		    (entry->ebx & c->ebx) != entry->ebx ||
> +		    (entry->ecx & c->ecx) != entry->ecx ||
> +		    (entry->edx & c->edx) != entry->edx)
> +			return -EINVAL;
> +
> +		value = &td_params->cpuid_values[i];
> +		value->eax = entry->eax;
> +		value->ebx = entry->ebx;
> +		value->ecx = entry->ecx;
> +		value->edx = entry->edx;
> +	}
> +
> +	return 0;
> +}

We agreed to let the TDX module reject CPUID bits that are not supported instead
of having KVM do it. While removing the conditional above I found that we actually
still need some filtering.

The problem is that the filtering here only filters bits for leafs that are in
kvm_tdx_caps, the other leafs are just ignored. But we can't pass those other
leafs to the TDX module for it to do verification on because the index they are
supposed to go in is determined by kvm_tdx_caps->cpuid_configs, so there is no
place to pass them.

So KVM still needs to make sure no leafs are provided that are not in
kvm_tdx_caps, otherwise it will accept bits from userspace and ignore them. It
turns out this is already happening because QEMU is not filtering the CPUID
leafs that it passes. After I changed KVM to reject the other leafs, I needed
the following QEMU change to not pass leafs not in tdx caps:

diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
index 29ff7d2f7e..990960ec27 100644
--- a/target/i386/kvm/tdx.c
+++ b/target/i386/kvm/tdx.c
@@ -648,22 +648,29 @@ static struct kvm_tdx_cpuid_config
*tdx_find_cpuid_config(uint32_t leaf, uint32_
 
 static void tdx_filter_cpuid(struct kvm_cpuid2 *cpuids)
 {
-    int i;
-    struct kvm_cpuid_entry2 *e;
+    int i, dest_cnt = 0;
+    struct kvm_cpuid_entry2 *src, *dest;
     struct kvm_tdx_cpuid_config *config;
 
     for (i = 0; i < cpuids->nent; i++) {
-        e = cpuids->entries + i;
-        config = tdx_find_cpuid_config(e->function, e->index);
+        src = cpuids->entries + i;
+        config = tdx_find_cpuid_config(src->function, src->index);
         if (!config) {
             continue;
         }
+        dest = cpuids->entries + dest_cnt;
+
+        dest->function = src->function;
+        dest->index = src->index;
+        dest->flags = src->flags;
+        dest->eax = src->eax & config->eax;
+        dest->ebx = src->ebx & config->ebx;
+        dest->ecx = src->ecx & config->ecx;
+        dest->edx = src->edx & config->edx;
 
-        e->eax &= config->eax;
-        e->ebx &= config->ebx;
-        e->ecx &= config->ecx;
-        e->edx &= config->edx;
+        dest_cnt++;
     }
+    cpuids->nent = dest_cnt;
 }
 
 int tdx_pre_create_vcpu(CPUState *cpu, Error **errp)
diff mbox series

Patch

diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 2e3caa5a58fd..95ae2d4a4697 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -929,6 +929,7 @@  struct kvm_hyperv_eventfd {
 /* Trust Domain eXtension sub-ioctl() commands. */
 enum kvm_tdx_cmd_id {
 	KVM_TDX_CAPABILITIES = 0,
+	KVM_TDX_INIT_VM,
 
 	KVM_TDX_CMD_NR_MAX,
 };
@@ -970,4 +971,27 @@  struct kvm_tdx_capabilities {
 	struct kvm_tdx_cpuid_config cpuid_configs[];
 };
 
+struct kvm_tdx_init_vm {
+	__u64 attributes;
+	__u64 xfam;
+	__u64 mrconfigid[6];	/* sha384 digest */
+	__u64 mrowner[6];	/* sha384 digest */
+	__u64 mrownerconfig[6];	/* sha384 digest */
+
+	/* Must be zero; TD_PARAMS space before the CPUIDs totals 256 bytes */
+	__u64 reserved[12];
+
+	/*
+	 * KVM_TDX_INIT_VM must be called before any vCPU is created, and
+	 * thus before KVM_SET_CPUID2.
+	 * This configuration supersedes KVM_SET_CPUID2 for the vCPUs because
+	 * the TDX module virtualizes those CPUIDs directly, without the VMM.
+	 * The userspace VMM, e.g. QEMU, should keep KVM_SET_CPUID2 consistent
+	 * with these values.  If it doesn't, KVM may have the wrong idea of
+	 * the guest's CPUIDs and may wrongly emulate CPUIDs or MSRs that the
+	 * TDX module doesn't virtualize.
+	 */
+	struct kvm_cpuid2 cpuid;
+};
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 2617be544480..7310d8a8a503 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -1487,6 +1487,13 @@  int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
 	return r;
 }
 
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(
+	struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index)
+{
+	return cpuid_entry2_find(entries, nent, function, index);
+}
+EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry2);
+
 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
 						    u32 function, u32 index)
 {
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 41697cca354e..00570227e2ae 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -13,6 +13,8 @@  void kvm_set_cpu_caps(void);
 
 void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
 void kvm_update_pv_runtime(struct kvm_vcpu *vcpu);
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(struct kvm_cpuid_entry2 *entries,
+					       int nent, u32 function, u64 index);
 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
 						    u32 function, u32 index);
 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index a0954c3928e2..a6c711715a4a 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -7,6 +7,7 @@ 
 #include "tdx.h"
 #include "tdx_ops.h"
 
+
 #undef pr_fmt
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
@@ -356,12 +357,16 @@  static int tdx_do_tdh_mng_key_config(void *param)
 	return 0;
 }
 
-static int __tdx_td_init(struct kvm *kvm);
-
 int tdx_vm_init(struct kvm *kvm)
 {
 	kvm->arch.has_private_mem = true;
 
+	/*
+	 * This function initializes only KVM software construct.  It doesn't
+	 * initialize TDX stuff, e.g. TDCS, TDR, TDCX, HKID etc.
+	 * It is handled by KVM_TDX_INIT_VM, __tdx_td_init().
+	 */
+
 	/*
 	 * TDX has its own limit of the number of vcpus in addition to
 	 * KVM_MAX_VCPUS.
@@ -369,8 +374,7 @@  int tdx_vm_init(struct kvm *kvm)
 	kvm->max_vcpus = min(kvm->max_vcpus,
 			tdx_sysinfo->td_conf.max_vcpus_per_td);
 
-	/* Place holder for TDX specific logic. */
-	return __tdx_td_init(kvm);
+	return 0;
 }
 
 static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
@@ -419,7 +423,123 @@  static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
 	return ret;
 }
 
-static int __tdx_td_init(struct kvm *kvm)
+static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
+					struct td_params *td_params)
+{
+	const struct kvm_cpuid_entry2 *entry;
+	int max_pa = 36;	/* fallback if leaf 0x80000008 is absent */
+
+	entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0);
+	if (entry)
+		max_pa = entry->eax & 0xff;
+
+	td_params->eptp_controls = VMX_EPTP_MT_WB;
+	/*
+	 * No CPU supports 4-level EPT together with max_pa > 48.  See
+	 * "5-level paging and 5-level EPT" section 4.1 4-level EPT:
+	 * "4-level EPT is limited to translating 48-bit guest-physical
+	 *  addresses."
+	 * The cpu_has_vmx_ept_5levels() check is just in case.
+	 */
+	if (!cpu_has_vmx_ept_5levels() && max_pa > 48)
+		return -EINVAL;
+	if (cpu_has_vmx_ept_5levels() && max_pa > 48) {
+		td_params->eptp_controls |= VMX_EPTP_PWL_5;
+		td_params->exec_controls |= TDX_EXEC_CONTROL_MAX_GPAW;
+	} else {
+		td_params->eptp_controls |= VMX_EPTP_PWL_4;
+	}
+
+	return 0;
+}
+
+static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
+				 struct td_params *td_params)
+{
+	const struct tdx_sysinfo_td_conf *td_conf = &tdx_sysinfo->td_conf;
+	const struct kvm_tdx_cpuid_config *c;
+	const struct kvm_cpuid_entry2 *entry;
+	struct tdx_cpuid_value *value;
+	int i;
+
+	/*
+	 * td_params.cpuid_values: the number and order of cpuid_value entries
+	 * must match struct tdsysinfo.{num_cpuid_config, cpuid_configs}.
+	 * It's assumed that td_params was zeroed, so skipped entries stay zero.
+	 */
+	for (i = 0; i < td_conf->num_cpuid_config; i++) {
+		c = &kvm_tdx_caps->cpuid_configs[i];
+		entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
+					      c->leaf, c->sub_leaf);
+		if (!entry)
+			continue;	/* leaf not supplied by userspace */
+
+		/*
+		 * Reject user input that sets any bit that kvm_tdx_caps does
+		 * not report as configurable.
+		 */
+		if ((entry->eax & c->eax) != entry->eax ||
+		    (entry->ebx & c->ebx) != entry->ebx ||
+		    (entry->ecx & c->ecx) != entry->ecx ||
+		    (entry->edx & c->edx) != entry->edx)
+			return -EINVAL;
+
+		value = &td_params->cpuid_values[i];
+		value->eax = entry->eax;
+		value->ebx = entry->ebx;
+		value->ecx = entry->ecx;
+		value->edx = entry->edx;
+	}
+
+	return 0;
+}
+
+static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
+			struct kvm_tdx_init_vm *init_vm)
+{
+	const struct tdx_sysinfo_td_conf *td_conf = &tdx_sysinfo->td_conf;
+	struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
+	int ret;
+
+	if (kvm->created_vcpus)
+		return -EBUSY;
+
+	if (init_vm->attributes & ~kvm_tdx_caps->supported_attrs)
+		return -EINVAL;
+
+	if (init_vm->xfam & ~kvm_tdx_caps->supported_xfam)
+		return -EINVAL;
+
+	td_params->max_vcpus = kvm->max_vcpus;
+	td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1;
+	td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1;
+
+	/* td_params->exec_controls = TDX_CONTROL_FLAG_NO_RBP_MOD; */
+	td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);
+
+	ret = setup_tdparams_eptp_controls(cpuid, td_params);
+	if (ret)
+		return ret;
+
+	ret = setup_tdparams_cpuids(cpuid, td_params);
+	if (ret)
+		return ret;
+
+#define MEMCPY_SAME_SIZE(dst, src)				\
+	do {							\
+		BUILD_BUG_ON(sizeof(dst) != sizeof(src));	\
+		memcpy((dst), (src), sizeof(dst));		\
+	} while (0)
+
+	MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
+	MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
+	MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
+
+	return 0;
+}
+
+static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
+			 u64 *seamcall_err)
 {
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
 	cpumask_var_t packages;
@@ -427,8 +547,9 @@  static int __tdx_td_init(struct kvm *kvm)
 	unsigned long tdr_pa = 0;
 	unsigned long va;
 	int ret, i;
-	u64 err;
+	u64 err, rcx;
 
+	*seamcall_err = 0;
 	ret = tdx_guest_keyid_alloc();
 	if (ret < 0)
 		return ret;
@@ -543,10 +664,23 @@  static int __tdx_td_init(struct kvm *kvm)
 		}
 	}
 
-	/*
-	 * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires a dedicated
-	 * ioctl() to define the configure CPUID values for the TD.
-	 */
+	err = tdh_mng_init(kvm_tdx, __pa(td_params), &rcx);
+	if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
+		/*
+		 * Because a user gives operands, don't warn.
+		 * Return a hint to the user because it's sometimes hard for the
+		 * user to figure out which operand is invalid.  SEAMCALL status
+		 * code includes which operand caused invalid operand error.
+		 */
+		*seamcall_err = err;
+		ret = -EINVAL;
+		goto teardown;
+	} else if (WARN_ON_ONCE(err)) {
+		pr_tdx_error_1(TDH_MNG_INIT, err, rcx);
+		ret = -EIO;
+		goto teardown;
+	}
+
 	return 0;
 
 	/*
@@ -592,6 +726,86 @@  static int __tdx_td_init(struct kvm *kvm)
 	return ret;
 }
 
+static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
+{
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	struct kvm_tdx_init_vm *init_vm;
+	struct td_params *td_params = NULL;
+	int ret;
+
+	BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
+	BUILD_BUG_ON(sizeof(struct td_params) != 1024);
+
+	if (is_hkid_assigned(kvm_tdx))
+		return -EINVAL;
+
+	if (cmd->flags)
+		return -EINVAL;
+
+	init_vm = kmalloc(sizeof(*init_vm) +
+			  sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
+			  GFP_KERNEL);
+	if (!init_vm)
+		return -ENOMEM;
+
+	if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
+		ret = -E2BIG;
+		goto out;
+	}
+
+	if (copy_from_user(init_vm->cpuid.entries,
+			   u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
+			   flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (init_vm->cpuid.padding) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
+	if (!td_params) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = setup_tdparams(kvm, td_params, init_vm);
+	if (ret)
+		goto out;
+
+	ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
+	if (ret)
+		goto out;
+
+	kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
+	kvm_tdx->attributes = td_params->attributes;
+	kvm_tdx->xfam = td_params->xfam;
+
+	if (td_params->exec_controls & TDX_EXEC_CONTROL_MAX_GPAW)
+		kvm->arch.gfn_direct_bits = gpa_to_gfn(BIT_ULL(51));
+	else
+		kvm->arch.gfn_direct_bits = gpa_to_gfn(BIT_ULL(47));
+
+out:
+	/* kfree() accepts NULL. */
+	kfree(init_vm);
+	kfree(td_params);
+
+	return ret;
+}
+
 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_tdx_cmd tdx_cmd;
@@ -613,6 +827,9 @@  int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
 	case KVM_TDX_CAPABILITIES:
 		r = tdx_get_capabilities(&tdx_cmd);
 		break;
+	case KVM_TDX_INIT_VM:
+		r = tdx_td_init(kvm, &tdx_cmd);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 268959d0f74f..8912cb6d5bc2 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -16,7 +16,11 @@  struct kvm_tdx {
 	unsigned long tdr_pa;
 	unsigned long *tdcs_pa;
 
+	u64 attributes;
+	u64 xfam;
 	int hkid;
+
+	u64 tsc_offset;
 };
 
 struct vcpu_tdx {
diff --git a/arch/x86/kvm/vmx/tdx_ops.h b/arch/x86/kvm/vmx/tdx_ops.h
index 3f64c871a3f2..0363d8544f42 100644
--- a/arch/x86/kvm/vmx/tdx_ops.h
+++ b/arch/x86/kvm/vmx/tdx_ops.h
@@ -399,4 +399,16 @@  static inline u64 tdh_vp_wr(struct vcpu_tdx *tdx, u64 field, u64 val, u64 mask)
 	return seamcall(TDH_VP_WR, &in);
 }
 
+static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 field)
+{
+	u64 err, data;
+
+	err = tdh_mng_rd(kvm_tdx, TDCS_EXEC(field), &data);
+	if (unlikely(err)) {
+		pr_err("TDH_MNG_RD[EXEC.0x%x] failed: 0x%llx\n", field, err);
+		return 0;
+	}
+	return data;
+}
+
 #endif /* __KVM_X86_TDX_OPS_H */