diff mbox series

[RFC,v5,027/104] KVM: TDX: initialize VM with TDX specific parameters

Message ID c3b37cf5c83f92be0e153075d81a80729bf1031e.1646422845.git.isaku.yamahata@intel.com (mailing list archive)
State New, archived
Headers show
Series KVM TDX basic feature support | expand

Commit Message

Isaku Yamahata March 4, 2022, 7:48 p.m. UTC
From: Xiaoyao Li <xiaoyao.li@intel.com>

TDX requires additional parameters for TDX VM for confidential execution to
protect its confidentiality of its memory contents and its CPU state from
any other software, including VMM. When creating guest TD VM before
creating vcpu, the number of vcpu, TSC frequency (that is same among
vcpus. and it can't be changed.)  CPUIDs which is emulated by the TDX
module. It means guest can trust those CPUIDs. and sha384 values for
measurement.

Add new subcommand, KVM_TDX_INIT_VM, to pass parameters for TDX guest.  It
assigns encryption key to the TDX guest for memory encryption.  TDX
encrypts memory per-guest bases.  It assigns Device model passes per-VM
parameters for the TDX guest.  The maximum number of vcpus, tsc frequency
(TDX guest has fised VM-wide TSC frequency. not per-vcpu.  The TDX guest
can not change it.), attributes (production or debug), available extended
features (which is reflected into guest XCR0, IA32_XSS MSR), cpuids, sha384
measurements, and etc.

Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 arch/x86/include/asm/kvm_host.h       |   2 +
 arch/x86/include/uapi/asm/kvm.h       |  12 ++
 arch/x86/kvm/vmx/tdx.c                | 200 ++++++++++++++++++++++++++
 arch/x86/kvm/vmx/tdx.h                |  26 ++++
 arch/x86/kvm/x86.c                    |   3 +-
 arch/x86/kvm/x86.h                    |   2 +
 tools/arch/x86/include/uapi/asm/kvm.h |  12 ++
 7 files changed, 256 insertions(+), 1 deletion(-)

Comments

Huang, Kai March 31, 2022, 4:55 a.m. UTC | #1
On Fri, 2022-03-04 at 11:48 -0800, isaku.yamahata@intel.com wrote:
> From: Xiaoyao Li <xiaoyao.li@intel.com>
> 
> TDX requires additional parameters for TDX VM for confidential execution to
> protect its confidentiality of its memory contents and its CPU state from
> any other software, including VMM. When creating guest TD VM before
> creating vcpu, the number of vcpu, TSC frequency (that is same among
> vcpus. and it can't be changed.)  CPUIDs which is emulated by the TDX
> module. It means guest can trust those CPUIDs. and sha384 values for
> measurement.
> 
> Add new subcommand, KVM_TDX_INIT_VM, to pass parameters for TDX guest.  It
> assigns encryption key to the TDX guest for memory encryption.  TDX
> encrypts memory per-guest bases.  It assigns Device model passes per-VM
> parameters for the TDX guest.  The maximum number of vcpus, tsc frequency
> (TDX guest has fised VM-wide TSC frequency. not per-vcpu.  The TDX guest
> can not change it.), attributes (production or debug), available extended
> features (which is reflected into guest XCR0, IA32_XSS MSR), cpuids, sha384
> measurements, and etc.
> 
> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>  arch/x86/include/asm/kvm_host.h       |   2 +
>  arch/x86/include/uapi/asm/kvm.h       |  12 ++
>  arch/x86/kvm/vmx/tdx.c                | 200 ++++++++++++++++++++++++++
>  arch/x86/kvm/vmx/tdx.h                |  26 ++++
>  arch/x86/kvm/x86.c                    |   3 +-
>  arch/x86/kvm/x86.h                    |   2 +
>  tools/arch/x86/include/uapi/asm/kvm.h |  12 ++
>  7 files changed, 256 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 5ff7a0fba311..290e200f012c 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1234,6 +1234,8 @@ struct kvm_arch {
>  	hpa_t	hv_root_tdp;
>  	spinlock_t hv_root_tdp_lock;
>  #endif
> +
> +	gfn_t gfn_shared_mask;
>  };
>  
>  struct kvm_vm_stat {
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index 70f9be4ea575..6e26dde0dce6 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -531,6 +531,7 @@ struct kvm_pmu_event_filter {
>  /* Trust Domain eXtension sub-ioctl() commands. */
>  enum kvm_tdx_cmd_id {
>  	KVM_TDX_CAPABILITIES = 0,
> +	KVM_TDX_INIT_VM,
>  
>  	KVM_TDX_CMD_NR_MAX,
>  };
> @@ -561,4 +562,15 @@ struct kvm_tdx_capabilities {
>  	struct kvm_tdx_cpuid_config cpuid_configs[0];
>  };
>  
> +struct kvm_tdx_init_vm {
> +	__u32 max_vcpus;
> +	__u32 tsc_khz;
> +	__u64 attributes;
> +	__u64 cpuid;

Is it better to append all CPUIDs directly into this structure, perhaps at end
of this structure, to make it more consistent with TD_PARAMS?

Also, I think somewhere in commit message or comments we should explain why
CPUIDs are passed here (why existing KVM_SET_CUPID2 is not sufficient).

> +	__u64 mrconfigid[6];	/* sha384 digest */
> +	__u64 mrowner[6];	/* sha384 digest */
> +	__u64 mrownerconfig[6];	/* sha348 digest */
> +	__u64 reserved[43];	/* must be zero for future extensibility */
> +};
> +
>  #endif /* _ASM_X86_KVM_H */
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 20b45bb0b032..236faaca68a0 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -387,6 +387,203 @@ static int tdx_capabilities(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
>  	return 0;
>  }
>  
> +static struct kvm_cpuid_entry2 *tdx_find_cpuid_entry(struct kvm_tdx *kvm_tdx,
> +						u32 function, u32 index)
> +{
> +	struct kvm_cpuid_entry2 *e;
> +	int i;
> +
> +	for (i = 0; i < kvm_tdx->cpuid_nent; i++) {
> +		e = &kvm_tdx->cpuid_entries[i];
> +
> +		if (e->function == function && (e->index == index ||
> +		    !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX)))
> +			return e;
> +	}
> +	return NULL;
> +}
> +
> +static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
> +			struct kvm_tdx_init_vm *init_vm)
> +{
> +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> +	struct tdx_cpuid_config *config;
> +	struct kvm_cpuid_entry2 *entry;
> +	struct tdx_cpuid_value *value;
> +	u64 guest_supported_xcr0;
> +	u64 guest_supported_xss;
> +	u32 guest_tsc_khz;
> +	int max_pa;
> +	int i;
> +
> +	/* init_vm->reserved must be zero */
> +	if (find_first_bit((unsigned long *)init_vm->reserved,
> +			   sizeof(init_vm->reserved) * 8) !=
> +	    sizeof(init_vm->reserved) * 8)
> +		return -EINVAL;
> +
> +	td_params->max_vcpus = init_vm->max_vcpus;
> +
> +	td_params->attributes = init_vm->attributes;
> +	if (td_params->attributes & TDX_TD_ATTRIBUTE_PERFMON) {
> +		pr_warn("TD doesn't support perfmon. KVM needs to save/restore "
> +			"host perf registers properly.\n");
> +		return -EOPNOTSUPP;
> +	}

PERFMON can be supported but it's not support in this series, so perhaps add a
comment to explain it's a TODO?

> +
> +	/* TODO: Enforce consistent CPUID features for all vCPUs. */

I guess you have to enforce when you do KVM_SET_CPUID2 after vcpu is created?
Then I guess this comment shouldn't be here, because the enforcement isn't
something you can do here in setup_tdparams().

> +	for (i = 0; i < tdx_caps.nr_cpuid_configs; i++) {
> +		config = &tdx_caps.cpuid_configs[i];
> +
> +		entry = tdx_find_cpuid_entry(kvm_tdx, config->leaf,
> +					     config->sub_leaf);
> +		if (!entry)
> +			continue;
> +
> +		/*
> +		 * Non-configurable bits must be '0', even if they are fixed to
> +		 * '1' by the TDX module, i.e. mask off non-configurable bits.
> +		 */
> +		value = &td_params->cpuid_values[i];
> +		value->eax = entry->eax & config->eax;
> +		value->ebx = entry->ebx & config->ebx;
> +		value->ecx = entry->ecx & config->ecx;
> +		value->edx = entry->edx & config->edx;
> +	}
> +
> +	max_pa = 36;
> +	entry = tdx_find_cpuid_entry(kvm_tdx, 0x80000008, 0);
> +	if (entry)
> +		max_pa = entry->eax & 0xff;
> +
> +	td_params->eptp_controls = VMX_EPTP_MT_WB;
> +	if (cpu_has_vmx_ept_5levels() && max_pa > 48) {
> +		td_params->eptp_controls |= VMX_EPTP_PWL_5;
> +		td_params->exec_controls |= TDX_EXEC_CONTROL_MAX_GPAW;
> +	} else {
> +		td_params->eptp_controls |= VMX_EPTP_PWL_4;
> +	}

Not quite sure, but could we support >48 GPA with 4-level EPT?
Paolo Bonzini April 5, 2022, 12:58 p.m. UTC | #2
On 3/4/22 20:48, isaku.yamahata@intel.com wrote:
> +	td_params->attributes = init_vm->attributes;
> +	if (td_params->attributes & TDX_TD_ATTRIBUTE_PERFMON) {
> +		pr_warn("TD doesn't support perfmon. KVM needs to save/restore "
> +			"host perf registers properly.\n");
> +		return -EOPNOTSUPP;
> +	}

Why does KVM have to hardcode this (and LBR/AMX below)?  Is the level of 
hardware support available from tdx_caps, for example through the CPUID 
configs (0xA for this one, 0xD for LBR and AMX)?

> +	/* PT can be exposed to TD guest regardless of KVM's XSS support */
> +	guest_supported_xss &= (supported_xss | XFEATURE_MASK_PT);
> +	td_params->xfam = guest_supported_xcr0 | guest_supported_xss;
> +	if (td_params->xfam & TDX_TD_XFAM_LBR) {
> +		pr_warn("TD doesn't support LBR. KVM needs to save/restore "
> +			"IA32_LBR_DEPTH properly.\n");
> +		return -EOPNOTSUPP;
> +	}
> +
> +	if (td_params->xfam & TDX_TD_XFAM_AMX) {
> +		pr_warn("TD doesn't support AMX. KVM needs to save/restore "
> +			"IA32_XFD, IA32_XFD_ERR properly.\n");
> +		return -EOPNOTSUPP;
> +	}

> 
> +	if (init_vm->tsc_khz)
> +		guest_tsc_khz = init_vm->tsc_khz;
> +	else
> +		guest_tsc_khz = max_tsc_khz;

You can just use kvm->arch.default_tsc_khz in the latest kvm/queue.

> +#define BUILD_BUG_ON_MEMCPY(dst, src)				\
> +	do {							\
> +		BUILD_BUG_ON(sizeof(dst) != sizeof(src));	\
> +		memcpy((dst), (src), sizeof(dst));		\
> +	} while (0)
> +
> +	BUILD_BUG_ON_MEMCPY(td_params->mrconfigid, init_vm->mrconfigid);
> +	BUILD_BUG_ON_MEMCPY(td_params->mrowner, init_vm->mrowner);
> +	BUILD_BUG_ON_MEMCPY(td_params->mrownerconfig, init_vm->mrownerconfig);
> +


Please rename to MEMCPY_SAME_SIZE.

Thanks,

Paolo
Paolo Bonzini April 5, 2022, 1:01 p.m. UTC | #3
On 3/31/22 06:55, Kai Huang wrote:
>>   
>> +struct kvm_tdx_init_vm {
>> +	__u32 max_vcpus;
>> +	__u32 tsc_khz;
>> +	__u64 attributes;
>> +	__u64 cpuid;
> Is it better to append all CPUIDs directly into this structure, perhaps at end
> of this structure, to make it more consistent with TD_PARAMS?
> 
> Also, I think somewhere in commit message or comments we should explain why
> CPUIDs are passed here (why existing KVM_SET_CUPID2 is not sufficient).
> 

Indeed, it would be easier to use the existing cpuid data in struct 
kvm_vcpu, because right now there is no way to ensure that they are 
consistent.

Why is KVM_SET_CPUID2 not enough?  Are there any modifications done by 
KVM that affect the measurement?

Thanks,

Paolo
Xiaoyao Li April 6, 2022, 2:06 a.m. UTC | #4
On 4/5/2022 9:01 PM, Paolo Bonzini wrote:
> On 3/31/22 06:55, Kai Huang wrote:
>>> +struct kvm_tdx_init_vm {
>>> +    __u32 max_vcpus;
>>> +    __u32 tsc_khz;
>>> +    __u64 attributes;
>>> +    __u64 cpuid;
>> Is it better to append all CPUIDs directly into this structure, 
>> perhaps at end
>> of this structure, to make it more consistent with TD_PARAMS?
>>
>> Also, I think somewhere in commit message or comments we should 
>> explain why
>> CPUIDs are passed here (why existing KVM_SET_CUPID2 is not sufficient).
>>
> 
> Indeed, it would be easier to use the existing cpuid data in struct 
> kvm_vcpu, because right now there is no way to ensure that they are 
> consistent.
> 
> Why is KVM_SET_CPUID2 not enough?  Are there any modifications done by 
> KVM that affect the measurement?

Then we get the situation that KVM_TDX_INIT_VM must be called after 1 
vcpu is created. It seems illogical that it has chance to fail the VM 
scope initialization after 1 vcpu is successfully created.

> Thanks,
> 
> Paolo
>
Paolo Bonzini April 6, 2022, 11:27 a.m. UTC | #5
On 4/6/22 04:06, Xiaoyao Li wrote:
>>
>> Indeed, it would be easier to use the existing cpuid data in struct 
>> kvm_vcpu, because right now there is no way to ensure that they are 
>> consistent.
>>
>> Why is KVM_SET_CPUID2 not enough?  Are there any modifications done by 
>> KVM that affect the measurement?
> 
> Then we get the situation that KVM_TDX_INIT_VM must be called after 1 
> vcpu is created. It seems illogical that it has chance to fail the VM 
> scope initialization after 1 vcpu is successfully created.

I see.  Yeah, it makes sense to have the CPUID in KVM_TDX_INIT_VM then.

Paolo
Xiaoyao Li April 7, 2022, 1:29 a.m. UTC | #6
On 4/5/2022 8:58 PM, Paolo Bonzini wrote:
> On 3/4/22 20:48, isaku.yamahata@intel.com wrote:
>> +    td_params->attributes = init_vm->attributes;
>> +    if (td_params->attributes & TDX_TD_ATTRIBUTE_PERFMON) {
>> +        pr_warn("TD doesn't support perfmon. KVM needs to save/restore "
>> +            "host perf registers properly.\n");
>> +        return -EOPNOTSUPP;
>> +    }
> 
> Why does KVM have to hardcode this (and LBR/AMX below)?  Is the level of 
> hardware support available from tdx_caps, for example through the CPUID 
> configs (0xA for this one, 0xD for LBR and AMX)?

It's wrong code. PMU is allowed.

AMX and LBR are disallowed because and the time we wrote the codes they 
are not supported by KVM. Now AMX should be allowed, but (arch-)LBR 
should be still blocked until KVM merges arch-LBR support.

>> +    /* PT can be exposed to TD guest regardless of KVM's XSS support */
>> +    guest_supported_xss &= (supported_xss | XFEATURE_MASK_PT);
>> +    td_params->xfam = guest_supported_xcr0 | guest_supported_xss;
>> +    if (td_params->xfam & TDX_TD_XFAM_LBR) {
>> +        pr_warn("TD doesn't support LBR. KVM needs to save/restore "
>> +            "IA32_LBR_DEPTH properly.\n");
>> +        return -EOPNOTSUPP;
>> +    }
>> +
>> +    if (td_params->xfam & TDX_TD_XFAM_AMX) {
>> +        pr_warn("TD doesn't support AMX. KVM needs to save/restore "
>> +            "IA32_XFD, IA32_XFD_ERR properly.\n");
>> +        return -EOPNOTSUPP;
>> +    }
> 
>>
>> +    if (init_vm->tsc_khz)
>> +        guest_tsc_khz = init_vm->tsc_khz;
>> +    else
>> +        guest_tsc_khz = max_tsc_khz;
> 
> You can just use kvm->arch.default_tsc_khz in the latest kvm/queue.

yes. will change it.
Huang, Kai April 7, 2022, 1:51 a.m. UTC | #7
On Thu, 2022-04-07 at 09:29 +0800, Xiaoyao Li wrote:
> On 4/5/2022 8:58 PM, Paolo Bonzini wrote:
> > On 3/4/22 20:48, isaku.yamahata@intel.com wrote:
> > > +    td_params->attributes = init_vm->attributes;
> > > +    if (td_params->attributes & TDX_TD_ATTRIBUTE_PERFMON) {
> > > +        pr_warn("TD doesn't support perfmon. KVM needs to save/restore "
> > > +            "host perf registers properly.\n");
> > > +        return -EOPNOTSUPP;
> > > +    }
> > 
> > Why does KVM have to hardcode this (and LBR/AMX below)?  Is the level of 
> > hardware support available from tdx_caps, for example through the CPUID 
> > configs (0xA for this one, 0xD for LBR and AMX)?
> 
> It's wrong code. PMU is allowed.
> 
> AMX and LBR are disallowed because and the time we wrote the codes they 
> are not supported by KVM. Now AMX should be allowed, but (arch-)LBR 
> should be still blocked until KVM merges arch-LBR support.

I think Isaku's idea is we don't support them in the first submission?

If so as I suggested, we should add a TODO in comment..
Isaku Yamahata April 8, 2022, 2:18 a.m. UTC | #8
On Thu, Mar 31, 2022 at 05:55:01PM +1300,
Kai Huang <kai.huang@intel.com> wrote:

> > diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> > index 70f9be4ea575..6e26dde0dce6 100644
> > --- a/arch/x86/include/uapi/asm/kvm.h
> > +++ b/arch/x86/include/uapi/asm/kvm.h
> > @@ -531,6 +531,7 @@ struct kvm_pmu_event_filter {
> >  /* Trust Domain eXtension sub-ioctl() commands. */
> >  enum kvm_tdx_cmd_id {
> >  	KVM_TDX_CAPABILITIES = 0,
> > +	KVM_TDX_INIT_VM,
> >  
> >  	KVM_TDX_CMD_NR_MAX,
> >  };
> > @@ -561,4 +562,15 @@ struct kvm_tdx_capabilities {
> >  	struct kvm_tdx_cpuid_config cpuid_configs[0];
> >  };
> >  
> > +struct kvm_tdx_init_vm {
> > +	__u32 max_vcpus;
> > +	__u32 tsc_khz;
> > +	__u64 attributes;
> > +	__u64 cpuid;
> 
> Is it better to append all CPUIDs directly into this structure, perhaps at end
> of this structure, to make it more consistent with TD_PARAMS?
> 
> Also, I think somewhere in commit message or comments we should explain why
> CPUIDs are passed here (why existing KVM_SET_CUPID2 is not sufficient).

Ok, let's change the data structure to match more with TD_PARAMS.


> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index 20b45bb0b032..236faaca68a0 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -387,6 +387,203 @@ static int tdx_capabilities(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
> >  	return 0;
> >  }
> >  
> > +static struct kvm_cpuid_entry2 *tdx_find_cpuid_entry(struct kvm_tdx *kvm_tdx,
> > +						u32 function, u32 index)
> > +{
> > +	struct kvm_cpuid_entry2 *e;
> > +	int i;
> > +
> > +	for (i = 0; i < kvm_tdx->cpuid_nent; i++) {
> > +		e = &kvm_tdx->cpuid_entries[i];
> > +
> > +		if (e->function == function && (e->index == index ||
> > +		    !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX)))
> > +			return e;
> > +	}
> > +	return NULL;
> > +}
> > +
> > +static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
> > +			struct kvm_tdx_init_vm *init_vm)
> > +{
> > +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> > +	struct tdx_cpuid_config *config;
> > +	struct kvm_cpuid_entry2 *entry;
> > +	struct tdx_cpuid_value *value;
> > +	u64 guest_supported_xcr0;
> > +	u64 guest_supported_xss;
> > +	u32 guest_tsc_khz;
> > +	int max_pa;
> > +	int i;
> > +
> > +	/* init_vm->reserved must be zero */
> > +	if (find_first_bit((unsigned long *)init_vm->reserved,
> > +			   sizeof(init_vm->reserved) * 8) !=
> > +	    sizeof(init_vm->reserved) * 8)
> > +		return -EINVAL;
> > +
> > +	td_params->max_vcpus = init_vm->max_vcpus;
> > +
> > +	td_params->attributes = init_vm->attributes;
> > +	if (td_params->attributes & TDX_TD_ATTRIBUTE_PERFMON) {
> > +		pr_warn("TD doesn't support perfmon. KVM needs to save/restore "
> > +			"host perf registers properly.\n");
> > +		return -EOPNOTSUPP;
> > +	}
> 
> PERFMON can be supported but it's not support in this series, so perhaps add a
> comment to explain it's a TODO?

Yes, good idea. Will do.


> > +	max_pa = 36;
> > +	entry = tdx_find_cpuid_entry(kvm_tdx, 0x80000008, 0);
> > +	if (entry)
> > +		max_pa = entry->eax & 0xff;
> > +
> > +	td_params->eptp_controls = VMX_EPTP_MT_WB;
> > +	if (cpu_has_vmx_ept_5levels() && max_pa > 48) {
> > +		td_params->eptp_controls |= VMX_EPTP_PWL_5;
> > +		td_params->exec_controls |= TDX_EXEC_CONTROL_MAX_GPAW;
> > +	} else {
> > +		td_params->eptp_controls |= VMX_EPTP_PWL_4;
> > +	}
> 
> Not quite sure, but could we support >48 GPA with 4-level EPT?

No.
"5-level paging and 5-level EPT"
section 4.1 4-level EPT
"4-level EPT is limited to translating 48-bit guest-physical addresses."
Isaku Yamahata April 8, 2022, 3:33 a.m. UTC | #9
On Thu, Apr 07, 2022 at 01:51:38PM +1200,
Kai Huang <kai.huang@intel.com> wrote:

> On Thu, 2022-04-07 at 09:29 +0800, Xiaoyao Li wrote:
> > On 4/5/2022 8:58 PM, Paolo Bonzini wrote:
> > > On 3/4/22 20:48, isaku.yamahata@intel.com wrote:
> > > > +    td_params->attributes = init_vm->attributes;
> > > > +    if (td_params->attributes & TDX_TD_ATTRIBUTE_PERFMON) {
> > > > +        pr_warn("TD doesn't support perfmon. KVM needs to save/restore "
> > > > +            "host perf registers properly.\n");
> > > > +        return -EOPNOTSUPP;
> > > > +    }
> > > 
> > > Why does KVM have to hardcode this (and LBR/AMX below)?  Is the level of 
> > > hardware support available from tdx_caps, for example through the CPUID 
> > > configs (0xA for this one, 0xD for LBR and AMX)?
> > 
> > It's wrong code. PMU is allowed.
> > 
> > AMX and LBR are disallowed because and the time we wrote the codes they 
> > are not supported by KVM. Now AMX should be allowed, but (arch-)LBR 
> > should be still blocked until KVM merges arch-LBR support.
> 
> I think Isaku's idea is we don't support them in the first submission?
> 
> If so as I suggested, we should add a TODO in comment..

Sure, will add a TODO comment.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5ff7a0fba311..290e200f012c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1234,6 +1234,8 @@  struct kvm_arch {
 	hpa_t	hv_root_tdp;
 	spinlock_t hv_root_tdp_lock;
 #endif
+
+	gfn_t gfn_shared_mask;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 70f9be4ea575..6e26dde0dce6 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -531,6 +531,7 @@  struct kvm_pmu_event_filter {
 /* Trust Domain eXtension sub-ioctl() commands. */
 enum kvm_tdx_cmd_id {
 	KVM_TDX_CAPABILITIES = 0,
+	KVM_TDX_INIT_VM,
 
 	KVM_TDX_CMD_NR_MAX,
 };
@@ -561,4 +562,15 @@  struct kvm_tdx_capabilities {
 	struct kvm_tdx_cpuid_config cpuid_configs[0];
 };
 
+struct kvm_tdx_init_vm {
+	__u32 max_vcpus;
+	__u32 tsc_khz;
+	__u64 attributes;
+	__u64 cpuid;
+	__u64 mrconfigid[6];	/* sha384 digest */
+	__u64 mrowner[6];	/* sha384 digest */
+	__u64 mrownerconfig[6];	/* sha348 digest */
+	__u64 reserved[43];	/* must be zero for future extensibility */
+};
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 20b45bb0b032..236faaca68a0 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -387,6 +387,203 @@  static int tdx_capabilities(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
 	return 0;
 }
 
+static struct kvm_cpuid_entry2 *tdx_find_cpuid_entry(struct kvm_tdx *kvm_tdx,
+						u32 function, u32 index)
+{
+	struct kvm_cpuid_entry2 *e;
+	int i;
+
+	for (i = 0; i < kvm_tdx->cpuid_nent; i++) {
+		e = &kvm_tdx->cpuid_entries[i];
+
+		if (e->function == function && (e->index == index ||
+		    !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX)))
+			return e;
+	}
+	return NULL;
+}
+
+static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
+			struct kvm_tdx_init_vm *init_vm)
+{
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	struct tdx_cpuid_config *config;
+	struct kvm_cpuid_entry2 *entry;
+	struct tdx_cpuid_value *value;
+	u64 guest_supported_xcr0;
+	u64 guest_supported_xss;
+	u32 guest_tsc_khz;
+	int max_pa;
+	int i;
+
+	/* init_vm->reserved must be zero */
+	if (find_first_bit((unsigned long *)init_vm->reserved,
+			   sizeof(init_vm->reserved) * 8) !=
+	    sizeof(init_vm->reserved) * 8)
+		return -EINVAL;
+
+	td_params->max_vcpus = init_vm->max_vcpus;
+
+	td_params->attributes = init_vm->attributes;
+	if (td_params->attributes & TDX_TD_ATTRIBUTE_PERFMON) {
+		pr_warn("TD doesn't support perfmon. KVM needs to save/restore "
+			"host perf registers properly.\n");
+		return -EOPNOTSUPP;
+	}
+
+	/* TODO: Enforce consistent CPUID features for all vCPUs. */
+	for (i = 0; i < tdx_caps.nr_cpuid_configs; i++) {
+		config = &tdx_caps.cpuid_configs[i];
+
+		entry = tdx_find_cpuid_entry(kvm_tdx, config->leaf,
+					     config->sub_leaf);
+		if (!entry)
+			continue;
+
+		/*
+		 * Non-configurable bits must be '0', even if they are fixed to
+		 * '1' by the TDX module, i.e. mask off non-configurable bits.
+		 */
+		value = &td_params->cpuid_values[i];
+		value->eax = entry->eax & config->eax;
+		value->ebx = entry->ebx & config->ebx;
+		value->ecx = entry->ecx & config->ecx;
+		value->edx = entry->edx & config->edx;
+	}
+
+	max_pa = 36;
+	entry = tdx_find_cpuid_entry(kvm_tdx, 0x80000008, 0);
+	if (entry)
+		max_pa = entry->eax & 0xff;
+
+	td_params->eptp_controls = VMX_EPTP_MT_WB;
+	if (cpu_has_vmx_ept_5levels() && max_pa > 48) {
+		td_params->eptp_controls |= VMX_EPTP_PWL_5;
+		td_params->exec_controls |= TDX_EXEC_CONTROL_MAX_GPAW;
+	} else {
+		td_params->eptp_controls |= VMX_EPTP_PWL_4;
+	}
+
+	/* Setup td_params.xfam */
+	entry = tdx_find_cpuid_entry(kvm_tdx, 0xd, 0);
+	if (entry)
+		guest_supported_xcr0 = (entry->eax | ((u64)entry->edx << 32));
+	else
+		guest_supported_xcr0 = 0;
+	guest_supported_xcr0 &= supported_xcr0;
+
+	entry = tdx_find_cpuid_entry(kvm_tdx, 0xd, 1);
+	if (entry)
+		guest_supported_xss = (entry->ecx | ((u64)entry->edx << 32));
+	else
+		guest_supported_xss = 0;
+	/* PT can be exposed to TD guest regardless of KVM's XSS support */
+	guest_supported_xss &= (supported_xss | XFEATURE_MASK_PT);
+
+	td_params->xfam = guest_supported_xcr0 | guest_supported_xss;
+	if (td_params->xfam & TDX_TD_XFAM_LBR) {
+		pr_warn("TD doesn't support LBR. KVM needs to save/restore "
+			"IA32_LBR_DEPTH properly.\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (td_params->xfam & TDX_TD_XFAM_AMX) {
+		pr_warn("TD doesn't support AMX. KVM needs to save/restore "
+			"IA32_XFD, IA32_XFD_ERR properly.\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (init_vm->tsc_khz)
+		guest_tsc_khz = init_vm->tsc_khz;
+	else
+		guest_tsc_khz = max_tsc_khz;
+	td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(guest_tsc_khz);
+
+#define BUILD_BUG_ON_MEMCPY(dst, src)				\
+	do {							\
+		BUILD_BUG_ON(sizeof(dst) != sizeof(src));	\
+		memcpy((dst), (src), sizeof(dst));		\
+	} while (0)
+
+	BUILD_BUG_ON_MEMCPY(td_params->mrconfigid, init_vm->mrconfigid);
+	BUILD_BUG_ON_MEMCPY(td_params->mrowner, init_vm->mrowner);
+	BUILD_BUG_ON_MEMCPY(td_params->mrownerconfig, init_vm->mrownerconfig);
+
+	return 0;
+}
+
+static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
+{
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	struct kvm_cpuid2 __user *user_cpuid;
+	struct kvm_tdx_init_vm init_vm;
+	struct td_params *td_params;
+	struct tdx_module_output out;
+	struct kvm_cpuid2 cpuid;
+	int ret;
+	u64 err;
+
+	BUILD_BUG_ON(sizeof(init_vm) != 512);
+	BUILD_BUG_ON(sizeof(struct td_params) != 1024);
+
+	if (is_td_initialized(kvm))
+		return -EINVAL;
+
+	if (cmd->metadata)
+		return -EINVAL;
+
+	if (copy_from_user(&init_vm, (void __user *)cmd->data, sizeof(init_vm)))
+		return -EFAULT;
+
+	if (init_vm.max_vcpus > KVM_MAX_VCPUS)
+		return -EINVAL;
+
+	user_cpuid = (void *)init_vm.cpuid;
+	if (copy_from_user(&cpuid, user_cpuid, sizeof(cpuid)))
+		return -EFAULT;
+
+	if (cpuid.nent > KVM_MAX_CPUID_ENTRIES)
+		return -E2BIG;
+
+	if (copy_from_user(&kvm_tdx->cpuid_entries, user_cpuid->entries,
+			   cpuid.nent * sizeof(struct kvm_cpuid_entry2)))
+		return -EFAULT;
+
+	td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL_ACCOUNT);
+	if (!td_params)
+		return -ENOMEM;
+
+	kvm_tdx->cpuid_nent = cpuid.nent;
+
+	ret = setup_tdparams(kvm, td_params, &init_vm);
+	if (ret)
+		goto free_tdparams;
+
+	err = tdh_mng_init(kvm_tdx->tdr.pa, __pa(td_params), &out);
+	if (WARN_ON_ONCE(err)) {
+		pr_tdx_error(TDH_MNG_INIT, err, &out);
+		ret = -EIO;
+		goto free_tdparams;
+	}
+
+	kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
+	kvm_tdx->attributes = td_params->attributes;
+	kvm_tdx->xfam = td_params->xfam;
+	kvm_tdx->tsc_khz = TDX_TSC_25MHZ_TO_KHZ(td_params->tsc_frequency);
+	kvm->max_vcpus = td_params->max_vcpus;
+
+	if (td_params->exec_controls & TDX_EXEC_CONTROL_MAX_GPAW)
+		kvm->arch.gfn_shared_mask = gpa_to_gfn(BIT_ULL(51));
+	else
+		kvm->arch.gfn_shared_mask = gpa_to_gfn(BIT_ULL(47));
+
+free_tdparams:
+	kfree(td_params);
+	if (ret)
+		kvm_tdx->cpuid_nent = 0;
+	return ret;
+}
+
 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_tdx_cmd tdx_cmd;
@@ -401,6 +598,9 @@  int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
 	case KVM_TDX_CAPABILITIES:
 		r = tdx_capabilities(kvm, &tdx_cmd);
 		break;
+	case KVM_TDX_INIT_VM:
+		r = tdx_td_init(kvm, &tdx_cmd);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 860136ed70f5..f116c40ac319 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -20,7 +20,15 @@  struct kvm_tdx {
 	struct tdx_td_page tdr;
 	struct tdx_td_page *tdcs;
 
+	u64 attributes;
+	u64 xfam;
 	int hkid;
+
+	int cpuid_nent;
+	struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
+
+	u64 tsc_offset;
+	unsigned long tsc_khz;
 };
 
 struct vcpu_tdx {
@@ -50,6 +58,11 @@  static inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_tdx, vcpu);
 }
 
+static inline bool is_td_initialized(struct kvm *kvm)
+{
+	return !!kvm->max_vcpus;
+}
+
 static __always_inline void tdvps_vmcs_check(u32 field, u8 bits)
 {
 	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && (field) & 0x1,
@@ -135,6 +148,19 @@  TDX_BUILD_TDVPS_ACCESSORS(64, VMCS, vmcs);
 TDX_BUILD_TDVPS_ACCESSORS(64, STATE_NON_ARCH, state_non_arch);
 TDX_BUILD_TDVPS_ACCESSORS(8, MANAGEMENT, management);
 
+static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 field)
+{
+	struct tdx_module_output out;
+	u64 err;
+
+	err = tdh_mng_rd(kvm_tdx->tdr.pa, TDCS_EXEC(field), &out);
+	if (unlikely(err)) {
+		pr_err("TDH_MNG_RD[EXEC.0x%x] failed: 0x%llx\n", field, err);
+		return 0;
+	}
+	return out.r8;
+}
+
 #else
 static inline int tdx_module_setup(void) { return -ENODEV; };
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a48f5c69fadb..734699bd940f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2274,7 +2274,8 @@  static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
 #endif
 
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
-static unsigned long max_tsc_khz;
+unsigned long max_tsc_khz;
+EXPORT_SYMBOL_GPL(max_tsc_khz);
 
 static u32 adjust_tsc_khz(u32 khz, s32 ppm)
 {
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f11d945ac41f..5ff3badc3f2b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -302,6 +302,8 @@  extern int pi_inject_timer;
 
 extern bool report_ignored_msrs;
 
+extern unsigned long max_tsc_khz;
+
 static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
 {
 	return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
index 70f9be4ea575..6e26dde0dce6 100644
--- a/tools/arch/x86/include/uapi/asm/kvm.h
+++ b/tools/arch/x86/include/uapi/asm/kvm.h
@@ -531,6 +531,7 @@  struct kvm_pmu_event_filter {
 /* Trust Domain eXtension sub-ioctl() commands. */
 enum kvm_tdx_cmd_id {
 	KVM_TDX_CAPABILITIES = 0,
+	KVM_TDX_INIT_VM,
 
 	KVM_TDX_CMD_NR_MAX,
 };
@@ -561,4 +562,15 @@  struct kvm_tdx_capabilities {
 	struct kvm_tdx_cpuid_config cpuid_configs[0];
 };
 
+struct kvm_tdx_init_vm {
+	__u32 max_vcpus;
+	__u32 tsc_khz;
+	__u64 attributes;
+	__u64 cpuid;
+	__u64 mrconfigid[6];	/* sha384 digest */
+	__u64 mrowner[6];	/* sha384 digest */
+	__u64 mrownerconfig[6];	/* sha348 digest */
+	__u64 reserved[43];	/* must be zero for future extensibility */
+};
+
 #endif /* _ASM_X86_KVM_H */