diff mbox series

[v8,29/40] x86/compressed/64: add support for SEV-SNP CPUID table in #VC handlers

Message ID 20211210154332.11526-30-brijesh.singh@amd.com (mailing list archive)
State New
Headers show
Series Add AMD Secure Nested Paging (SEV-SNP) Guest Support | expand

Commit Message

Brijesh Singh Dec. 10, 2021, 3:43 p.m. UTC
From: Michael Roth <michael.roth@amd.com>

CPUID instructions generate a #VC exception for SEV-ES/SEV-SNP guests,
for which early handlers are currently set up to handle. In the case
of SEV-SNP, guests can use a configurable location in guest memory
that has been pre-populated with a firmware-validated CPUID table to
look up the relevant CPUID values rather than requesting them from
hypervisor via a VMGEXIT. Add the various hooks in the #VC handlers to
allow CPUID instructions to be handled via the table. The code to
actually configure/enable the table will be added in a subsequent
commit.

Signed-off-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
---
 arch/x86/boot/compressed/sev.c    |   1 +
 arch/x86/include/asm/sev-common.h |   2 +
 arch/x86/kernel/sev-shared.c      | 320 ++++++++++++++++++++++++++++++
 arch/x86/kernel/sev.c             |   1 +
 4 files changed, 324 insertions(+)

Comments

Michael Roth Jan. 13, 2022, 4:39 p.m. UTC | #1
On Thu, Jan 13, 2022 at 02:16:05PM +0100, Borislav Petkov wrote:
> On Fri, Dec 10, 2021 at 09:43:21AM -0600, Brijesh Singh wrote:
> > +/*
> > + * Individual entries of the SEV-SNP CPUID table, as defined by the SEV-SNP
> > + * Firmware ABI, Revision 0.9, Section 7.1, Table 14. Note that the XCR0_IN
> > + * and XSS_IN are denoted here as __unused/__unused2, since they are not
> > + * needed for the current guest implementation,
> 
> That's fine and great but you need to check in the function where you
> iterate over those leafs below whether those unused variables are 0
> and fail if not. Not that BIOS or whoever creates that table, starts
> becoming creative...

It's not actually necessary for the values to be 0 in this case. If a
hypervisor chooses to use them (which is allowed by current firmware
ABI, but should perhaps be updated to suggest against it), then guest
code written for that specific hypervisor implementation can choose to
make use of the fields.

But if a guest chooses to ignore the XCR0_IN/XSS_IN values, and instead
compute the XSAVE buffer size from scratch using it's own knowledge of
what features are enabled in the actual XCR0/XSS registers, then it can
compute the same information those fields encode into the cpuid table
by simply walking 0xD sub-leaves 2-63 and calculating the XSAVE buffer
size manually based on the individual sizes encoded in those sub-leaves
(as per CPUID spec).

All entries for 0xD subleaves 0 through 1, with different XCR0_IN/XSS_IN
values, can then be treated as being identical, since the only thing
that changes from a sub-leaf entry with one XCR0_IN/XSS_IN combination to
verses another is total XSAVE buffer size, which the guest doesn't need,
since it is computing them instead by summing up the 0xD subleaves 2-63.

Requiring these to be 0 places constraints on the hypervisor that are at
odds with the current firmware ABI, so that's not enforced here, but the
guest code should work regardless of how the hypervisor chooses to use
XCR0_IN/XSS_IN.

> 
> > where the size of the buffers
> > + * needed to store enabled XSAVE-saved features are calculated rather than
> > + * encoded in the CPUID table for each possible combination of XCR0_IN/XSS_IN
> > + * to save space.
> > + */
> > +struct snp_cpuid_fn {
> > +	u32 eax_in;
> > +	u32 ecx_in;
> > +	u64 __unused;
> > +	u64 __unused2;
> > +	u32 eax;
> > +	u32 ebx;
> > +	u32 ecx;
> > +	u32 edx;
> > +	u64 __reserved;
> 
> Ditto.

I was thinking a future hypervisor/spec might make use of this field for
new functionality, while still wanting to be backward-compatible with
existing guests, so it would be better to not enforce 0. The firmware
ABI does indeed document it as must-be-zero, by that seems to be more of
a constraint on what a hypervisor is currently allowed to place in the
CPUID table, rather than something the guest is meant to enforce/rely
on.

> 
> > +} __packed;
> > +
> > +/*
> > + * SEV-SNP CPUID table header, as defined by the SEV-SNP Firmware ABI,
> > + * Revision 0.9, Section 8.14.2.6. Also noted there is the SEV-SNP
> > + * firmware-enforced limit of 64 entries per CPUID table.
> > + */
> > +#define SNP_CPUID_COUNT_MAX 64
> > +
> > +struct snp_cpuid_info {
> > +	u32 count;
> > +	u32 __reserved1;
> > +	u64 __reserved2;
> > +	struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX];
> > +} __packed;
> > +
> >  /*
> >   * Since feature negotiation related variables are set early in the boot
> >   * process they must reside in the .data section so as not to be zeroed
> > @@ -23,6 +58,20 @@
> >   */
> >  static u16 ghcb_version __ro_after_init;
> >  
> > +/* Copy of the SNP firmware's CPUID page. */
> > +static struct snp_cpuid_info cpuid_info_copy __ro_after_init;
> > +static bool snp_cpuid_initialized __ro_after_init;
> > +
> > +/*
> > + * These will be initialized based on CPUID table so that non-present
> > + * all-zero leaves (for sparse tables) can be differentiated from
> > + * invalid/out-of-range leaves. This is needed since all-zero leaves
> > + * still need to be post-processed.
> > + */
> > +u32 cpuid_std_range_max __ro_after_init;
> > +u32 cpuid_hyp_range_max __ro_after_init;
> > +u32 cpuid_ext_range_max __ro_after_init;
> 
> All of them: static.
> 
> >  static bool __init sev_es_check_cpu_features(void)
> >  {
> >  	if (!has_cpuflag(X86_FEATURE_RDRAND)) {
> > @@ -246,6 +295,244 @@ static int sev_cpuid_hv(u32 func, u32 subfunc, u32 *eax, u32 *ebx,
> >  	return 0;
> >  }
> >  
> > +static const struct snp_cpuid_info *
> 
> No need for that linebreak here.
> 
> > +snp_cpuid_info_get_ptr(void)
> > +{
> > +	void *ptr;
> > +
> > +	/*
> > +	 * This may be called early while still running on the initial identity
> > +	 * mapping. Use RIP-relative addressing to obtain the correct address
> > +	 * in both for identity mapping and after switch-over to kernel virtual
> > +	 * addresses.
> > +	 */
> 
> Put that comment over the function name.
> 
> And yah, that probably works but eww.

Yah... originally there was a cpuid table ptr that was initialized for
identity-mapping early on, then updated later after switch-over, but it
required an additional init routine or callback later in boot, which
seemed at odds with the goal of initializing everything in one spot, so
I switched to using this approach to avoid needed to re-introduce having
multiple stages of initialization throughout boot.

> 
> > +	asm ("lea cpuid_info_copy(%%rip), %0"
> > +	     : "=r" (ptr)
> 
> Why not "=g" and let the compiler decide?

I mainly re-used existing code from sme_enable() here, but I'll check
on this.

> 
> > +	     : "p" (&cpuid_info_copy));
> > +
> > +	return ptr;
> > +}
> > +
> > +static inline bool snp_cpuid_active(void)
> > +{
> > +	return snp_cpuid_initialized;
> > +}
> 
> That looks useless. That variable snp_cpuid_initialized either gets set
> or the guest terminates, so practically, if the guest is still running,
> you can assume SNP CPUID is properly initialized.

snp_cpuid_info_create() (which sets snp_cpuid_initialized) only gets
called if firmware indicates this is an SNP guests (via the cc_blob), but
the #VC handler still needs to know whether or not it should use the SNP
CPUID table still SEV-ES will still make use of it, so it uses
snp_cpuid_active() to make that determination.

Previous versions of the series basically did:

  snp_cpuid_active():
    return snp_cpuid_table_ptr != NULL;

but now that the above snp_cpuid_info_get_ptr() accessor is used instead
of storing an actual pointer somewhere, a new variable is needed to
track that, which why snp_cpuid_initialized was introduced here.

> 
> > +static int snp_cpuid_calc_xsave_size(u64 xfeatures_en, u32 base_size,
> > +				     u32 *xsave_size, bool compacted)
> > +{
> > +	const struct snp_cpuid_info *cpuid_info = snp_cpuid_info_get_ptr();
> > +	u32 xsave_size_total = base_size;
> > +	u64 xfeatures_found = 0;
> > +	int i;
> > +
> > +	for (i = 0; i < cpuid_info->count; i++) {
> > +		const struct snp_cpuid_fn *fn = &cpuid_info->fn[i];
> > +
> > +		if (!(fn->eax_in == 0xD && fn->ecx_in > 1 && fn->ecx_in < 64))
> > +			continue;
> 
> I guess that test can be as simple as
> 
> 		if (fn->eax_in != 0xd)
> 			continue;
> 
> or why do you wanna check ECX too? Funky values coming from the CPUID
> page?

This code is calculating the total XSAVE buffer size for whatever
features are enabled by the guest's XCR0/XSS registers. Those feature
bits correspond to the 0xD subleaves 2-63, which advertise the buffer
size for each particular feature. So that check needs to ignore anything
outside that range (including 0xD subleafs 0 and 1, which would normally
provide this total size dynamically based on current values of XCR0/XSS,
but here are instead calculated "manually" since we are not relying on
the XCR0_IN/XSS_IN fields in the table (due to the reasons mentioned
earlier in this thread).

> 
> > +		if (!(xfeatures_en & (BIT_ULL(fn->ecx_in))))
> > +			continue;
> > +		if (xfeatures_found & (BIT_ULL(fn->ecx_in)))
> > +			continue;
> 
> What is that test for? Don't tell me the CPUID page allows duplicate
> entries...

Not duplicate entries (though there's technically nothing in the spec
that says you can't), but I was more concerned here with multiple
entries corresponding to different combination of XCR0_IN/XSS_IN.
There's no good reason for a hypervisor to use those fields for anything
other than 0xD subleaves 0 and 1, but a hypervisor could in theory encode
1 "duplicate" sub-leaf for each possible combination of XCR0_IN/XSS_IN,
similar to what it might do for subleaves 0 and 1, and not violate the
spec.

The current spec is a bit open-ended in some of these areas so the guest
code is trying to be as agnostic as possible to the underlying implementation
so there's less chance of breakage running on one hypervisor verses
another. We're working on updating the spec to encourage better
interoperability, but that would likely only be enforceable for future
firmware versions/guests.

> 
> > +		xfeatures_found |= (BIT_ULL(fn->ecx_in));
> > +
> > +		if (compacted)
> > +			xsave_size_total += fn->eax;
> > +		else
> > +			xsave_size_total = max(xsave_size_total,
> > +					       fn->eax + fn->ebx);
> > +	}
> > +
> > +	/*
> > +	 * Either the guest set unsupported XCR0/XSS bits, or the corresponding
> > +	 * entries in the CPUID table were not present. This is not a valid
> > +	 * state to be in.
> > +	 */
> > +	if (xfeatures_found != (xfeatures_en & GENMASK_ULL(63, 2)))
> > +		return -EINVAL;
> > +
> > +	*xsave_size = xsave_size_total;
> > +
> > +	return 0;
> 
> This function can return xsave_size in the success case and negative in
> the error case so you don't need the IO param *xsave_size.

Makes sense.

> 
> > +}
> > +
> > +static void snp_cpuid_hv(u32 func, u32 subfunc, u32 *eax, u32 *ebx, u32 *ecx,
> > +			 u32 *edx)
> > +{
> > +	/*
> > +	 * MSR protocol does not support fetching indexed subfunction, but is
> > +	 * sufficient to handle current fallback cases. Should that change,
> > +	 * make sure to terminate rather than ignoring the index and grabbing
> > +	 * random values. If this issue arises in the future, handling can be
> > +	 * added here to use GHCB-page protocol for cases that occur late
> > +	 * enough in boot that GHCB page is available.
> > +	 */
> > +	if (cpuid_function_is_indexed(func) && subfunc)
> > +		sev_es_terminate(1, GHCB_TERM_CPUID_HV);
> > +
> > +	if (sev_cpuid_hv(func, 0, eax, ebx, ecx, edx))
> > +		sev_es_terminate(1, GHCB_TERM_CPUID_HV);
> > +}
> > +
> > +static bool
> > +snp_cpuid_find_validated_func(u32 func, u32 subfunc, u32 *eax, u32 *ebx,
> 
> snp_cpuid_get_validated_func()
> 
> > +			      u32 *ecx, u32 *edx)
> > +{
> > +	const struct snp_cpuid_info *cpuid_info = snp_cpuid_info_get_ptr();
> > +	int i;
> > +
> > +	for (i = 0; i < cpuid_info->count; i++) {
> > +		const struct snp_cpuid_fn *fn = &cpuid_info->fn[i];
> > +
> > +		if (fn->eax_in != func)
> > +			continue;
> > +
> > +		if (cpuid_function_is_indexed(func) && fn->ecx_in != subfunc)
> > +			continue;
> > +
> > +		*eax = fn->eax;
> > +		*ebx = fn->ebx;
> > +		*ecx = fn->ecx;
> > +		*edx = fn->edx;
> > +
> > +		return true;
> > +	}
> > +
> > +	return false;
> > +}
> > +
> > +static bool snp_cpuid_check_range(u32 func)
> > +{
> > +	if (func <= cpuid_std_range_max ||
> > +	    (func >= 0x40000000 && func <= cpuid_hyp_range_max) ||
> > +	    (func >= 0x80000000 && func <= cpuid_ext_range_max))
> > +		return true;
> > +
> > +	return false;
> > +}
> > +
> > +static int snp_cpuid_postprocess(u32 func, u32 subfunc, u32 *eax, u32 *ebx,
> > +				 u32 *ecx, u32 *edx)
> 
> I'm wondering if you could make everything a lot easier by doing
> 
> static int snp_cpuid_postprocess(struct cpuid_leaf *leaf)
> 
> and marshall around that struct cpuid_leaf which contains func, subfunc,
> e[abcd]x instead of dealing with 6 parameters.
> 
> Callers of snp_cpuid() can simply allocate it on their stack and hand it
> in and it is all in sev-shared.c so nicely self-contained...
> 
> ...

True, I could have snp_cpuid_find_validated_func() return a pointer to the
actual entry and pass that through to postprocess(). I'll see what that
looks like.

> 
> > +/*
> > + * Returns -EOPNOTSUPP if feature not enabled. Any other return value should be
> > + * treated as fatal by caller.
> > + */
> > +static int snp_cpuid(u32 func, u32 subfunc, u32 *eax, u32 *ebx, u32 *ecx,
> > +		     u32 *edx)
> > +{
> > +	if (!snp_cpuid_active())
> > +		return -EOPNOTSUPP;
> 
> And this becomes superfluous.
> 
> > +
> > +	if (!snp_cpuid_find_validated_func(func, subfunc, eax, ebx, ecx, edx)) {
> > +		/*
> > +		 * Some hypervisors will avoid keeping track of CPUID entries
> > +		 * where all values are zero, since they can be handled the
> > +		 * same as out-of-range values (all-zero). This is useful here
> > +		 * as well as it allows virtually all guest configurations to
> > +		 * work using a single SEV-SNP CPUID table.
> > +		 *
> > +		 * To allow for this, there is a need to distinguish between
> > +		 * out-of-range entries and in-range zero entries, since the
> > +		 * CPUID table entries are only a template that may need to be
> > +		 * augmented with additional values for things like
> > +		 * CPU-specific information during post-processing. So if it's
> > +		 * not in the table, but is still in the valid range, proceed
> > +		 * with the post-processing. Otherwise, just return zeros.
> > +		 */
> > +		*eax = *ebx = *ecx = *edx = 0;
> > +		if (!snp_cpuid_check_range(func))
> > +			return 0;
> 
> Do the check first and then assign.
> 
> > +	}
> > +
> > +	return snp_cpuid_postprocess(func, subfunc, eax, ebx, ecx, edx);
> > +}
> > +
> >  /*
> >   * Boot VC Handler - This is the first VC handler during boot, there is no GHCB
> >   * page yet, so it only supports the MSR based communication with the
> > @@ -253,16 +540,26 @@ static int sev_cpuid_hv(u32 func, u32 subfunc, u32 *eax, u32 *ebx,
> >   */
> >  void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
> >  {
> > +	unsigned int subfn = lower_bits(regs->cx, 32);
> >  	unsigned int fn = lower_bits(regs->ax, 32);
> >  	u32 eax, ebx, ecx, edx;
> > +	int ret;
> >  
> >  	/* Only CPUID is supported via MSR protocol */
> >  	if (exit_code != SVM_EXIT_CPUID)
> >  		goto fail;
> >  
> > +	ret = snp_cpuid(fn, subfn, &eax, &ebx, &ecx, &edx);
> > +	if (ret == 0)
> 
> 	if (!ret)
> 
> > +		goto cpuid_done;
> > +
> > +	if (ret != -EOPNOTSUPP)
> > +		goto fail;
> > +
> >  	if (sev_cpuid_hv(fn, 0, &eax, &ebx, &ecx, &edx))
> >  		goto fail;
> >  
> > +cpuid_done:
> >  	regs->ax = eax;
> >  	regs->bx = ebx;
> >  	regs->cx = ecx;
> > @@ -557,12 +854,35 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
> >  	return ret;
> >  }
> >  
> > +static int vc_handle_cpuid_snp(struct pt_regs *regs)
> > +{
> > +	u32 eax, ebx, ecx, edx;
> > +	int ret;
> > +
> > +	ret = snp_cpuid(regs->ax, regs->cx, &eax, &ebx, &ecx, &edx);
> > +	if (ret == 0) {
> 
> 	if (!ret)
> 
> > +		regs->ax = eax;
> > +		regs->bx = ebx;
> > +		regs->cx = ecx;
> > +		regs->dx = edx;
> > +	}
> > +
> > +	return ret;
> > +}
> > +
> >  static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
> >  				      struct es_em_ctxt *ctxt)
> >  {
> >  	struct pt_regs *regs = ctxt->regs;
> >  	u32 cr4 = native_read_cr4();
> >  	enum es_result ret;
> > +	int snp_cpuid_ret;
> > +
> > +	snp_cpuid_ret = vc_handle_cpuid_snp(regs);
> > +	if (snp_cpuid_ret == 0)
> 
> 	if (! ... - you get the idea.

Thanks for the suggestions, will get these all implemented unless
otherwise noted above.

-Mike

> 
> 
> 
> -- 
> Regards/Gruss,
>     Boris.
> 
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpeople.kernel.org%2Ftglx%2Fnotes-about-netiquette&amp;data=04%7C01%7CMichael.Roth%40amd.com%7Cbdeae41f29e841b9c98108d9d696dd53%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637776765726982239%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=u2W1ZARL%2Fn%2FgQZb7KXvnpR3o8%2FMUw7GoXwrIY19xFcY%3D&amp;reserved=0
Michael Roth Jan. 18, 2022, 4:35 a.m. UTC | #2
On Fri, Jan 14, 2022 at 05:13:30PM +0100, Borislav Petkov wrote:
> On Thu, Jan 13, 2022 at 10:39:13AM -0600, Michael Roth wrote:
> > I was thinking a future hypervisor/spec might make use of this field for
> > new functionality, while still wanting to be backward-compatible with
> > existing guests, so it would be better to not enforce 0. The firmware
> > ABI does indeed document it as must-be-zero,
> 
> Maybe there's a good reason for that.
> 
> > by that seems to be more of a constraint on what a hypervisor is
> > currently allowed to place in the CPUID table, rather than something
> > the guest is meant to enforce/rely on.
> 
> So imagine whoever creates those, starts putting stuff in those fields.
> Then, in the future, the spec decides to rename those reserved/unused
> fields into something else and starts putting concrete values in them.
> I.e., it starts using them for something.
> 
> But, now the spec breaks existing usage because those fields are already
> in use. And by then it doesn't matter what the spec says - existing
> usage makes it an ABI.
> 
> So we start doing expensive and ugly workarounds just so that we don't
> break the old, undocumented use which the spec simply silently allowed,
> and accomodate that new feature the spec adds.
> 
> So no, what you're thinking is a bad bad idea.

I can certainly see that argument. I'll add checks to enforce these
cases. If it breaks an existing hypervisor implementation, it's probably
better to have that happen early based on this initial reference
implementation rather than down the road when we actually need these
fields for something.

> 
> > snp_cpuid_info_create() (which sets snp_cpuid_initialized) only gets
> > called if firmware indicates this is an SNP guests (via the cc_blob), but
> > the #VC handler still needs to know whether or not it should use the SNP
> > CPUID table still SEV-ES will still make use of it, so it uses
> > snp_cpuid_active() to make that determination.
> 
> So I went and applied the rest of the series. And I think you mean
> do_vc_no_ghcb() and it doing snp_cpuid().

Yes that's correct.

> 
> Then, looking at sev_enable() and it calling snp_init(), you fail
> further init if there's any discrepancy in the supplied data - CPUID,
> SEV status MSR, etc.
> 
> So, practically, what you wanna test in all those places is whether
> you're a SNP guest or not. Which we already have:
> 
> 	sev_status & MSR_AMD64_SEV_SNP_ENABLED
> 
> so, unless I'm missing something, you don't need yet another
> <bla>_active() helper.

Unfortunately, in sev_enable(), between the point where snp_init() is
called, and sev_status is actually set, there are a number of cpuid
intructions which will make use of do_vc_no_ghcb() prior to sev_status
being set (and it needs to happen in that order to set sev_status
appropriately). After that point, snp_cpuid_active() would no longer be
necessary, but during that span some indicator is needed in case this
is just an SEV-ES guest trigger cpuid #VCs.

> 
> > This code is calculating the total XSAVE buffer size for whatever
> > features are enabled by the guest's XCR0/XSS registers. Those feature
> > bits correspond to the 0xD subleaves 2-63, which advertise the buffer
> > size for each particular feature. So that check needs to ignore anything
> > outside that range (including 0xD subleafs 0 and 1, which would normally
> > provide this total size dynamically based on current values of XCR0/XSS,
> > but here are instead calculated "manually" since we are not relying on
> > the XCR0_IN/XSS_IN fields in the table (due to the reasons mentioned
> > earlier in this thread).
> 
> Yah, the gist of that needs to be as a comment of that line as it is not
> obvious (at least to me).
> 
> > Not duplicate entries (though there's technically nothing in the spec
> > that says you can't), but I was more concerned here with multiple
> > entries corresponding to different combination of XCR0_IN/XSS_IN.
> > There's no good reason for a hypervisor to use those fields for anything
> > other than 0xD subleaves 0 and 1, but a hypervisor could in theory encode
> > 1 "duplicate" sub-leaf for each possible combination of XCR0_IN/XSS_IN,
> > similar to what it might do for subleaves 0 and 1, and not violate the
> > spec.
> 
> 
> Ditto. Also a comment ontop please.

Will do.

> 
> > The current spec is a bit open-ended in some of these areas so the guest
> > code is trying to be as agnostic as possible to the underlying implementation
> > so there's less chance of breakage running on one hypervisor verses
> > another. We're working on updating the spec to encourage better
> > interoperability, but that would likely only be enforceable for future
> > firmware versions/guests.
> 
> This has the same theoretical problem as the reserved/unused fields. If
> you don't enforce it, people will do whatever and once it is implemented
> in hypervisors and it has become an ABI, you can't change it anymore.
> 
> So I'd very strongly suggest you tighten in up upfront and only allow
> stuff later, when it makes sense. Not the other way around.

Yah, I was trying to avoid causing issues for other early
implementations trying to make do with the current open-ended spec, but
that's probably a losing battle and it's probably better to try to get
everyone using the proposed reference implementation early on. I'll tighten
the code up in this regard and also send a new version of reference
implementation document to SNP mailing list for awareness.

> 
> Thx.
> 
> -- 
> Regards/Gruss,
>     Boris.
> 
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpeople.kernel.org%2Ftglx%2Fnotes-about-netiquette&amp;data=04%7C01%7Cmichael.roth%40amd.com%7Cb2fefc5c0458441d2c9508d9d778cf46%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637777736172270637%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=lCq1wBm8iyW1AOorhL35om6gU9GEypzCksiFZUI3H%2Fk%3D&amp;reserved=0
Michael Roth Jan. 18, 2022, 2:23 p.m. UTC | #3
On Tue, Jan 18, 2022 at 03:02:48PM +0100, Borislav Petkov wrote:
> On Mon, Jan 17, 2022 at 10:35:21PM -0600, Michael Roth wrote:
> > Unfortunately, in sev_enable(), between the point where snp_init() is
> > called, and sev_status is actually set, there are a number of cpuid
> > intructions which will make use of do_vc_no_ghcb() prior to sev_status
> > being set (and it needs to happen in that order to set sev_status
> > appropriately). After that point, snp_cpuid_active() would no longer be
> > necessary, but during that span some indicator is needed in case this
> > is just an SEV-ES guest trigger cpuid #VCs.
> 
> You mean testing what snp_cpuid_info_create() set up is not enough?
> 
> diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
> index 7bc7e297f88c..17cfe804bad3 100644
> --- a/arch/x86/kernel/sev-shared.c
> +++ b/arch/x86/kernel/sev-shared.c
> @@ -523,7 +523,9 @@ static int snp_cpuid_postprocess(u32 func, u32 subfunc, u32 *eax, u32 *ebx,
>  static int snp_cpuid(u32 func, u32 subfunc, u32 *eax, u32 *ebx, u32 *ecx,
>  		     u32 *edx)
>  {
> -	if (!snp_cpuid_active())
> +	const struct snp_cpuid_info *c = snp_cpuid_info_get_ptr();
> +
> +	if (!c->count)
>  		return -EOPNOTSUPP;
>  
>  	if (!snp_cpuid_find_validated_func(func, subfunc, eax, ebx, ecx, edx)) {

snp_cpuid_info_get_ptr() will always return non-NULL, since it's a
pointer to the local copy of the cpuid page. But I can probably re-work
it slightly that so snp_cpuid_info_get_ptr() does the same check that
snp_cpuid_active() does, and have it return NULL if the copy hasn't been
initialized, if that seems preferable to the separate snp_cpuid_active()
function.

> 
> ---
> 
> Btw, all those
> 
>         /* SEV-SNP CPUID table should be set up now. */
>         if (!snp_cpuid_active())
>                 sev_es_terminate(1, GHCB_TERM_CPUID);
> 
> after snp_cpuid_info_create() has returned are useless either. If that
> function returns, you know you're good to go wrt SNP.

It seemed like a good thing to assert in case something slipped in later
that tried to use snp_cpuid() without the table being initialized, but
if I implement things the way you suggested above,
snp_cpuid_info_get_ptr() will return NULL in that case, so we get that
assurance for free. That does sound cleaner.

> 
> Thx.
> 
> -- 
> Regards/Gruss,
>     Boris.
> 
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpeople.kernel.org%2Ftglx%2Fnotes-about-netiquette&amp;data=04%7C01%7Cmichael.roth%40amd.com%7Cb8be348c4db84954006708d9da8b3820%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637781113782595576%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=HszNjYntQNNI827LNK4H8Tpx0vhpICo7y3FCzIvhNtc%3D&amp;reserved=0
Michael Roth Jan. 18, 2022, 2:32 p.m. UTC | #4
On Tue, Jan 18, 2022 at 08:23:45AM -0600, Michael Roth wrote:
> On Tue, Jan 18, 2022 at 03:02:48PM +0100, Borislav Petkov wrote:
> > On Mon, Jan 17, 2022 at 10:35:21PM -0600, Michael Roth wrote:
> > > Unfortunately, in sev_enable(), between the point where snp_init() is
> > > called, and sev_status is actually set, there are a number of cpuid
> > > intructions which will make use of do_vc_no_ghcb() prior to sev_status
> > > being set (and it needs to happen in that order to set sev_status
> > > appropriately). After that point, snp_cpuid_active() would no longer be
> > > necessary, but during that span some indicator is needed in case this
> > > is just an SEV-ES guest trigger cpuid #VCs.
> > 
> > You mean testing what snp_cpuid_info_create() set up is not enough?
> > 
> > diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
> > index 7bc7e297f88c..17cfe804bad3 100644
> > --- a/arch/x86/kernel/sev-shared.c
> > +++ b/arch/x86/kernel/sev-shared.c
> > @@ -523,7 +523,9 @@ static int snp_cpuid_postprocess(u32 func, u32 subfunc, u32 *eax, u32 *ebx,
> >  static int snp_cpuid(u32 func, u32 subfunc, u32 *eax, u32 *ebx, u32 *ecx,
> >  		     u32 *edx)
> >  {
> > -	if (!snp_cpuid_active())
> > +	const struct snp_cpuid_info *c = snp_cpuid_info_get_ptr();
> > +
> > +	if (!c->count)
> >  		return -EOPNOTSUPP;
> >  
> >  	if (!snp_cpuid_find_validated_func(func, subfunc, eax, ebx, ecx, edx)) {
> 
> snp_cpuid_info_get_ptr() will always return non-NULL, since it's a
> pointer to the local copy of the cpuid page. But I can probably re-work

Doh, misread your patch. Yes I think checking the count would also work,
since a valid table should be non-zero.
Michael Roth Jan. 18, 2022, 2:37 p.m. UTC | #5
On Tue, Jan 18, 2022 at 08:32:38AM -0600, Michael Roth wrote:
> On Tue, Jan 18, 2022 at 08:23:45AM -0600, Michael Roth wrote:
> > On Tue, Jan 18, 2022 at 03:02:48PM +0100, Borislav Petkov wrote:
> > > On Mon, Jan 17, 2022 at 10:35:21PM -0600, Michael Roth wrote:
> > > > Unfortunately, in sev_enable(), between the point where snp_init() is
> > > > called, and sev_status is actually set, there are a number of cpuid
> > > > intructions which will make use of do_vc_no_ghcb() prior to sev_status
> > > > being set (and it needs to happen in that order to set sev_status
> > > > appropriately). After that point, snp_cpuid_active() would no longer be
> > > > necessary, but during that span some indicator is needed in case this
> > > > is just an SEV-ES guest trigger cpuid #VCs.
> > > 
> > > You mean testing what snp_cpuid_info_create() set up is not enough?
> > > 
> > > diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
> > > index 7bc7e297f88c..17cfe804bad3 100644
> > > --- a/arch/x86/kernel/sev-shared.c
> > > +++ b/arch/x86/kernel/sev-shared.c
> > > @@ -523,7 +523,9 @@ static int snp_cpuid_postprocess(u32 func, u32 subfunc, u32 *eax, u32 *ebx,
> > >  static int snp_cpuid(u32 func, u32 subfunc, u32 *eax, u32 *ebx, u32 *ecx,
> > >  		     u32 *edx)
> > >  {
> > > -	if (!snp_cpuid_active())
> > > +	const struct snp_cpuid_info *c = snp_cpuid_info_get_ptr();
> > > +
> > > +	if (!c->count)
> > >  		return -EOPNOTSUPP;
> > >  
> > >  	if (!snp_cpuid_find_validated_func(func, subfunc, eax, ebx, ecx, edx)) {
> > 
> > snp_cpuid_info_get_ptr() will always return non-NULL, since it's a
> > pointer to the local copy of the cpuid page. But I can probably re-work
> 
> Doh, misread your patch. Yes I think checking the count would also work,
> since a valid table should be non-zero.

Actually, no, because doing that would provide hypervisor a means to
effectively disable CPUID page for an SNP guest by provided a table with
count == 0, which needs to be guarded against.

But can still implement something similar by having snp_cpuid_info_get_ptr()
return NULL if local copy of cpuid page hasn't been initialized.
Michael Roth Jan. 18, 2022, 5:20 p.m. UTC | #6
On Tue, Jan 18, 2022 at 05:34:49PM +0100, Borislav Petkov wrote:
> On Tue, Jan 18, 2022 at 08:37:30AM -0600, Michael Roth wrote:
> > Actually, no, because doing that would provide hypervisor a means to
> > effectively disable CPUID page for an SNP guest by provided a table with
> > count == 0, which needs to be guarded against.
> 
> Err, I'm confused.
> 
> Isn't that "SEV-SNP guests will be provided the location of special
> 'secrets' 'CPUID' pages via the Confidential Computing blob..." and the
> HV has no say in there?
> 
> Why does the HV provide the CPUID page?

The HV fills out the initial contents of the CPUID page, which includes
the count. SNP/PSP firmware will validate the contents the HV tries to put
in the initial page, but does not currently enforce that the 'count' field
is non-zero. So we can't rely on the 'count' field as an indicator of
whether or not the CPUID page is active, we need to rely on the presence
of the ccblob as the true indicator, then treat a non-zero 'count' field
as an invalid state.

I think we discussed this to some extent in the past. The following
document was added to clarify the security model for CPUID page:

https://lore.kernel.org/lkml/20211210154332.11526-29-brijesh.singh@amd.com/

> 
> And when I read "secrets page" I think, encrypted/signed and given
> directly to the guest, past the HV which cannot even touch it.

The CPUID page is also encrypted, but its initial contents come from the
HV, which are then passed through the PSP for initial validation before
being placed in the CPUID page. But the count==0 case is not disallowed
by the PSP firmware, so can't be relied upon as a means to indicate that
the CPUID page is not active.

> 
> Hmmm.
> 
> -- 
> Regards/Gruss,
>     Boris.
> 
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpeople.kernel.org%2Ftglx%2Fnotes-about-netiquette&amp;data=04%7C01%7Cmichael.roth%40amd.com%7C9e8f64f998744605658608d9daa073ee%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637781205020570217%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=26Fkw4FJ9jOLVRW7SMs6IWYyaY5gO8iUfdm3x4HDaJk%3D&amp;reserved=0
Michael Roth Jan. 18, 2022, 6:49 p.m. UTC | #7
On Tue, Jan 18, 2022 at 06:41:16PM +0100, Borislav Petkov wrote:
> On Tue, Jan 18, 2022 at 11:20:43AM -0600, Michael Roth wrote:
> > The HV fills out the initial contents of the CPUID page, which includes
> > the count. SNP/PSP firmware will validate the contents the HV tries to put
> > in the initial page, but does not currently enforce that the 'count' field
> > is non-zero.
> 
> So if the HV sets count to 0, then the PSP can validate all it wants but
> you basically don't have a CPUID page. And that's a pretty easy way to
> defeat it, if you ask me.
> 
> So, if it is too late to change this, I guess the only way out of here
> is to terminate the guest on count == 0.

Right, which is already enforced as part of snp_cpuid_info_create(). So
snp_cpuid_info->count will always be non-zero for SNP guests...

Er... so I suppose we *could* use snp_cpuid_info->count==0 as an indicator
that the cpuid page isn't enabled afterall...since if this was an SNP guest
then count==0 would've caused it to terminate...

Sorry I missed that, early versions of the code used count==0 as
indicator to bypass CPUID page before we realized that wasn't safe, and
I'd avoided relying on count==0 for anything since then. But in the
current code it should work, since count==0 causes SNP guests to
terminate, so a running guest with count==0 is clearly non-SNP.

> 
> And regardless, what if the HV fakes the count - how do you figure out
> what the proper count is? You go and read the whole CPUID page and try
> to make sense of what's there, even beyond the "last" function leaf.

The current code trusts the count value, as long as it is within the
bounds of the CPUID page. If the hypervisor provides a count that is
higher or lower than the number of entries added to the table, the PSP
will fail the guest launch.

count==0 is sort of a special case because of the reasons above, and
since it is never a valid CPUID configuration, so makes sense to
guard against.


> 
> > So we can't rely on the 'count' field as an indicator of whether or
> > not the CPUID page is active, we need to rely on the presence of the
> > ccblob as the true indicator, then treat a non-zero 'count' field as
> > an invalid state.
> 
> treat a non-zero count field as invalid?
> 
> You mean, "a zero count" maybe...

Yes, sorry for the confusion.

> 
> But see above, how do you check whether the HV hasn't "hidden" some
> entries by modifying the count field?
> 
> Either I'm missing something or this sounds really weird...

Yes, that's my fault. count must match the actual number of entries in
the table in all cases. If count==0 then there must also be no entries
in the table. count==0 is only special in that code might erroneously
decide to treat it as an indicator that cpuid table isn't enabled at
all, but since that case causes termination it should actually be ok.

Though I wonder if we should do something like this to still keep
callers from relying on checking count==0 directly:

  static const struct snp_cpuid_info *
  snp_cpuid_info_get_ptr(void)
  {
          const struct snp_cpuid_info *cpuid_info;
          void *ptr;
  
          /*
           * This may be called early while still running on the initial identity
           * mapping. Use RIP-relative addressing to obtain the correct address
           * in both for identity mapping and after switch-over to kernel virtual
           * addresses.
           */
          asm ("lea cpuid_info_copy(%%rip), %0"
               : "=r" (ptr)
               : "p" (&cpuid_info_copy));
  
          cpuid_info = ptr;
          if (cpuid_info->count == 0)
                  return NULL
  
          return cpuid_info;
  }

Because then it's impossible for a caller to accidentally misconstrue
what count==0 means (0 entries? or cpuid table not present?), since the
table then simply becomes inaccessible for anything other than an SNP
guest, and callers just need a NULL check (and will get a free hint
(crash) if they don't).

> 
> -- 
> Regards/Gruss,
>     Boris.
> 
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpeople.kernel.org%2Ftglx%2Fnotes-about-netiquette&amp;data=04%7C01%7Cmichael.roth%40amd.com%7C56a8943d73484fda82ce08d9daa9bc0c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637781244848502186%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=2QO85fJBiZt2opWRWX%2FGb5LPt4How5cuAt4UJzAiQsg%3D&amp;reserved=0
Michael Roth Jan. 19, 2022, 1:18 a.m. UTC | #8
On Tue, Jan 18, 2022 at 12:49:30PM -0600, Michael Roth wrote:
> On Tue, Jan 18, 2022 at 06:41:16PM +0100, Borislav Petkov wrote:
> > On Tue, Jan 18, 2022 at 11:20:43AM -0600, Michael Roth wrote:
> > > The HV fills out the initial contents of the CPUID page, which includes
> > > the count. SNP/PSP firmware will validate the contents the HV tries to put
> > > in the initial page, but does not currently enforce that the 'count' field
> > > is non-zero.
> > 
> > And regardless, what if the HV fakes the count - how do you figure out
> > what the proper count is? You go and read the whole CPUID page and try
> > to make sense of what's there, even beyond the "last" function leaf.
> 
> 

<snip>

> > 
> > But see above, how do you check whether the HV hasn't "hidden" some
> > entries by modifying the count field?
> > 
> > Either I'm missing something or this sounds really weird...
> 
> ...count must match the actual number of entries in the table in all
> cases.

Turns out in my testing earlier there was a separate check that was
causing the PSP to fail, so I re-tested the behavior, and things are
actually a bit more interesting, but nothing too concerning:

If 'fake_count'/'reported_count' is greater than the actual number of
entries in the table, 'actual_count', then all table entries up to
'fake_count' will also need to pass validation. Generally the table
will be zero'd out initially, so those additional/bogus entries will
be interpreted as a CPUID leaves where all fields are 0. Unfortunately,
that's still considered a valid leaf, even if it's a duplicate of the
*actual* 0x0 leaf present earlier in the table. The current code will
handle this fine, since it scans the table in order, and uses the
valid 0x0 leaf earlier in the table.

This is isn't really a special case though, it falls under the general
category of a hypervisor inserting garbage entries that happen to pass
validation, but don't reflect values that a guest would normally see.
This will be detectable as part of guest owner attestation, since the
guest code is careful to guarantee that the values seen after boot,
once the attestation stage is reached, will be identical to the values
seen during boot, so if this sort of manipulation of CPUID values
occurred, the guest owner will notice this during attestation, and can
abort the boot at that point. The Documentation patch addresses this
in more detail.

If 'fake_count' is less than 'actual_count', then the PSP skips
validation for anything >= 'fake_count', and leaves them in the table.
That should also be fine though, since guest code should never exceed
'fake_count'/'reported_count', as that's a blatant violation of the
spec, and it doesn't make any sense for a guest to do this. This will
effectively 'hide' entries, but those resulting missing CPUID leaves
will be noticeable to the guest owner once attestation phase is
reached.

This does all highlight the need for some very thorough guidelines
on how a guest owner should implement their attestation checks for
cpuid, however. I think a section in the reference implementation
notes/document that covers this would be a good starting point. I'll
also check with the PSP team on tightening up some of these CPUID
page checks to rule out some of these possibilities in the future.

> in the table. count==0 is only special in that code might erroneously
> decide to treat it as an indicator that cpuid table isn't enabled at
> all, but since that case causes termination it should actually be ok.
> 
> Though I wonder if we should do something like this to still keep
> callers from relying on checking count==0 directly:
> 
>   static const struct snp_cpuid_info *
>   snp_cpuid_info_get_ptr(void)
>   {
>           const struct snp_cpuid_info *cpuid_info;
>           void *ptr;
>   
>           /*
>            * This may be called early while still running on the initial identity
>            * mapping. Use RIP-relative addressing to obtain the correct address
>            * in both for identity mapping and after switch-over to kernel virtual
>            * addresses.
>            */
>           asm ("lea cpuid_info_copy(%%rip), %0"
>                : "=r" (ptr)
>                : "p" (&cpuid_info_copy));
>   
>           cpuid_info = ptr;
>           if (cpuid_info->count == 0)
>                   return NULL
>   
>           return cpuid_info;
>   }

Nevermind, that doesn't work since snp_cpuid_info_get_ptr() is also called
by snp_cpuid_info_get_ptr() *prior* to initializing the table, so it ends
seeing cpuid->count==0 and fails right away. So your initial suggestion
of checking cpuid->count==0 at the call-sites to determine if the table
is enabled is probably the best option.

Sorry for the noise/confusion.
Michael Roth Jan. 19, 2022, 4:27 p.m. UTC | #9
On Wed, Jan 19, 2022 at 12:17:22PM +0100, Borislav Petkov wrote:
> On Tue, Jan 18, 2022 at 07:18:06PM -0600, Michael Roth wrote:
> > If 'fake_count'/'reported_count' is greater than the actual number of
> > entries in the table, 'actual_count', then all table entries up to
> > 'fake_count' will also need to pass validation. Generally the table
> > will be zero'd out initially, so those additional/bogus entries will
> > be interpreted as a CPUID leaves where all fields are 0. Unfortunately,
> > that's still considered a valid leaf, even if it's a duplicate of the
> > *actual* 0x0 leaf present earlier in the table. The current code will
> > handle this fine, since it scans the table in order, and uses the
> > valid 0x0 leaf earlier in the table.
> 
> I guess it would be prudent to have some warnings when enumerating those
> leafs and when the count index "goes off into the weeds", so to speak,
> and starts reading 0-CPUID entries. I.e., "dear guest owner, your HV is
> giving you a big lie: a weird/bogus CPUID leaf count..."
> 
> :-)

Ok, there's some sanity checks that happen a little later in boot via
snp_cpuid_check_status(), after printk is enabled, that reports some
basic details to dmesg like the number of entries in the table. I can
add some additional sanity checks to flag the above case (really,
all-zero entries never make sense, since CPUID 0x0 is supposed to report
the max standard-range CPUID leaf, and leaf 0x1 at least should always
be present). I'll print a warning for such cases, add maybe dump the
cpuid the table in that case so it can be examined more easily by
owner.

> 
> And lemme make sure I understand it: the ->count itself is not
> measured/encrypted because you want to be flexible here and supply
> different blobs with different CPUID leafs?

Yes, but to be clear it's the entire CPUID page, including the count,
that's not measured (though it is encrypted after passing PSP
validation). Probably the biggest reason is the logistics of having
untrusted cloud vendors provide a copy of the CPUID values they plan
to pass to the guest, since a new measurement would need to be
calculated for every new configuration (using different guest
cpuflags, SMP count, etc.), since those table values will need to be
made easily-accessible to guest owner for all these measurement
calculations, and they can't be trusted so each table would need to
be checked either manually or by some tooling that could be difficult
to implement unless it was something simple like "give me the expected
CPUID values and I'll check if the provided CPUID table agrees with
that".

At that point it's much easier for the guest owner to just check the
CPUID values directly against known good values for a particular
configuration as part of their attestation process and leave the
untrusted cloud vendor out of it completely. So not measuring the
CPUID page as part of SNP attestation allows for that flexibility.

> 
> > This is isn't really a special case though, it falls under the general
> > category of a hypervisor inserting garbage entries that happen to pass
> > validation, but don't reflect values that a guest would normally see.
> > This will be detectable as part of guest owner attestation, since the
> > guest code is careful to guarantee that the values seen after boot,
> > once the attestation stage is reached, will be identical to the values
> > seen during boot, so if this sort of manipulation of CPUID values
> > occurred, the guest owner will notice this during attestation, and can
> > abort the boot at that point. The Documentation patch addresses this
> > in more detail.
> 
> Yap, it is important this is properly explained there so that people can
> pay attention to during attestation.
> 
> > If 'fake_count' is less than 'actual_count', then the PSP skips
> > validation for anything >= 'fake_count', and leaves them in the table.
> > That should also be fine though, since guest code should never exceed
> > 'fake_count'/'reported_count', as that's a blatant violation of the
> > spec, and it doesn't make any sense for a guest to do this. This will
> > effectively 'hide' entries, but those resulting missing CPUID leaves
> > will be noticeable to the guest owner once attestation phase is
> > reached.
> 
> Noticeable because the guest owner did supply a CPUID table with X
> entries but the HV is reporting Y?

Or even more simply by the guest owner simply running 'cpuid -r -1' on
the guest after boot, and making sure all the expected entries are
present. If the HV manipulated the count to be lower, there would be
missing entries, if they manipulated it to be higher, then there would
either be extra duplicate entries at the end of the table (which the
#VC handler would ignore due to it using the first matching entry in
the table when doing lookups), or additional non-duplicate garbage
entries, which will show up in 'cpuid -r -1' as unexpected entries.

Really 'cpuid -r -1' is the guest owner/userspace view of things, so
some of these nuances about the table contents might be noteworthy,
but wouldn't actually affect guest behavior, which would be the main
thing attestation process should be concerned with.

> 
> If so, you can make this part of the attestation process: guest owners
> should always check the CPUID entries count to be of a certain value.
> 
> > This does all highlight the need for some very thorough guidelines
> > on how a guest owner should implement their attestation checks for
> > cpuid, however. I think a section in the reference implementation
> > notes/document that covers this would be a good starting point. I'll
> > also check with the PSP team on tightening up some of these CPUID
> > page checks to rule out some of these possibilities in the future.
> 
> Now you're starting to grow the right amount of paranoia - I'm glad I
> was able to sensitize you properly!
> 
> :-)))

Hehe =*D

Thanks!

-Mike
Michael Roth Jan. 27, 2022, 5:23 p.m. UTC | #10
On Wed, Jan 19, 2022 at 10:27:47AM -0600, Michael Roth wrote:
> On Wed, Jan 19, 2022 at 12:17:22PM +0100, Borislav Petkov wrote:
> > On Tue, Jan 18, 2022 at 07:18:06PM -0600, Michael Roth wrote:
> > > If 'fake_count'/'reported_count' is greater than the actual number of
> > > entries in the table, 'actual_count', then all table entries up to
> > > 'fake_count' will also need to pass validation. Generally the table
> > > will be zero'd out initially, so those additional/bogus entries will
> > > be interpreted as a CPUID leaves where all fields are 0. Unfortunately,
> > > that's still considered a valid leaf, even if it's a duplicate of the
> > > *actual* 0x0 leaf present earlier in the table. The current code will
> > > handle this fine, since it scans the table in order, and uses the
> > > valid 0x0 leaf earlier in the table.
> > 
> > I guess it would be prudent to have some warnings when enumerating those
> > leafs and when the count index "goes off into the weeds", so to speak,
> > and starts reading 0-CPUID entries. I.e., "dear guest owner, your HV is
> > giving you a big lie: a weird/bogus CPUID leaf count..."
> > 
> > :-)
> 

Sorry for the delay, this response got stuck in my mail queue apparently.

> Ok, there's some sanity checks that happen a little later in boot via
> snp_cpuid_check_status(), after printk is enabled, that reports some
> basic details to dmesg like the number of entries in the table. I can
> add some additional sanity checks to flag the above case (really,
> all-zero entries never make sense, since CPUID 0x0 is supposed to report
> the max standard-range CPUID leaf, and leaf 0x1 at least should always
> be present). I'll print a warning for such cases, add maybe dump the
> cpuid the table in that case so it can be examined more easily by
> owner.
> 
> > 
> > And lemme make sure I understand it: the ->count itself is not
> > measured/encrypted because you want to be flexible here and supply
> > different blobs with different CPUID leafs?
> 
> Yes, but to be clear it's the entire CPUID page, including the count,
> that's not measured (though it is encrypted after passing PSP
> validation). Probably the biggest reason is the logistics of having
> untrusted cloud vendors provide a copy of the CPUID values they plan
> to pass to the guest, since a new measurement would need to be
> calculated for every new configuration (using different guest
> cpuflags, SMP count, etc.), since those table values will need to be
> made easily-accessible to guest owner for all these measurement
> calculations, and they can't be trusted so each table would need to
> be checked either manually or by some tooling that could be difficult
> to implement unless it was something simple like "give me the expected
> CPUID values and I'll check if the provided CPUID table agrees with
> that".
> 
> At that point it's much easier for the guest owner to just check the
> CPUID values directly against known good values for a particular
> configuration as part of their attestation process and leave the
> untrusted cloud vendor out of it completely. So not measuring the
> CPUID page as part of SNP attestation allows for that flexibility.
> 
> > 
> > > This is isn't really a special case though, it falls under the general
> > > category of a hypervisor inserting garbage entries that happen to pass
> > > validation, but don't reflect values that a guest would normally see.
> > > This will be detectable as part of guest owner attestation, since the
> > > guest code is careful to guarantee that the values seen after boot,
> > > once the attestation stage is reached, will be identical to the values
> > > seen during boot, so if this sort of manipulation of CPUID values
> > > occurred, the guest owner will notice this during attestation, and can
> > > abort the boot at that point. The Documentation patch addresses this
> > > in more detail.
> > 
> > Yap, it is important this is properly explained there so that people can
> > pay attention to during attestation.
> > 
> > > If 'fake_count' is less than 'actual_count', then the PSP skips
> > > validation for anything >= 'fake_count', and leaves them in the table.
> > > That should also be fine though, since guest code should never exceed
> > > 'fake_count'/'reported_count', as that's a blatant violation of the
> > > spec, and it doesn't make any sense for a guest to do this. This will
> > > effectively 'hide' entries, but those resulting missing CPUID leaves
> > > will be noticeable to the guest owner once attestation phase is
> > > reached.
> > 
> > Noticeable because the guest owner did supply a CPUID table with X
> > entries but the HV is reporting Y?
> 
> Or even more simply by the guest owner simply running 'cpuid -r -1' on
> the guest after boot, and making sure all the expected entries are
> present. If the HV manipulated the count to be lower, there would be
> missing entries, if they manipulated it to be higher, then there would
> either be extra duplicate entries at the end of the table (which the
> #VC handler would ignore due to it using the first matching entry in
> the table when doing lookups), or additional non-duplicate garbage
> entries, which will show up in 'cpuid -r -1' as unexpected entries.
> 
> Really 'cpuid -r -1' is the guest owner/userspace view of things, so
> some of these nuances about the table contents might be noteworthy,
> but wouldn't actually affect guest behavior, which would be the main
> thing attestation process should be concerned with.
> 
> > 
> > If so, you can make this part of the attestation process: guest owners
> > should always check the CPUID entries count to be of a certain value.
> > 
> > > This does all highlight the need for some very thorough guidelines
> > > on how a guest owner should implement their attestation checks for
> > > cpuid, however. I think a section in the reference implementation
> > > notes/document that covers this would be a good starting point. I'll
> > > also check with the PSP team on tightening up some of these CPUID
> > > page checks to rule out some of these possibilities in the future.
> > 
> > Now you're starting to grow the right amount of paranoia - I'm glad I
> > was able to sensitize you properly!
> > 
> > :-)))
> 
> Hehe =*D
> 
> Thanks!
> 
> -Mike
diff mbox series

Patch

diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 348f7711c3ea..3514feb5b226 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -20,6 +20,7 @@ 
 #include <asm/fpu/xcr.h>
 #include <asm/ptrace.h>
 #include <asm/svm.h>
+#include <asm/cpuid.h>
 
 #include "error.h"
 
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index 38c14601ae4a..673e6778194b 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -152,6 +152,8 @@  struct snp_psc_desc {
 #define GHCB_TERM_PSC			1	/* Page State Change failure */
 #define GHCB_TERM_PVALIDATE		2	/* Pvalidate failure */
 #define GHCB_TERM_NOT_VMPL0		3	/* SNP guest is not running at VMPL-0 */
+#define GHCB_TERM_CPUID			4	/* CPUID-validation failure */
+#define GHCB_TERM_CPUID_HV		5	/* CPUID failure during hypervisor fallback */
 
 #define GHCB_RESP_CODE(v)		((v) & GHCB_MSR_INFO_MASK)
 
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index d89481b31022..dabb425498e0 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -14,6 +14,41 @@ 
 #define has_cpuflag(f)	boot_cpu_has(f)
 #endif
 
+/*
+ * Individual entries of the SEV-SNP CPUID table, as defined by the SEV-SNP
+ * Firmware ABI, Revision 0.9, Section 7.1, Table 14. Note that the XCR0_IN
+ * and XSS_IN are denoted here as __unused/__unused2, since they are not
+ * needed for the current guest implementation, where the size of the buffers
+ * needed to store enabled XSAVE-saved features are calculated rather than
+ * encoded in the CPUID table for each possible combination of XCR0_IN/XSS_IN
+ * to save space.
+ */
+struct snp_cpuid_fn {
+	u32 eax_in;
+	u32 ecx_in;
+	u64 __unused;
+	u64 __unused2;
+	u32 eax;
+	u32 ebx;
+	u32 ecx;
+	u32 edx;
+	u64 __reserved;
+} __packed;
+
+/*
+ * SEV-SNP CPUID table header, as defined by the SEV-SNP Firmware ABI,
+ * Revision 0.9, Section 8.14.2.6. Also noted there is the SEV-SNP
+ * firmware-enforced limit of 64 entries per CPUID table.
+ */
+#define SNP_CPUID_COUNT_MAX 64
+
+struct snp_cpuid_info {
+	u32 count;
+	u32 __reserved1;
+	u64 __reserved2;
+	struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX];
+} __packed;
+
 /*
  * Since feature negotiation related variables are set early in the boot
  * process they must reside in the .data section so as not to be zeroed
@@ -23,6 +58,20 @@ 
  */
 static u16 ghcb_version __ro_after_init;
 
+/* Copy of the SNP firmware's CPUID page. */
+static struct snp_cpuid_info cpuid_info_copy __ro_after_init;
+static bool snp_cpuid_initialized __ro_after_init;
+
+/*
+ * These will be initialized based on CPUID table so that non-present
+ * all-zero leaves (for sparse tables) can be differentiated from
+ * invalid/out-of-range leaves. This is needed since all-zero leaves
+ * still need to be post-processed.
+ */
+u32 cpuid_std_range_max __ro_after_init;
+u32 cpuid_hyp_range_max __ro_after_init;
+u32 cpuid_ext_range_max __ro_after_init;
+
 static bool __init sev_es_check_cpu_features(void)
 {
 	if (!has_cpuflag(X86_FEATURE_RDRAND)) {
@@ -246,6 +295,244 @@  static int sev_cpuid_hv(u32 func, u32 subfunc, u32 *eax, u32 *ebx,
 	return 0;
 }
 
+static const struct snp_cpuid_info *
+snp_cpuid_info_get_ptr(void)
+{
+	void *ptr;
+
+	/*
+	 * This may be called early while still running on the initial identity
+	 * mapping. Use RIP-relative addressing to obtain the correct address
+	 * in both for identity mapping and after switch-over to kernel virtual
+	 * addresses.
+	 */
+	asm ("lea cpuid_info_copy(%%rip), %0"
+	     : "=r" (ptr)
+	     : "p" (&cpuid_info_copy));
+
+	return ptr;
+}
+
+static inline bool snp_cpuid_active(void)
+{
+	return snp_cpuid_initialized;
+}
+
+static int snp_cpuid_calc_xsave_size(u64 xfeatures_en, u32 base_size,
+				     u32 *xsave_size, bool compacted)
+{
+	const struct snp_cpuid_info *cpuid_info = snp_cpuid_info_get_ptr();
+	u32 xsave_size_total = base_size;
+	u64 xfeatures_found = 0;
+	int i;
+
+	for (i = 0; i < cpuid_info->count; i++) {
+		const struct snp_cpuid_fn *fn = &cpuid_info->fn[i];
+
+		if (!(fn->eax_in == 0xD && fn->ecx_in > 1 && fn->ecx_in < 64))
+			continue;
+		if (!(xfeatures_en & (BIT_ULL(fn->ecx_in))))
+			continue;
+		if (xfeatures_found & (BIT_ULL(fn->ecx_in)))
+			continue;
+
+		xfeatures_found |= (BIT_ULL(fn->ecx_in));
+
+		if (compacted)
+			xsave_size_total += fn->eax;
+		else
+			xsave_size_total = max(xsave_size_total,
+					       fn->eax + fn->ebx);
+	}
+
+	/*
+	 * Either the guest set unsupported XCR0/XSS bits, or the corresponding
+	 * entries in the CPUID table were not present. This is not a valid
+	 * state to be in.
+	 */
+	if (xfeatures_found != (xfeatures_en & GENMASK_ULL(63, 2)))
+		return -EINVAL;
+
+	*xsave_size = xsave_size_total;
+
+	return 0;
+}
+
+static void snp_cpuid_hv(u32 func, u32 subfunc, u32 *eax, u32 *ebx, u32 *ecx,
+			 u32 *edx)
+{
+	/*
+	 * MSR protocol does not support fetching indexed subfunction, but is
+	 * sufficient to handle current fallback cases. Should that change,
+	 * make sure to terminate rather than ignoring the index and grabbing
+	 * random values. If this issue arises in the future, handling can be
+	 * added here to use GHCB-page protocol for cases that occur late
+	 * enough in boot that GHCB page is available.
+	 */
+	if (cpuid_function_is_indexed(func) && subfunc)
+		sev_es_terminate(1, GHCB_TERM_CPUID_HV);
+
+	if (sev_cpuid_hv(func, 0, eax, ebx, ecx, edx))
+		sev_es_terminate(1, GHCB_TERM_CPUID_HV);
+}
+
+static bool
+snp_cpuid_find_validated_func(u32 func, u32 subfunc, u32 *eax, u32 *ebx,
+			      u32 *ecx, u32 *edx)
+{
+	const struct snp_cpuid_info *cpuid_info = snp_cpuid_info_get_ptr();
+	int i;
+
+	for (i = 0; i < cpuid_info->count; i++) {
+		const struct snp_cpuid_fn *fn = &cpuid_info->fn[i];
+
+		if (fn->eax_in != func)
+			continue;
+
+		if (cpuid_function_is_indexed(func) && fn->ecx_in != subfunc)
+			continue;
+
+		*eax = fn->eax;
+		*ebx = fn->ebx;
+		*ecx = fn->ecx;
+		*edx = fn->edx;
+
+		return true;
+	}
+
+	return false;
+}
+
+static bool snp_cpuid_check_range(u32 func)
+{
+	if (func <= cpuid_std_range_max ||
+	    (func >= 0x40000000 && func <= cpuid_hyp_range_max) ||
+	    (func >= 0x80000000 && func <= cpuid_ext_range_max))
+		return true;
+
+	return false;
+}
+
+static int snp_cpuid_postprocess(u32 func, u32 subfunc, u32 *eax, u32 *ebx,
+				 u32 *ecx, u32 *edx)
+{
+	u32 ebx2, ecx2, edx2;
+
+	switch (func) {
+	case 0x1:
+		snp_cpuid_hv(func, subfunc, NULL, &ebx2, NULL, &edx2);
+
+		/* initial APIC ID */
+		*ebx = (ebx2 & GENMASK(31, 24)) | (*ebx & GENMASK(23, 0));
+		/* APIC enabled bit */
+		*edx = (edx2 & BIT(9)) | (*edx & ~BIT(9));
+
+		/* OSXSAVE enabled bit */
+		if (native_read_cr4() & X86_CR4_OSXSAVE)
+			*ecx |= BIT(27);
+		break;
+	case 0x7:
+		/* OSPKE enabled bit */
+		*ecx &= ~BIT(4);
+		if (native_read_cr4() & X86_CR4_PKE)
+			*ecx |= BIT(4);
+		break;
+	case 0xB:
+		/* extended APIC ID */
+		snp_cpuid_hv(func, 0, NULL, NULL, NULL, edx);
+		break;
+	case 0xD: {
+		bool compacted = false;
+		u64 xcr0 = 1, xss = 0;
+		u32 xsave_size;
+
+		if (subfunc != 0 && subfunc != 1)
+			return 0;
+
+		if (native_read_cr4() & X86_CR4_OSXSAVE)
+			xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+		if (subfunc == 1) {
+			/* Get XSS value if XSAVES is enabled. */
+			if (*eax & BIT(3)) {
+				unsigned long lo, hi;
+
+				asm volatile("rdmsr" : "=a" (lo), "=d" (hi)
+						     : "c" (MSR_IA32_XSS));
+				xss = (hi << 32) | lo;
+			}
+
+			/*
+			 * The PPR and APM aren't clear on what size should be
+			 * encoded in 0xD:0x1:EBX when compaction is not enabled
+			 * by either XSAVEC (feature bit 1) or XSAVES (feature
+			 * bit 3) since SNP-capable hardware has these feature
+			 * bits fixed as 1. KVM sets it to 0 in this case, but
+			 * to avoid this becoming an issue it's safer to simply
+			 * treat this as unsupported for SEV-SNP guests.
+			 */
+			if (!(*eax & (BIT(1) | BIT(3))))
+				return -EINVAL;
+
+			compacted = true;
+		}
+
+		if (snp_cpuid_calc_xsave_size(xcr0 | xss, *ebx, &xsave_size,
+					      compacted))
+			return -EINVAL;
+
+		*ebx = xsave_size;
+		}
+		break;
+	case 0x8000001E:
+		/* extended APIC ID */
+		snp_cpuid_hv(func, subfunc, eax, &ebx2, &ecx2, NULL);
+		/* compute ID */
+		*ebx = (*ebx & GENMASK(31, 8)) | (ebx2 & GENMASK(7, 0));
+		/* node ID */
+		*ecx = (*ecx & GENMASK(31, 8)) | (ecx2 & GENMASK(7, 0));
+		break;
+	default:
+		/* No fix-ups needed, use values as-is. */
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Returns -EOPNOTSUPP if feature not enabled. Any other return value should be
+ * treated as fatal by caller.
+ */
+static int snp_cpuid(u32 func, u32 subfunc, u32 *eax, u32 *ebx, u32 *ecx,
+		     u32 *edx)
+{
+	if (!snp_cpuid_active())
+		return -EOPNOTSUPP;
+
+	if (!snp_cpuid_find_validated_func(func, subfunc, eax, ebx, ecx, edx)) {
+		/*
+		 * Some hypervisors will avoid keeping track of CPUID entries
+		 * where all values are zero, since they can be handled the
+		 * same as out-of-range values (all-zero). This is useful here
+		 * as well as it allows virtually all guest configurations to
+		 * work using a single SEV-SNP CPUID table.
+		 *
+		 * To allow for this, there is a need to distinguish between
+		 * out-of-range entries and in-range zero entries, since the
+		 * CPUID table entries are only a template that may need to be
+		 * augmented with additional values for things like
+		 * CPU-specific information during post-processing. So if it's
+		 * not in the table, but is still in the valid range, proceed
+		 * with the post-processing. Otherwise, just return zeros.
+		 */
+		*eax = *ebx = *ecx = *edx = 0;
+		if (!snp_cpuid_check_range(func))
+			return 0;
+	}
+
+	return snp_cpuid_postprocess(func, subfunc, eax, ebx, ecx, edx);
+}
+
 /*
  * Boot VC Handler - This is the first VC handler during boot, there is no GHCB
  * page yet, so it only supports the MSR based communication with the
@@ -253,16 +540,26 @@  static int sev_cpuid_hv(u32 func, u32 subfunc, u32 *eax, u32 *ebx,
  */
 void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
 {
+	unsigned int subfn = lower_bits(regs->cx, 32);
 	unsigned int fn = lower_bits(regs->ax, 32);
 	u32 eax, ebx, ecx, edx;
+	int ret;
 
 	/* Only CPUID is supported via MSR protocol */
 	if (exit_code != SVM_EXIT_CPUID)
 		goto fail;
 
+	ret = snp_cpuid(fn, subfn, &eax, &ebx, &ecx, &edx);
+	if (ret == 0)
+		goto cpuid_done;
+
+	if (ret != -EOPNOTSUPP)
+		goto fail;
+
 	if (sev_cpuid_hv(fn, 0, &eax, &ebx, &ecx, &edx))
 		goto fail;
 
+cpuid_done:
 	regs->ax = eax;
 	regs->bx = ebx;
 	regs->cx = ecx;
@@ -557,12 +854,35 @@  static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
 	return ret;
 }
 
+static int vc_handle_cpuid_snp(struct pt_regs *regs)
+{
+	u32 eax, ebx, ecx, edx;
+	int ret;
+
+	ret = snp_cpuid(regs->ax, regs->cx, &eax, &ebx, &ecx, &edx);
+	if (ret == 0) {
+		regs->ax = eax;
+		regs->bx = ebx;
+		regs->cx = ecx;
+		regs->dx = edx;
+	}
+
+	return ret;
+}
+
 static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
 				      struct es_em_ctxt *ctxt)
 {
 	struct pt_regs *regs = ctxt->regs;
 	u32 cr4 = native_read_cr4();
 	enum es_result ret;
+	int snp_cpuid_ret;
+
+	snp_cpuid_ret = vc_handle_cpuid_snp(regs);
+	if (snp_cpuid_ret == 0)
+		return ES_OK;
+	if (snp_cpuid_ret != -EOPNOTSUPP)
+		return ES_VMM_ERROR;
 
 	ghcb_set_rax(ghcb, regs->ax);
 	ghcb_set_rcx(ghcb, regs->cx);
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 21926b094378..32f60602ec29 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -33,6 +33,7 @@ 
 #include <asm/smp.h>
 #include <asm/cpu.h>
 #include <asm/apic.h>
+#include <asm/cpuid.h>
 
 #define DR7_RESET_VALUE        0x400