diff mbox series

[v12,32/46] x86/compressed/64: Add support for SEV-SNP CPUID table in #VC handlers

Message ID 20220307213356.2797205-33-brijesh.singh@amd.com (mailing list archive)
State New, archived
Headers show
Series Add AMD Secure Nested Paging (SEV-SNP) Guest Support | expand

Commit Message

Brijesh Singh March 7, 2022, 9:33 p.m. UTC
From: Michael Roth <michael.roth@amd.com>

CPUID instructions generate a #VC exception for SEV-ES/SEV-SNP guests,
for which early handlers are currently set up to handle. In the case
of SEV-SNP, guests can use a configurable location in guest memory
that has been pre-populated with a firmware-validated CPUID table to
look up the relevant CPUID values rather than requesting them from
hypervisor via a VMGEXIT. Add the various hooks in the #VC handlers to
allow CPUID instructions to be handled via the table. The code to
actually configure/enable the table will be added in a subsequent
commit.

Signed-off-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
---
 arch/x86/include/asm/sev-common.h |   2 +
 arch/x86/kernel/sev-shared.c      | 324 ++++++++++++++++++++++++++++++
 2 files changed, 326 insertions(+)

Comments

Peter Gonda March 10, 2022, 2:51 p.m. UTC | #1
()

On Mon, Mar 7, 2022 at 2:35 PM Brijesh Singh <brijesh.singh@amd.com> wrote:
>
> From: Michael Roth <michael.roth@amd.com>
>
> CPUID instructions generate a #VC exception for SEV-ES/SEV-SNP guests,
> for which early handlers are currently set up to handle. In the case
> of SEV-SNP, guests can use a configurable location in guest memory
> that has been pre-populated with a firmware-validated CPUID table to
> look up the relevant CPUID values rather than requesting them from
> hypervisor via a VMGEXIT. Add the various hooks in the #VC handlers to
> allow CPUID instructions to be handled via the table. The code to
> actually configure/enable the table will be added in a subsequent
> commit.
>
> Signed-off-by: Michael Roth <michael.roth@amd.com>
> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> ---
>  arch/x86/include/asm/sev-common.h |   2 +
>  arch/x86/kernel/sev-shared.c      | 324 ++++++++++++++++++++++++++++++
>  2 files changed, 326 insertions(+)
>
> diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
> index e9b6815b3b3d..0759af9b1acf 100644
> --- a/arch/x86/include/asm/sev-common.h
> +++ b/arch/x86/include/asm/sev-common.h
> @@ -152,6 +152,8 @@ struct snp_psc_desc {
>  #define GHCB_TERM_PSC                  1       /* Page State Change failure */
>  #define GHCB_TERM_PVALIDATE            2       /* Pvalidate failure */
>  #define GHCB_TERM_NOT_VMPL0            3       /* SNP guest is not running at VMPL-0 */
> +#define GHCB_TERM_CPUID                        4       /* CPUID-validation failure */
> +#define GHCB_TERM_CPUID_HV             5       /* CPUID failure during hypervisor fallback */
>
>  #define GHCB_RESP_CODE(v)              ((v) & GHCB_MSR_INFO_MASK)
>
> diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
> index b4d5558c9d0a..0f1375164ff0 100644
> --- a/arch/x86/kernel/sev-shared.c
> +++ b/arch/x86/kernel/sev-shared.c
> @@ -24,6 +24,36 @@ struct cpuid_leaf {
>         u32 edx;
>  };
>
> +/*
> + * Individual entries of the SNP CPUID table, as defined by the SNP
> + * Firmware ABI, Revision 0.9, Section 7.1, Table 14.
> + */
> +struct snp_cpuid_fn {
> +       u32 eax_in;
> +       u32 ecx_in;
> +       u64 xcr0_in;
> +       u64 xss_in;
> +       u32 eax;
> +       u32 ebx;
> +       u32 ecx;
> +       u32 edx;
> +       u64 __reserved;
> +} __packed;
> +
> +/*
> + * SNP CPUID table, as defined by the SNP Firmware ABI, Revision 0.9,
> + * Section 8.14.2.6. Also noted there is the SNP firmware-enforced limit
> + * of 64 entries per CPUID table.
> + */
> +#define SNP_CPUID_COUNT_MAX 64
> +
> +struct snp_cpuid_table {
> +       u32 count;
> +       u32 __reserved1;
> +       u64 __reserved2;
> +       struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX];
> +} __packed;
> +
>  /*
>   * Since feature negotiation related variables are set early in the boot
>   * process they must reside in the .data section so as not to be zeroed
> @@ -33,6 +63,19 @@ struct cpuid_leaf {
>   */
>  static u16 ghcb_version __ro_after_init;
>
> +/* Copy of the SNP firmware's CPUID page. */
> +static struct snp_cpuid_table cpuid_table_copy __ro_after_init;
> +
> +/*
> + * These will be initialized based on CPUID table so that non-present
> + * all-zero leaves (for sparse tables) can be differentiated from
> + * invalid/out-of-range leaves. This is needed since all-zero leaves
> + * still need to be post-processed.
> + */
> +static u32 cpuid_std_range_max __ro_after_init;
> +static u32 cpuid_hyp_range_max __ro_after_init;
> +static u32 cpuid_ext_range_max __ro_after_init;
> +
>  static bool __init sev_es_check_cpu_features(void)
>  {
>         if (!has_cpuflag(X86_FEATURE_RDRAND)) {
> @@ -242,6 +285,252 @@ static int sev_cpuid_hv(struct cpuid_leaf *leaf)
>         return ret;
>  }
>
> +/*
> + * This may be called early while still running on the initial identity
> + * mapping. Use RIP-relative addressing to obtain the correct address
> + * while running with the initial identity mapping as well as the
> + * switch-over to kernel virtual addresses later.
> + */
> +static const struct snp_cpuid_table *snp_cpuid_get_table(void)
> +{
> +       void *ptr;
> +
> +       asm ("lea cpuid_table_copy(%%rip), %0"
> +            : "=r" (ptr)
> +            : "p" (&cpuid_table_copy));
> +
> +       return ptr;
> +}
> +
> +/*
> + * The SNP Firmware ABI, Revision 0.9, Section 7.1, details the use of
> + * XCR0_IN and XSS_IN to encode multiple versions of 0xD subfunctions 0
> + * and 1 based on the corresponding features enabled by a particular
> + * combination of XCR0 and XSS registers so that a guest can look up the
> + * version corresponding to the features currently enabled in its XCR0/XSS
> + * registers. The only values that differ between these versions/table
> + * entries is the enabled XSAVE area size advertised via EBX.
> + *
> + * While hypervisors may choose to make use of this support, it is more
> + * robust/secure for a guest to simply find the entry corresponding to the
> + * base/legacy XSAVE area size (XCR0=1 or XCR0=3), and then calculate the
> + * XSAVE area size using subfunctions 2 through 64, as documented in APM
> + * Volume 3, Rev 3.31, Appendix E.3.8, which is what is done here.
> + *
> + * Since base/legacy XSAVE area size is documented as 0x240, use that value
> + * directly rather than relying on the base size in the CPUID table.
> + *
> + * Return: XSAVE area size on success, 0 otherwise.
> + */
> +static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
> +{
> +       const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
> +       u64 xfeatures_found = 0;
> +       u32 xsave_size = 0x240;
> +       int i;
> +
> +       for (i = 0; i < cpuid_table->count; i++) {
> +               const struct snp_cpuid_fn *e = &cpuid_table->fn[i];
> +
> +               if (!(e->eax_in == 0xD && e->ecx_in > 1 && e->ecx_in < 64))
> +                       continue;
> +               if (!(xfeatures_en & (BIT_ULL(e->ecx_in))))
> +                       continue;
> +               if (xfeatures_found & (BIT_ULL(e->ecx_in)))
> +                       continue;
> +
> +               xfeatures_found |= (BIT_ULL(e->ecx_in));
> +
> +               if (compacted)
> +                       xsave_size += e->eax;
> +               else
> +                       xsave_size = max(xsave_size, e->eax + e->ebx);
> +       }
> +
> +       /*
> +        * Either the guest set unsupported XCR0/XSS bits, or the corresponding
> +        * entries in the CPUID table were not present. This is not a valid
> +        * state to be in.
> +        */
> +       if (xfeatures_found != (xfeatures_en & GENMASK_ULL(63, 2)))
> +               return 0;
> +
> +       return xsave_size;
> +}
> +
> +static bool
> +snp_cpuid_get_validated_func(struct cpuid_leaf *leaf)
> +{
> +       const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
> +       int i;
> +
> +       for (i = 0; i < cpuid_table->count; i++) {
> +               const struct snp_cpuid_fn *e = &cpuid_table->fn[i];
> +
> +               if (e->eax_in != leaf->fn)
> +                       continue;
> +
> +               if (cpuid_function_is_indexed(leaf->fn) && e->ecx_in != leaf->subfn)
> +                       continue;
> +
> +               /*
> +                * For 0xD subfunctions 0 and 1, only use the entry corresponding
> +                * to the base/legacy XSAVE area size (XCR0=1 or XCR0=3, XSS=0).
> +                * See the comments above snp_cpuid_calc_xsave_size() for more
> +                * details.
> +                */
> +               if (e->eax_in == 0xD && (e->ecx_in == 0 || e->ecx_in == 1))
> +                       if (!(e->xcr0_in == 1 || e->xcr0_in == 3) || e->xss_in)
> +                               continue;
> +
> +               leaf->eax = e->eax;
> +               leaf->ebx = e->ebx;
> +               leaf->ecx = e->ecx;
> +               leaf->edx = e->edx;
> +
> +               return true;
> +       }
> +
> +       return false;
> +}
> +
> +static void snp_cpuid_hv(struct cpuid_leaf *leaf)
> +{
> +       if (sev_cpuid_hv(leaf))
> +               sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV);
> +}
> +
> +static int snp_cpuid_postprocess(struct cpuid_leaf *leaf)
> +{
> +       struct cpuid_leaf leaf_hv = *leaf;
> +
> +       switch (leaf->fn) {
> +       case 0x1:
> +               snp_cpuid_hv(&leaf_hv);
> +
> +               /* initial APIC ID */
> +               leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0));
> +               /* APIC enabled bit */
> +               leaf->edx = (leaf_hv.edx & BIT(9)) | (leaf->edx & ~BIT(9));
> +
> +               /* OSXSAVE enabled bit */
> +               if (native_read_cr4() & X86_CR4_OSXSAVE)
> +                       leaf->ecx |= BIT(27);
> +               break;
> +       case 0x7:
> +               /* OSPKE enabled bit */
> +               leaf->ecx &= ~BIT(4);
> +               if (native_read_cr4() & X86_CR4_PKE)
> +                       leaf->ecx |= BIT(4);
> +               break;
> +       case 0xB:
> +               leaf_hv.subfn = 0;
> +               snp_cpuid_hv(&leaf_hv);
> +
> +               /* extended APIC ID */
> +               leaf->edx = leaf_hv.edx;
> +               break;
> +       case 0xD: {
> +               bool compacted = false;
> +               u64 xcr0 = 1, xss = 0;
> +               u32 xsave_size;
> +
> +               if (leaf->subfn != 0 && leaf->subfn != 1)
> +                       return 0;
> +
> +               if (native_read_cr4() & X86_CR4_OSXSAVE)
> +                       xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
> +               if (leaf->subfn == 1) {
> +                       /* Get XSS value if XSAVES is enabled. */
> +                       if (leaf->eax & BIT(3)) {
> +                               unsigned long lo, hi;
> +
> +                               asm volatile("rdmsr" : "=a" (lo), "=d" (hi)
> +                                                    : "c" (MSR_IA32_XSS));
> +                               xss = (hi << 32) | lo;
> +                       }
> +
> +                       /*
> +                        * The PPR and APM aren't clear on what size should be
> +                        * encoded in 0xD:0x1:EBX when compaction is not enabled
> +                        * by either XSAVEC (feature bit 1) or XSAVES (feature
> +                        * bit 3) since SNP-capable hardware has these feature
> +                        * bits fixed as 1. KVM sets it to 0 in this case, but
> +                        * to avoid this becoming an issue it's safer to simply
> +                        * treat this as unsupported for SNP guests.
> +                        */
> +                       if (!(leaf->eax & (BIT(1) | BIT(3))))
> +                               return -EINVAL;

I couldn't get this patch set to boot and I found that I was setting
these XSAVE cpuid bits wrong. This took me a while to debug because
inside of handle_vc_boot_ghcb() this -EINVAL means we jump into the
halt loop, in addition the early_printk()s inside of that function
don't seem to  be working for me but should the halt in
handle_vc_boot_ghcb() be replaced with an sev_es_terminate() or
something?

I am still working on why the early_printk()s in that function are not
working, it seems that they lead to a different halt. Have you tested
any of those error paths manually? For example if you set your CPUID
bits to explicitly fail here do you see the expected printks?

> +
> +                       compacted = true;
> +               }
> +
> +               xsave_size = snp_cpuid_calc_xsave_size(xcr0 | xss, compacted);
> +               if (!xsave_size)
> +                       return -EINVAL;
> +
> +               leaf->ebx = xsave_size;
> +               }
> +               break;
> +       case 0x8000001E:
> +               snp_cpuid_hv(&leaf_hv);
> +
> +               /* extended APIC ID */
> +               leaf->eax = leaf_hv.eax;
> +               /* compute ID */
> +               leaf->ebx = (leaf->ebx & GENMASK(31, 8)) | (leaf_hv.ebx & GENMASK(7, 0));
> +               /* node ID */
> +               leaf->ecx = (leaf->ecx & GENMASK(31, 8)) | (leaf_hv.ecx & GENMASK(7, 0));
> +               break;
> +       default:
> +               /* No fix-ups needed, use values as-is. */
> +               break;
> +       }
> +
> +       return 0;
> +}
> +
> +/*
> + * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value
> + * should be treated as fatal by caller.
> + */
> +static int snp_cpuid(struct cpuid_leaf *leaf)
> +{
> +       const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
> +
> +       if (!cpuid_table->count)
> +               return -EOPNOTSUPP;
> +
> +       if (!snp_cpuid_get_validated_func(leaf)) {
> +               /*
> +                * Some hypervisors will avoid keeping track of CPUID entries
> +                * where all values are zero, since they can be handled the
> +                * same as out-of-range values (all-zero). This is useful here
> +                * as well as it allows virtually all guest configurations to
> +                * work using a single SNP CPUID table.
> +                *
> +                * To allow for this, there is a need to distinguish between
> +                * out-of-range entries and in-range zero entries, since the
> +                * CPUID table entries are only a template that may need to be
> +                * augmented with additional values for things like
> +                * CPU-specific information during post-processing. So if it's
> +                * not in the table, set the values to zero. Then, if they are
> +                * within a valid CPUID range, proceed with post-processing
> +                * using zeros as the initial values. Otherwise, skip
> +                * post-processing and just return zeros immediately.
> +                */
> +               leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0;
> +
> +               /* Skip post-processing for out-of-range zero leafs. */
> +               if (!(leaf->fn <= cpuid_std_range_max ||
> +                     (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) ||
> +                     (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max)))
> +                       return 0;
> +       }
> +
> +       return snp_cpuid_postprocess(leaf);
> +}
> +
>  /*
>   * Boot VC Handler - This is the first VC handler during boot, there is no GHCB
>   * page yet, so it only supports the MSR based communication with the
> @@ -252,6 +541,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
>         unsigned int subfn = lower_bits(regs->cx, 32);
>         unsigned int fn = lower_bits(regs->ax, 32);
>         struct cpuid_leaf leaf;
> +       int ret;
>
>         /* Only CPUID is supported via MSR protocol */
>         if (exit_code != SVM_EXIT_CPUID)
> @@ -259,9 +549,18 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
>
>         leaf.fn = fn;
>         leaf.subfn = subfn;
> +
> +       ret = snp_cpuid(&leaf);
> +       if (!ret)
> +               goto cpuid_done;
> +
> +       if (ret != -EOPNOTSUPP)
> +               goto fail;
> +
>         if (sev_cpuid_hv(&leaf))
>                 goto fail;
>
> +cpuid_done:
>         regs->ax = leaf.eax;
>         regs->bx = leaf.ebx;
>         regs->cx = leaf.ecx;
> @@ -556,12 +855,37 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
>         return ret;
>  }
>
> +static int vc_handle_cpuid_snp(struct pt_regs *regs)
> +{
> +       struct cpuid_leaf leaf;
> +       int ret;
> +
> +       leaf.fn = regs->ax;
> +       leaf.subfn = regs->cx;
> +       ret = snp_cpuid(&leaf);
> +       if (!ret) {
> +               regs->ax = leaf.eax;
> +               regs->bx = leaf.ebx;
> +               regs->cx = leaf.ecx;
> +               regs->dx = leaf.edx;
> +       }
> +
> +       return ret;
> +}
> +
>  static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
>                                       struct es_em_ctxt *ctxt)
>  {
>         struct pt_regs *regs = ctxt->regs;
>         u32 cr4 = native_read_cr4();
>         enum es_result ret;
> +       int snp_cpuid_ret;
> +
> +       snp_cpuid_ret = vc_handle_cpuid_snp(regs);
> +       if (!snp_cpuid_ret)
> +               return ES_OK;
> +       if (snp_cpuid_ret != -EOPNOTSUPP)
> +               return ES_VMM_ERROR;
>
>         ghcb_set_rax(ghcb, regs->ax);
>         ghcb_set_rcx(ghcb, regs->cx);
> --
> 2.25.1
>
Michael Roth March 10, 2022, 9:25 p.m. UTC | #2
On Thu, Mar 10, 2022 at 07:51:17AM -0700, Peter Gonda wrote:
> ()
> 
> On Mon, Mar 7, 2022 at 2:35 PM Brijesh Singh <brijesh.singh@amd.com> wrote:
> > +static int snp_cpuid_postprocess(struct cpuid_leaf *leaf)
> > +{
> > +       struct cpuid_leaf leaf_hv = *leaf;
> > +
> > +       switch (leaf->fn) {
> > +       case 0x1:
> > +               snp_cpuid_hv(&leaf_hv);
> > +
> > +               /* initial APIC ID */
> > +               leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0));
> > +               /* APIC enabled bit */
> > +               leaf->edx = (leaf_hv.edx & BIT(9)) | (leaf->edx & ~BIT(9));
> > +
> > +               /* OSXSAVE enabled bit */
> > +               if (native_read_cr4() & X86_CR4_OSXSAVE)
> > +                       leaf->ecx |= BIT(27);
> > +               break;
> > +       case 0x7:
> > +               /* OSPKE enabled bit */
> > +               leaf->ecx &= ~BIT(4);
> > +               if (native_read_cr4() & X86_CR4_PKE)
> > +                       leaf->ecx |= BIT(4);
> > +               break;
> > +       case 0xB:
> > +               leaf_hv.subfn = 0;
> > +               snp_cpuid_hv(&leaf_hv);
> > +
> > +               /* extended APIC ID */
> > +               leaf->edx = leaf_hv.edx;
> > +               break;
> > +       case 0xD: {
> > +               bool compacted = false;
> > +               u64 xcr0 = 1, xss = 0;
> > +               u32 xsave_size;
> > +
> > +               if (leaf->subfn != 0 && leaf->subfn != 1)
> > +                       return 0;
> > +
> > +               if (native_read_cr4() & X86_CR4_OSXSAVE)
> > +                       xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
> > +               if (leaf->subfn == 1) {
> > +                       /* Get XSS value if XSAVES is enabled. */
> > +                       if (leaf->eax & BIT(3)) {
> > +                               unsigned long lo, hi;
> > +
> > +                               asm volatile("rdmsr" : "=a" (lo), "=d" (hi)
> > +                                                    : "c" (MSR_IA32_XSS));
> > +                               xss = (hi << 32) | lo;
> > +                       }
> > +
> > +                       /*
> > +                        * The PPR and APM aren't clear on what size should be
> > +                        * encoded in 0xD:0x1:EBX when compaction is not enabled
> > +                        * by either XSAVEC (feature bit 1) or XSAVES (feature
> > +                        * bit 3) since SNP-capable hardware has these feature
> > +                        * bits fixed as 1. KVM sets it to 0 in this case, but
> > +                        * to avoid this becoming an issue it's safer to simply
> > +                        * treat this as unsupported for SNP guests.
> > +                        */
> > +                       if (!(leaf->eax & (BIT(1) | BIT(3))))
> > +                               return -EINVAL;
> 
> I couldn't get this patch set to boot and I found that I was setting
> these XSAVE cpuid bits wrong. This took me a while to debug because
> inside of handle_vc_boot_ghcb() this -EINVAL means we jump into the
> halt loop, in addition the early_printk()s inside of that function
> don't seem to  be working for me but should the halt in
> handle_vc_boot_ghcb() be replaced with an sev_es_terminate() or
> something?

For consistency, the error is propagated up the stack the same way as with
all other individual handlers, and it's up to the current #VC handler
function how it wants to handle errors. The other #VC handlers terminate,
but this one has used a halt loop since its initial implementation in 2020
(1aa9aa8ee517e).

Joerg, do you have more background on that? Would it make sense, outside
of this series, to change it to a terminate? Maybe with a specific set
of error codes for ES_{OK,UNSUPPORTED,VMM_ERROR,DECODE_FAILED}?

> 
> I am still working on why the early_printk()s in that function are not
> working, it seems that they lead to a different halt.

I don't see a different halt. They just don't seem to print anything.
(keep in mind you still need to advance the IP or else the guest is
still gonna end up spinning here, even if you're removing the halt loop
for testing purposes)

> working, it seems that they lead to a different halt. Have you tested
> any of those error paths manually? For example if you set your CPUID
> bits to explicitly fail here do you see the expected printks?

I think at that point in the code, when the XSAVE stuff is setup, the
console hasn't been enabled yet, so messages would get buffered until they
get flushed later (which won't happen since there's halt loop after). I
know in some cases devs will dump the log buffer from memory instead to get
at the error messages for early failures. (Maybe that's also why Joerg
decided to use a halt loop there instead of terminating?)

That said, I did some testing to confirm with earlyprintk=serial|vga and
I don't see the error messages even if I modify the #VC handler to allow
booting to continue. pr_err() messages however do show up if I drop the
halt loop. So maybe pr_err() is more appropriate here? But it doesn't
really matter unless you plan on digging into guest memory for the logs.

So maybe reworking the error handling in handle_vc_boot_ghcb() to use
sev_es_terminate() might be warranted, but probably worth checking with
Joerg first, and should be done as a separate series since it is not
SNP-related.
Joerg Roedel March 11, 2022, 5:06 p.m. UTC | #3
On Thu, Mar 10, 2022 at 03:25:04PM -0600, Michael Roth wrote:
> Joerg, do you have more background on that? Would it make sense, outside
> of this series, to change it to a terminate? Maybe with a specific set
> of error codes for ES_{OK,UNSUPPORTED,VMM_ERROR,DECODE_FAILED}?

This seems to be a left over from development of the SEV-ES guest
patch-set. I wanted to see whether the VM crashed due to a triple fault
or an error in the #VC handler. The halt loop can be replaced by
termination request now.

> > I am still working on why the early_printk()s in that function are not
> > working, it seems that they lead to a different halt.
> 
> I don't see a different halt. They just don't seem to print anything.
> (keep in mind you still need to advance the IP or else the guest is
> still gonna end up spinning here, even if you're removing the halt loop
> for testing purposes)

The early_printks() also cause #VC exceptions, and if that handling is
broken for some reason nothing will be printed.

> 
> > working, it seems that they lead to a different halt. Have you tested
> > any of those error paths manually? For example if you set your CPUID
> > bits to explicitly fail here do you see the expected printks?
> 
> I think at that point in the code, when the XSAVE stuff is setup, the
> console hasn't been enabled yet, so messages would get buffered until they
> get flushed later (which won't happen since there's halt loop after). I
> know in some cases devs will dump the log buffer from memory instead to get
> at the error messages for early failures. (Maybe that's also why Joerg
> decided to use a halt loop there instead of terminating?)

It is hard to dump the log-buffer from encrypted memory :) But I
remember having seen messages from these early_printks under SEV-ES for
different bugs. Not sure why they don't appear in this situation.

> So maybe reworking the error handling in handle_vc_boot_ghcb() to use
> sev_es_terminate() might be warranted, but probably worth checking with
> Joerg first, and should be done as a separate series since it is not
> SNP-related.

I am fine with this change.

Regards,
Peter Gonda March 14, 2022, 5:34 p.m. UTC | #4
On Fri, Mar 11, 2022 at 10:06 AM Joerg Roedel <jroedel@suse.de> wrote:
>
> On Thu, Mar 10, 2022 at 03:25:04PM -0600, Michael Roth wrote:
> > Joerg, do you have more background on that? Would it make sense, outside
> > of this series, to change it to a terminate? Maybe with a specific set
> > of error codes for ES_{OK,UNSUPPORTED,VMM_ERROR,DECODE_FAILED}?
>
> This seems to be a left over from development of the SEV-ES guest
> patch-set. I wanted to see whether the VM crashed due to a triple fault
> or an error in the #VC handler. The halt loop can be replaced by
> termination request now.
>
> > > I am still working on why the early_printk()s in that function are not
> > > working, it seems that they lead to a different halt.
> >
> > I don't see a different halt. They just don't seem to print anything.
> > (keep in mind you still need to advance the IP or else the guest is
> > still gonna end up spinning here, even if you're removing the halt loop
> > for testing purposes)
>
> The early_printks() also cause #VC exceptions, and if that handling is
> broken for some reason nothing will be printed.
>
> >
> > > working, it seems that they lead to a different halt. Have you tested
> > > any of those error paths manually? For example if you set your CPUID
> > > bits to explicitly fail here do you see the expected printks?
> >
> > I think at that point in the code, when the XSAVE stuff is setup, the
> > console hasn't been enabled yet, so messages would get buffered until they
> > get flushed later (which won't happen since there's halt loop after). I
> > know in some cases devs will dump the log buffer from memory instead to get
> > at the error messages for early failures. (Maybe that's also why Joerg
> > decided to use a halt loop there instead of terminating?)
>
> It is hard to dump the log-buffer from encrypted memory :) But I
> remember having seen messages from these early_printks under SEV-ES for
> different bugs. Not sure why they don't appear in this situation.
>
> > So maybe reworking the error handling in handle_vc_boot_ghcb() to use
> > sev_es_terminate() might be warranted, but probably worth checking with
> > Joerg first, and should be done as a separate series since it is not
> > SNP-related.
>
> I am fine with this change.

I'll send a patch out for that.

I was also thinking about adding a vcpu run exit reason for
termination. It would be nice to get a more informative exit reason
than -EINVAL in userspace. Thoughts?

>
> Regards,
>
> --
> Jörg Rödel
> jroedel@suse.de
>
> SUSE Software Solutions Germany GmbH
> Maxfeldstr. 5
> 90409 Nürnberg
> Germany
>
> (HRB 36809, AG Nürnberg)
> Geschäftsführer: Ivo Totev
>
Borislav Petkov March 17, 2022, 1:11 p.m. UTC | #5
On March 14, 2022 5:34:42 PM UTC, Peter Gonda <pgonda@google.com> wrote:
>I was also thinking about adding a vcpu run exit reason for
>termination. It would be nice to get a more informative exit reason
>than -EINVAL in userspace. Thoughts?

Absolutely - it should be unambiguously clear why we're terminating.
Peter Gonda March 17, 2022, 8:20 p.m. UTC | #6
On Thu, Mar 17, 2022 at 7:11 AM Boris Petkov <bp@suse.de> wrote:
>
> On March 14, 2022 5:34:42 PM UTC, Peter Gonda <pgonda@google.com> wrote:
> >I was also thinking about adding a vcpu run exit reason for
> >termination. It would be nice to get a more informative exit reason
> >than -EINVAL in userspace. Thoughts?
>
> Absolutely - it should be unambiguously clear why we're terminating.
>
> --
> Sent from a small device: formatting sux and brevity is inevitable.

Great, I can work on that too.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index e9b6815b3b3d..0759af9b1acf 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -152,6 +152,8 @@  struct snp_psc_desc {
 #define GHCB_TERM_PSC			1	/* Page State Change failure */
 #define GHCB_TERM_PVALIDATE		2	/* Pvalidate failure */
 #define GHCB_TERM_NOT_VMPL0		3	/* SNP guest is not running at VMPL-0 */
+#define GHCB_TERM_CPUID			4	/* CPUID-validation failure */
+#define GHCB_TERM_CPUID_HV		5	/* CPUID failure during hypervisor fallback */
 
 #define GHCB_RESP_CODE(v)		((v) & GHCB_MSR_INFO_MASK)
 
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index b4d5558c9d0a..0f1375164ff0 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -24,6 +24,36 @@  struct cpuid_leaf {
 	u32 edx;
 };
 
+/*
+ * Individual entries of the SNP CPUID table, as defined by the SNP
+ * Firmware ABI, Revision 0.9, Section 7.1, Table 14.
+ */
+struct snp_cpuid_fn {
+	u32 eax_in;
+	u32 ecx_in;
+	u64 xcr0_in;
+	u64 xss_in;
+	u32 eax;
+	u32 ebx;
+	u32 ecx;
+	u32 edx;
+	u64 __reserved;
+} __packed;
+
+/*
+ * SNP CPUID table, as defined by the SNP Firmware ABI, Revision 0.9,
+ * Section 8.14.2.6. Also noted there is the SNP firmware-enforced limit
+ * of 64 entries per CPUID table.
+ */
+#define SNP_CPUID_COUNT_MAX 64
+
+struct snp_cpuid_table {
+	u32 count;
+	u32 __reserved1;
+	u64 __reserved2;
+	struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX];
+} __packed;
+
 /*
  * Since feature negotiation related variables are set early in the boot
  * process they must reside in the .data section so as not to be zeroed
@@ -33,6 +63,19 @@  struct cpuid_leaf {
  */
 static u16 ghcb_version __ro_after_init;
 
+/* Copy of the SNP firmware's CPUID page. */
+static struct snp_cpuid_table cpuid_table_copy __ro_after_init;
+
+/*
+ * These will be initialized based on CPUID table so that non-present
+ * all-zero leaves (for sparse tables) can be differentiated from
+ * invalid/out-of-range leaves. This is needed since all-zero leaves
+ * still need to be post-processed.
+ */
+static u32 cpuid_std_range_max __ro_after_init;
+static u32 cpuid_hyp_range_max __ro_after_init;
+static u32 cpuid_ext_range_max __ro_after_init;
+
 static bool __init sev_es_check_cpu_features(void)
 {
 	if (!has_cpuflag(X86_FEATURE_RDRAND)) {
@@ -242,6 +285,252 @@  static int sev_cpuid_hv(struct cpuid_leaf *leaf)
 	return ret;
 }
 
+/*
+ * This may be called early while still running on the initial identity
+ * mapping. Use RIP-relative addressing to obtain the correct address
+ * while running with the initial identity mapping as well as the
+ * switch-over to kernel virtual addresses later.
+ */
+static const struct snp_cpuid_table *snp_cpuid_get_table(void)
+{
+	void *ptr;
+
+	asm ("lea cpuid_table_copy(%%rip), %0"
+	     : "=r" (ptr)
+	     : "p" (&cpuid_table_copy));
+
+	return ptr;
+}
+
+/*
+ * The SNP Firmware ABI, Revision 0.9, Section 7.1, details the use of
+ * XCR0_IN and XSS_IN to encode multiple versions of 0xD subfunctions 0
+ * and 1 based on the corresponding features enabled by a particular
+ * combination of XCR0 and XSS registers so that a guest can look up the
+ * version corresponding to the features currently enabled in its XCR0/XSS
+ * registers. The only values that differ between these versions/table
+ * entries is the enabled XSAVE area size advertised via EBX.
+ *
+ * While hypervisors may choose to make use of this support, it is more
+ * robust/secure for a guest to simply find the entry corresponding to the
+ * base/legacy XSAVE area size (XCR0=1 or XCR0=3), and then calculate the
+ * XSAVE area size using subfunctions 2 through 64, as documented in APM
+ * Volume 3, Rev 3.31, Appendix E.3.8, which is what is done here.
+ *
+ * Since base/legacy XSAVE area size is documented as 0x240, use that value
+ * directly rather than relying on the base size in the CPUID table.
+ *
+ * Return: XSAVE area size on success, 0 otherwise.
+ */
+static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
+{
+	const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
+	u64 xfeatures_found = 0;
+	u32 xsave_size = 0x240;
+	int i;
+
+	for (i = 0; i < cpuid_table->count; i++) {
+		const struct snp_cpuid_fn *e = &cpuid_table->fn[i];
+
+		if (!(e->eax_in == 0xD && e->ecx_in > 1 && e->ecx_in < 64))
+			continue;
+		if (!(xfeatures_en & (BIT_ULL(e->ecx_in))))
+			continue;
+		if (xfeatures_found & (BIT_ULL(e->ecx_in)))
+			continue;
+
+		xfeatures_found |= (BIT_ULL(e->ecx_in));
+
+		if (compacted)
+			xsave_size += e->eax;
+		else
+			xsave_size = max(xsave_size, e->eax + e->ebx);
+	}
+
+	/*
+	 * Either the guest set unsupported XCR0/XSS bits, or the corresponding
+	 * entries in the CPUID table were not present. This is not a valid
+	 * state to be in.
+	 */
+	if (xfeatures_found != (xfeatures_en & GENMASK_ULL(63, 2)))
+		return 0;
+
+	return xsave_size;
+}
+
+static bool
+snp_cpuid_get_validated_func(struct cpuid_leaf *leaf)
+{
+	const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
+	int i;
+
+	for (i = 0; i < cpuid_table->count; i++) {
+		const struct snp_cpuid_fn *e = &cpuid_table->fn[i];
+
+		if (e->eax_in != leaf->fn)
+			continue;
+
+		if (cpuid_function_is_indexed(leaf->fn) && e->ecx_in != leaf->subfn)
+			continue;
+
+		/*
+		 * For 0xD subfunctions 0 and 1, only use the entry corresponding
+		 * to the base/legacy XSAVE area size (XCR0=1 or XCR0=3, XSS=0).
+		 * See the comments above snp_cpuid_calc_xsave_size() for more
+		 * details.
+		 */
+		if (e->eax_in == 0xD && (e->ecx_in == 0 || e->ecx_in == 1))
+			if (!(e->xcr0_in == 1 || e->xcr0_in == 3) || e->xss_in)
+				continue;
+
+		leaf->eax = e->eax;
+		leaf->ebx = e->ebx;
+		leaf->ecx = e->ecx;
+		leaf->edx = e->edx;
+
+		return true;
+	}
+
+	return false;
+}
+
+static void snp_cpuid_hv(struct cpuid_leaf *leaf)
+{
+	if (sev_cpuid_hv(leaf))
+		sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV);
+}
+
+static int snp_cpuid_postprocess(struct cpuid_leaf *leaf)
+{
+	struct cpuid_leaf leaf_hv = *leaf;
+
+	switch (leaf->fn) {
+	case 0x1:
+		snp_cpuid_hv(&leaf_hv);
+
+		/* initial APIC ID */
+		leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0));
+		/* APIC enabled bit */
+		leaf->edx = (leaf_hv.edx & BIT(9)) | (leaf->edx & ~BIT(9));
+
+		/* OSXSAVE enabled bit */
+		if (native_read_cr4() & X86_CR4_OSXSAVE)
+			leaf->ecx |= BIT(27);
+		break;
+	case 0x7:
+		/* OSPKE enabled bit */
+		leaf->ecx &= ~BIT(4);
+		if (native_read_cr4() & X86_CR4_PKE)
+			leaf->ecx |= BIT(4);
+		break;
+	case 0xB:
+		leaf_hv.subfn = 0;
+		snp_cpuid_hv(&leaf_hv);
+
+		/* extended APIC ID */
+		leaf->edx = leaf_hv.edx;
+		break;
+	case 0xD: {
+		bool compacted = false;
+		u64 xcr0 = 1, xss = 0;
+		u32 xsave_size;
+
+		if (leaf->subfn != 0 && leaf->subfn != 1)
+			return 0;
+
+		if (native_read_cr4() & X86_CR4_OSXSAVE)
+			xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+		if (leaf->subfn == 1) {
+			/* Get XSS value if XSAVES is enabled. */
+			if (leaf->eax & BIT(3)) {
+				unsigned long lo, hi;
+
+				asm volatile("rdmsr" : "=a" (lo), "=d" (hi)
+						     : "c" (MSR_IA32_XSS));
+				xss = (hi << 32) | lo;
+			}
+
+			/*
+			 * The PPR and APM aren't clear on what size should be
+			 * encoded in 0xD:0x1:EBX when compaction is not enabled
+			 * by either XSAVEC (feature bit 1) or XSAVES (feature
+			 * bit 3) since SNP-capable hardware has these feature
+			 * bits fixed as 1. KVM sets it to 0 in this case, but
+			 * to avoid this becoming an issue it's safer to simply
+			 * treat this as unsupported for SNP guests.
+			 */
+			if (!(leaf->eax & (BIT(1) | BIT(3))))
+				return -EINVAL;
+
+			compacted = true;
+		}
+
+		xsave_size = snp_cpuid_calc_xsave_size(xcr0 | xss, compacted);
+		if (!xsave_size)
+			return -EINVAL;
+
+		leaf->ebx = xsave_size;
+		}
+		break;
+	case 0x8000001E:
+		snp_cpuid_hv(&leaf_hv);
+
+		/* extended APIC ID */
+		leaf->eax = leaf_hv.eax;
+		/* compute ID */
+		leaf->ebx = (leaf->ebx & GENMASK(31, 8)) | (leaf_hv.ebx & GENMASK(7, 0));
+		/* node ID */
+		leaf->ecx = (leaf->ecx & GENMASK(31, 8)) | (leaf_hv.ecx & GENMASK(7, 0));
+		break;
+	default:
+		/* No fix-ups needed, use values as-is. */
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value
+ * should be treated as fatal by caller.
+ */
+static int snp_cpuid(struct cpuid_leaf *leaf)
+{
+	const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
+
+	if (!cpuid_table->count)
+		return -EOPNOTSUPP;
+
+	if (!snp_cpuid_get_validated_func(leaf)) {
+		/*
+		 * Some hypervisors will avoid keeping track of CPUID entries
+		 * where all values are zero, since they can be handled the
+		 * same as out-of-range values (all-zero). This is useful here
+		 * as well as it allows virtually all guest configurations to
+		 * work using a single SNP CPUID table.
+		 *
+		 * To allow for this, there is a need to distinguish between
+		 * out-of-range entries and in-range zero entries, since the
+		 * CPUID table entries are only a template that may need to be
+		 * augmented with additional values for things like
+		 * CPU-specific information during post-processing. So if it's
+		 * not in the table, set the values to zero. Then, if they are
+		 * within a valid CPUID range, proceed with post-processing
+		 * using zeros as the initial values. Otherwise, skip
+		 * post-processing and just return zeros immediately.
+		 */
+		leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0;
+
+		/* Skip post-processing for out-of-range zero leafs. */
+		if (!(leaf->fn <= cpuid_std_range_max ||
+		      (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) ||
+		      (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max)))
+			return 0;
+	}
+
+	return snp_cpuid_postprocess(leaf);
+}
+
 /*
  * Boot VC Handler - This is the first VC handler during boot, there is no GHCB
  * page yet, so it only supports the MSR based communication with the
@@ -252,6 +541,7 @@  void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
 	unsigned int subfn = lower_bits(regs->cx, 32);
 	unsigned int fn = lower_bits(regs->ax, 32);
 	struct cpuid_leaf leaf;
+	int ret;
 
 	/* Only CPUID is supported via MSR protocol */
 	if (exit_code != SVM_EXIT_CPUID)
@@ -259,9 +549,18 @@  void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
 
 	leaf.fn = fn;
 	leaf.subfn = subfn;
+
+	ret = snp_cpuid(&leaf);
+	if (!ret)
+		goto cpuid_done;
+
+	if (ret != -EOPNOTSUPP)
+		goto fail;
+
 	if (sev_cpuid_hv(&leaf))
 		goto fail;
 
+cpuid_done:
 	regs->ax = leaf.eax;
 	regs->bx = leaf.ebx;
 	regs->cx = leaf.ecx;
@@ -556,12 +855,37 @@  static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
 	return ret;
 }
 
+static int vc_handle_cpuid_snp(struct pt_regs *regs)
+{
+	struct cpuid_leaf leaf;
+	int ret;
+
+	leaf.fn = regs->ax;
+	leaf.subfn = regs->cx;
+	ret = snp_cpuid(&leaf);
+	if (!ret) {
+		regs->ax = leaf.eax;
+		regs->bx = leaf.ebx;
+		regs->cx = leaf.ecx;
+		regs->dx = leaf.edx;
+	}
+
+	return ret;
+}
+
 static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
 				      struct es_em_ctxt *ctxt)
 {
 	struct pt_regs *regs = ctxt->regs;
 	u32 cr4 = native_read_cr4();
 	enum es_result ret;
+	int snp_cpuid_ret;
+
+	snp_cpuid_ret = vc_handle_cpuid_snp(regs);
+	if (!snp_cpuid_ret)
+		return ES_OK;
+	if (snp_cpuid_ret != -EOPNOTSUPP)
+		return ES_VMM_ERROR;
 
 	ghcb_set_rax(ghcb, regs->ax);
 	ghcb_set_rcx(ghcb, regs->cx);