diff mbox series

[6/8] tools/libxc: Rework xc_cpuid_apply_policy() to use {get, set}_cpu_policy()

Message ID 20190911200504.5693-7-andrew.cooper3@citrix.com (mailing list archive)
State Superseded
Headers show
Series x86/cpuid: Switch to using XEN_DOMCTL_set_cpumsr_policy | expand

Commit Message

Andrew Cooper Sept. 11, 2019, 8:05 p.m. UTC
The purpose of this change is to stop using xc_cpuid_do_domctl(), and to stop
basing decisions on a local CPUID instruction.  This is not a correct or
appropriate way to construct policy information for other domains.

The overwhelming majority of this logic is redundant with the policy logic in
Xen, but has a habit of becoming stale (e.g. c/s 97e4ebdcd76 resulting in
AVX512_BF16 not ever actually being offered to guests).

There are a few subtle side effects which need to remain in place.  A
successful call to xc_cpuid_apply_policy() must result in a call to
xc_set_domain_cpu_policy() because that is currently the only way the
ITSC/VMX/SVM bits become reflected in the guests CPUID view.  Future cleanup
will remove this side effect.

The topology tweaks are local to libxc.  Extend struct cpuid_policy with
enough named fields to express the logic, but keep it identical to before.
Fixing topology representation is another future area of work.

No (expected) change in behaviour.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Wei Liu <wl@xen.org>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Ian Jackson <Ian.Jackson@citrix.com>

The repositioning of xc_cpuid_apply_policy() relative to xc_cpuid_set() is
simply to make the diff readable.  It is completely illegible otherwise.
---
 tools/libxc/xc_cpuid_x86.c      | 798 ++++++++++------------------------------
 xen/include/xen/lib/x86/cpuid.h |  11 +-
 2 files changed, 197 insertions(+), 612 deletions(-)

Comments

Jan Beulich Sept. 12, 2019, 9:02 a.m. UTC | #1
On 11.09.2019 22:05, Andrew Cooper wrote:
> The purpose of this change is to stop using xc_cpuid_do_domctl(), and to stop
> basing decisions on a local CPUID instruction.  This is not a correct or
> appropriate way to construct policy information for other domains.
> 
> The overwhelming majority of this logic is redundant with the policy logic in
> Xen, but has a habit of becoming stale (e.g. c/s 97e4ebdcd76 resulting in
> AVX512_BF16 not ever actually being offered to guests).

Well, not offering it to guests was intentional at that point,
but I guess you validly imply that by adding the A marker to the
public header it _still_ wouldn't have got exposed?

> ---
>  tools/libxc/xc_cpuid_x86.c      | 798 ++++++++++------------------------------
>  xen/include/xen/lib/x86/cpuid.h |  11 +-
>  2 files changed, 197 insertions(+), 612 deletions(-)

Nice.

> @@ -1057,3 +449,191 @@ int xc_cpuid_set(
>  
>      return rc;
>  }
> +
> +int xc_cpuid_apply_policy(xc_interface *xch, uint32_t domid,
> +                          const uint32_t *featureset, unsigned int nr_features)
> +{
> +    int rc;
> +    xc_dominfo_t di;
> +    unsigned int i, nr_leaves, nr_msrs;
> +    xen_cpuid_leaf_t *leaves = NULL;
> +    struct cpuid_policy *p = NULL;
> +    uint32_t err_leaf = -1, err_subleaf = -1, err_msr = -1;
> +
> +    if ( xc_domain_getinfo(xch, domid, 1, &di) != 1 ||
> +         di.domid != domid )
> +    {
> +        ERROR("Failed to obtain d%d info", domid);
> +        rc = -ESRCH;
> +        goto out;
> +    }
> +
> +    rc = xc_get_cpu_policy_size(xch, &nr_leaves, &nr_msrs);
> +    if ( rc )
> +    {
> +        PERROR("Failed to obtain policy info size");
> +        rc = -errno;
> +        goto out;
> +    }
> +
> +    rc = -ENOMEM;
> +    if ( (leaves = calloc(nr_leaves, sizeof(*leaves))) == NULL ||
> +         (p = calloc(1, sizeof(*p))) == NULL )
> +        goto out;
> +
> +    nr_msrs = 0;
> +    rc = xc_get_domain_cpu_policy(xch, domid, &nr_leaves, leaves,
> +                                  &nr_msrs, NULL);
> +    if ( rc )
> +    {
> +        PERROR("Failed to obtain d%d's policy", domid);
> +        rc = -errno;
> +        goto out;
> +    }
> +
> +    rc = x86_cpuid_copy_from_buffer(p, leaves, nr_leaves,
> +                                    &err_leaf, &err_subleaf);
> +    if ( rc )
> +    {
> +        ERROR("Failed to deserialise CPUID (err leaf %#x, subleaf %#x) (%d = %s)",
> +              err_leaf, err_subleaf, -rc, strerror(-rc));
> +        goto out;
> +    }
> +
> +    if ( featureset )
> +    {
> +        uint32_t disabled_features[FEATURESET_NR_ENTRIES],
> +            feat[FEATURESET_NR_ENTRIES] = {};
> +        static const uint32_t deep_features[] = INIT_DEEP_FEATURES;
> +        unsigned int i, b;
> +
> +        /*
> +         * The user supplied featureset may be shorter or longer than
> +         * FEATURESET_NR_ENTRIES.  Shorter is fine, and we will zero-extend.
> +         * Longer is fine, so long as it only padded with zeros.
> +         */
> +        unsigned int user_len = min(FEATURESET_NR_ENTRIES + 0u, nr_features);
> +
> +        /* Check for truncated set bits. */
> +        rc = -EOPNOTSUPP;
> +        for ( i = user_len; i < nr_features; ++i )
> +            if ( featureset[i] != 0 )
> +                goto out;
> +
> +        memcpy(feat, featureset, sizeof(*featureset) * user_len);
> +
> +        /* Disable deep dependencies of disabled features. */
> +        for ( i = 0; i < ARRAY_SIZE(disabled_features); ++i )
> +            disabled_features[i] = ~feat[i] & deep_features[i];
> +
> +        for ( b = 0; b < sizeof(disabled_features) * CHAR_BIT; ++b )
> +        {
> +            const uint32_t *dfs;
> +
> +            if ( !test_bit(b, disabled_features) ||
> +                 !(dfs = x86_cpuid_lookup_deep_deps(b)) )
> +                continue;
> +
> +            for ( i = 0; i < ARRAY_SIZE(disabled_features); ++i )
> +            {
> +                feat[i] &= ~dfs[i];
> +                disabled_features[i] &= ~dfs[i];
> +            }
> +        }
> +
> +        cpuid_featureset_to_policy(feat, p);
> +    }
> +
> +    if ( !di.hvm )
> +    {
> +        uint32_t host_featureset[FEATURESET_NR_ENTRIES];
> +        uint32_t len = ARRAY_SIZE(host_featureset);
> +
> +        rc = xc_get_cpu_featureset(xch, XEN_SYSCTL_cpu_featureset_host,
> +                                   &len, host_featureset);
> +        if ( rc )
> +        {
> +            /* Tolerate "buffer too small", as we've got the bits we need. */
> +            if ( errno == ENOBUFS )
> +                rc = 0;

So this is where I think returning an error (instead of a positive
number) from the hypercall is latently problematic: There's not
really any guarantee for ENOBUFS to not result from other than the
actual hypercall. I guess we have such dependencies elsewhere, so
having one more here isn't a big deal, but as a precaution against
using uninitialized data, wouldn't it be prudent for
host_featureset[] to get zero-initialized up front?

> +            else
> +            {
> +                PERROR("Failed to obtain host featureset");
> +                rc = -errno;
> +                goto out;
> +            }
> +        }
> +
> +        /*
> +         * On hardware without CPUID Faulting, PV guests see real topology.
> +         * As a consequence, they also need to see the host htt/cmp fields.
> +         */
> +        p->basic.htt       = test_bit(X86_FEATURE_HTT, host_featureset);
> +        p->extd.cmp_legacy = test_bit(X86_FEATURE_CMP_LEGACY, host_featureset);
> +    }
> +    else
> +    {
> +        /*
> +         * Topology for HVM guests is entirely controlled by Xen.  For now, we
> +         * hardcode APIC_ID = vcpu_id * 2 to give the illusion of no SMT.
> +         */
> +        p->basic.htt = true;
> +        p->extd.cmp_legacy = false;
> +
> +        p->basic.lppp *= 2;
> +
> +        switch ( p->x86_vendor )
> +        {
> +        case X86_VENDOR_INTEL:
> +            for ( i = 0; (p->cache.subleaf[i].type &&
> +                          i < ARRAY_SIZE(p->cache.raw)); ++i )
> +            {
> +                p->cache.subleaf[i].cores_per_package =
> +                    (p->cache.subleaf[i].cores_per_package << 1) | 1;
> +                p->cache.subleaf[i].threads_per_cache = 0;
> +            }
> +            break;

The original code masked EDX by 0x3ff. I don't see how this is reflected
here, and the description also doesn't indicate the change is on purpose.

> +        case X86_VENDOR_AMD:
> +        case X86_VENDOR_HYGON:
> +            p->extd.nc = (p->extd.nc << 1) | 1;

This actually fixes a latent "spill into bit 8" issue of the original
code.

Jan
Andrew Cooper Sept. 12, 2019, 1:47 p.m. UTC | #2
On 12/09/2019 10:02, Jan Beulich wrote:
> On 11.09.2019 22:05, Andrew Cooper wrote:
>> The purpose of this change is to stop using xc_cpuid_do_domctl(), and to stop
>> basing decisions on a local CPUID instruction.  This is not a correct or
>> appropriate way to construct policy information for other domains.
>>
>> The overwhelming majority of this logic is redundant with the policy logic in
>> Xen, but has a habit of becoming stale (e.g. c/s 97e4ebdcd76 resulting in
>> AVX512_BF16 not ever actually being offered to guests).
> Well, not offering it to guests was intentional at that point,
> but I guess you validly imply that by adding the A marker to the
> public header it _still_ wouldn't have got exposed?

Ah - I'd forgotten the point about the the lack of A marker, but yes -
my point was that, had I not forgotten during review, that changeset
should have added the 7a1 case to libxc.

I'll see if I can rephrase this slightly to be clearer.

>
>> ---
>>  tools/libxc/xc_cpuid_x86.c      | 798 ++++++++++------------------------------
>>  xen/include/xen/lib/x86/cpuid.h |  11 +-
>>  2 files changed, 197 insertions(+), 612 deletions(-)
> Nice.

I was very pleased with how this turned out.  When I started, I wasn't
expecting to be able to delete this much.

>
>> @@ -1057,3 +449,191 @@ int xc_cpuid_set(
>>  
>>      return rc;
>>  }
>> +
>> +int xc_cpuid_apply_policy(xc_interface *xch, uint32_t domid,
>> +                          const uint32_t *featureset, unsigned int nr_features)
>> +{
>> +    int rc;
>> +    xc_dominfo_t di;
>> +    unsigned int i, nr_leaves, nr_msrs;
>> +    xen_cpuid_leaf_t *leaves = NULL;
>> +    struct cpuid_policy *p = NULL;
>> +    uint32_t err_leaf = -1, err_subleaf = -1, err_msr = -1;
>> +
>> +    if ( xc_domain_getinfo(xch, domid, 1, &di) != 1 ||
>> +         di.domid != domid )
>> +    {
>> +        ERROR("Failed to obtain d%d info", domid);
>> +        rc = -ESRCH;
>> +        goto out;
>> +    }
>> +
>> +    rc = xc_get_cpu_policy_size(xch, &nr_leaves, &nr_msrs);
>> +    if ( rc )
>> +    {
>> +        PERROR("Failed to obtain policy info size");
>> +        rc = -errno;
>> +        goto out;
>> +    }
>> +
>> +    rc = -ENOMEM;
>> +    if ( (leaves = calloc(nr_leaves, sizeof(*leaves))) == NULL ||
>> +         (p = calloc(1, sizeof(*p))) == NULL )
>> +        goto out;
>> +
>> +    nr_msrs = 0;
>> +    rc = xc_get_domain_cpu_policy(xch, domid, &nr_leaves, leaves,
>> +                                  &nr_msrs, NULL);
>> +    if ( rc )
>> +    {
>> +        PERROR("Failed to obtain d%d's policy", domid);
>> +        rc = -errno;
>> +        goto out;
>> +    }
>> +
>> +    rc = x86_cpuid_copy_from_buffer(p, leaves, nr_leaves,
>> +                                    &err_leaf, &err_subleaf);
>> +    if ( rc )
>> +    {
>> +        ERROR("Failed to deserialise CPUID (err leaf %#x, subleaf %#x) (%d = %s)",
>> +              err_leaf, err_subleaf, -rc, strerror(-rc));
>> +        goto out;
>> +    }
>> +
>> +    if ( featureset )
>> +    {
>> +        uint32_t disabled_features[FEATURESET_NR_ENTRIES],
>> +            feat[FEATURESET_NR_ENTRIES] = {};
>> +        static const uint32_t deep_features[] = INIT_DEEP_FEATURES;
>> +        unsigned int i, b;
>> +
>> +        /*
>> +         * The user supplied featureset may be shorter or longer than
>> +         * FEATURESET_NR_ENTRIES.  Shorter is fine, and we will zero-extend.
>> +         * Longer is fine, so long as it only padded with zeros.
>> +         */
>> +        unsigned int user_len = min(FEATURESET_NR_ENTRIES + 0u, nr_features);
>> +
>> +        /* Check for truncated set bits. */
>> +        rc = -EOPNOTSUPP;
>> +        for ( i = user_len; i < nr_features; ++i )
>> +            if ( featureset[i] != 0 )
>> +                goto out;
>> +
>> +        memcpy(feat, featureset, sizeof(*featureset) * user_len);
>> +
>> +        /* Disable deep dependencies of disabled features. */
>> +        for ( i = 0; i < ARRAY_SIZE(disabled_features); ++i )
>> +            disabled_features[i] = ~feat[i] & deep_features[i];
>> +
>> +        for ( b = 0; b < sizeof(disabled_features) * CHAR_BIT; ++b )
>> +        {
>> +            const uint32_t *dfs;
>> +
>> +            if ( !test_bit(b, disabled_features) ||
>> +                 !(dfs = x86_cpuid_lookup_deep_deps(b)) )
>> +                continue;
>> +
>> +            for ( i = 0; i < ARRAY_SIZE(disabled_features); ++i )
>> +            {
>> +                feat[i] &= ~dfs[i];
>> +                disabled_features[i] &= ~dfs[i];
>> +            }
>> +        }
>> +
>> +        cpuid_featureset_to_policy(feat, p);
>> +    }
>> +
>> +    if ( !di.hvm )
>> +    {
>> +        uint32_t host_featureset[FEATURESET_NR_ENTRIES];
>> +        uint32_t len = ARRAY_SIZE(host_featureset);
>> +
>> +        rc = xc_get_cpu_featureset(xch, XEN_SYSCTL_cpu_featureset_host,
>> +                                   &len, host_featureset);
>> +        if ( rc )
>> +        {
>> +            /* Tolerate "buffer too small", as we've got the bits we need. */
>> +            if ( errno == ENOBUFS )
>> +                rc = 0;
> So this is where I think returning an error (instead of a positive
> number) from the hypercall is latently problematic: There's not
> really any guarantee for ENOBUFS to not result from other than the
> actual hypercall. I guess we have such dependencies elsewhere, so
> having one more here isn't a big deal,

We have it in multiple places.  Adjusting this is going to require some
careful checking of the errno handling.

> but as a precaution against
> using uninitialized data, wouldn't it be prudent for
> host_featureset[] to get zero-initialized up front?

Fair enough.

>
>> +            else
>> +            {
>> +                PERROR("Failed to obtain host featureset");
>> +                rc = -errno;
>> +                goto out;
>> +            }
>> +        }
>> +
>> +        /*
>> +         * On hardware without CPUID Faulting, PV guests see real topology.
>> +         * As a consequence, they also need to see the host htt/cmp fields.
>> +         */
>> +        p->basic.htt       = test_bit(X86_FEATURE_HTT, host_featureset);
>> +        p->extd.cmp_legacy = test_bit(X86_FEATURE_CMP_LEGACY, host_featureset);
>> +    }
>> +    else
>> +    {
>> +        /*
>> +         * Topology for HVM guests is entirely controlled by Xen.  For now, we
>> +         * hardcode APIC_ID = vcpu_id * 2 to give the illusion of no SMT.
>> +         */
>> +        p->basic.htt = true;
>> +        p->extd.cmp_legacy = false;
>> +
>> +        p->basic.lppp *= 2;
>> +
>> +        switch ( p->x86_vendor )
>> +        {
>> +        case X86_VENDOR_INTEL:
>> +            for ( i = 0; (p->cache.subleaf[i].type &&
>> +                          i < ARRAY_SIZE(p->cache.raw)); ++i )
>> +            {
>> +                p->cache.subleaf[i].cores_per_package =
>> +                    (p->cache.subleaf[i].cores_per_package << 1) | 1;
>> +                p->cache.subleaf[i].threads_per_cache = 0;
>> +            }
>> +            break;
> The original code masked EDX by 0x3ff. I don't see how this is reflected
> here, and the description also doesn't indicate the change is on purpose.

This falls into the "The overwhelming majority of this logic is
redundant with the policy logic in Xen" statement.

I've got no idea why 0x3ff was used before - only the bottom 3 bits are
defined, and I'm pretty sure that wasn't wasn't the case when this code
first appeared (c/s 5f14a87ceb in 2008).

>
>> +        case X86_VENDOR_AMD:
>> +        case X86_VENDOR_HYGON:
>> +            p->extd.nc = (p->extd.nc << 1) | 1;
> This actually fixes a latent "spill into bit 8" issue of the original
> code.

Oh.  So it does.

~Andrew
diff mbox series

Patch

diff --git a/tools/libxc/xc_cpuid_x86.c b/tools/libxc/xc_cpuid_x86.c
index d1a2b61214..c88acbac9e 100644
--- a/tools/libxc/xc_cpuid_x86.c
+++ b/tools/libxc/xc_cpuid_x86.c
@@ -34,18 +34,13 @@  enum {
 
 #include <xen/asm/x86-vendors.h>
 
-#include <xen/lib/x86/cpuid.h>
-#include <xen/lib/x86/msr.h>
+#include <xen/lib/x86/cpu-policy.h>
 
 #define bitmaskof(idx)      (1u << ((idx) & 31))
 #define featureword_of(idx) ((idx) >> 5)
 #define clear_feature(idx, dst) ((dst) &= ~bitmaskof(idx))
 #define set_feature(idx, dst)   ((dst) |=  bitmaskof(idx))
 
-#define DEF_MAX_BASE 0x0000000du
-#define DEF_MAX_INTELEXT  0x80000008u
-#define DEF_MAX_AMDEXT    0x8000001cu
-
 int xc_get_cpu_levelling_caps(xc_interface *xch, uint32_t *caps)
 {
     DECLARE_SYSCTL;
@@ -278,609 +273,6 @@  int xc_set_domain_cpu_policy(xc_interface *xch, uint32_t domid,
     return ret;
 }
 
-struct cpuid_domain_info
-{
-    unsigned int vendor; /* X86_VENDOR_* */
-
-    bool hvm;
-    uint64_t xfeature_mask;
-
-    /*
-     * Careful with featureset lengths.
-     *
-     * Code in this file requires featureset to have at least
-     * xc_get_cpu_featureset_size() entries.  This is a libxc compiletime
-     * constant.
-     *
-     * The featureset length used by the hypervisor may be different.  If the
-     * hypervisor version is longer, XEN_SYSCTL_get_cpu_featureset will fail
-     * with -ENOBUFS, and libxc really does need rebuilding.  If the
-     * hypervisor version is shorter, it is safe to zero-extend.
-     */
-    uint32_t *featureset;
-    unsigned int nr_features;
-
-    /* PV-only information. */
-    bool pv64;
-
-    /* HVM-only information. */
-    bool pae;
-    bool nestedhvm;
-};
-
-static void cpuid(const unsigned int *input, unsigned int *regs)
-{
-    unsigned int count = (input[1] == XEN_CPUID_INPUT_UNUSED) ? 0 : input[1];
-#ifdef __i386__
-    /* Use the stack to avoid reg constraint failures with some gcc flags */
-    asm (
-        "push %%ebx; push %%edx\n\t"
-        "cpuid\n\t"
-        "mov %%ebx,4(%4)\n\t"
-        "mov %%edx,12(%4)\n\t"
-        "pop %%edx; pop %%ebx\n\t"
-        : "=a" (regs[0]), "=c" (regs[2])
-        : "0" (input[0]), "1" (count), "S" (regs)
-        : "memory" );
-#else
-    asm (
-        "cpuid"
-        : "=a" (regs[0]), "=b" (regs[1]), "=c" (regs[2]), "=d" (regs[3])
-        : "0" (input[0]), "2" (count) );
-#endif
-}
-
-static int get_cpuid_domain_info(xc_interface *xch, uint32_t domid,
-                                 struct cpuid_domain_info *info,
-                                 const uint32_t *featureset,
-                                 unsigned int nr_features)
-{
-    struct xen_domctl domctl = {};
-    xc_dominfo_t di;
-    unsigned int in[2] = { 0, ~0U }, regs[4];
-    unsigned int i, host_nr_features = xc_get_cpu_featureset_size();
-    int rc;
-
-    cpuid(in, regs);
-    info->vendor = x86_cpuid_lookup_vendor(regs[1], regs[2], regs[3]);
-
-    if ( xc_domain_getinfo(xch, domid, 1, &di) != 1 ||
-         di.domid != domid )
-        return -ESRCH;
-
-    info->hvm = di.hvm;
-
-    info->featureset = calloc(host_nr_features, sizeof(*info->featureset));
-    if ( !info->featureset )
-        return -ENOMEM;
-
-    info->nr_features = host_nr_features;
-
-    if ( featureset )
-    {
-        /*
-         * The user supplied featureset may be shorter or longer than
-         * host_nr_features.  Shorter is fine, and we will zero-extend.
-         * Longer is fine, so long as it only padded with zeros.
-         */
-        unsigned int fslen = min(host_nr_features, nr_features);
-
-        memcpy(info->featureset, featureset,
-               fslen * sizeof(*info->featureset));
-
-        /* Check for truncated set bits. */
-        for ( i = fslen; i < nr_features; ++i )
-            if ( featureset[i] != 0 )
-                return -EOPNOTSUPP;
-    }
-    else
-    {
-        rc = xc_get_cpu_featureset(xch, (info->hvm
-                                         ? XEN_SYSCTL_cpu_featureset_hvm
-                                         : XEN_SYSCTL_cpu_featureset_pv),
-                                   &host_nr_features, info->featureset);
-        if ( rc )
-            return -errno;
-    }
-
-    /* Get xstate information. */
-    domctl.cmd = XEN_DOMCTL_getvcpuextstate;
-    domctl.domain = domid;
-    rc = do_domctl(xch, &domctl);
-    if ( rc )
-        return -errno;
-
-    info->xfeature_mask = domctl.u.vcpuextstate.xfeature_mask;
-
-    if ( di.hvm )
-    {
-        uint64_t val;
-
-        rc = xc_hvm_param_get(xch, domid, HVM_PARAM_PAE_ENABLED, &val);
-        if ( rc )
-            return -errno;
-
-        info->pae = !!val;
-
-        rc = xc_hvm_param_get(xch, domid, HVM_PARAM_NESTEDHVM, &val);
-        if ( rc )
-            return -errno;
-
-        info->nestedhvm = !!val;
-    }
-    else
-    {
-        unsigned int width;
-
-        rc = xc_domain_get_guest_width(xch, domid, &width);
-        if ( rc )
-            return -errno;
-
-        info->pv64 = (width == 8);
-    }
-
-    return 0;
-}
-
-static void free_cpuid_domain_info(struct cpuid_domain_info *info)
-{
-    free(info->featureset);
-}
-
-static void amd_xc_cpuid_policy(const struct cpuid_domain_info *info,
-                                const unsigned int *input, unsigned int *regs)
-{
-    switch ( input[0] )
-    {
-    case 0x00000002:
-    case 0x00000004:
-        regs[0] = regs[1] = regs[2] = 0;
-        break;
-
-    case 0x80000000:
-        if ( regs[0] > DEF_MAX_AMDEXT )
-            regs[0] = DEF_MAX_AMDEXT;
-        break;
-
-    case 0x80000008:
-        /*
-         * ECX[15:12] is ApicIdCoreSize: ECX[7:0] is NumberOfCores (minus one).
-         * Update to reflect vLAPIC_ID = vCPU_ID * 2.
-         */
-        regs[2] = ((regs[2] + (1u << 12)) & 0xf000u) |
-                  ((regs[2] & 0xffu) << 1) | 1u;
-        break;
-
-    case 0x8000000a: {
-        if ( !info->nestedhvm )
-        {
-            regs[0] = regs[1] = regs[2] = regs[3] = 0;
-            break;
-        }
-
-#define SVM_FEATURE_NPT            0x00000001 /* Nested page table support */
-#define SVM_FEATURE_LBRV           0x00000002 /* LBR virtualization support */
-#define SVM_FEATURE_SVML           0x00000004 /* SVM locking MSR support */
-#define SVM_FEATURE_NRIPS          0x00000008 /* Next RIP save on VMEXIT */
-#define SVM_FEATURE_TSCRATEMSR     0x00000010 /* TSC ratio MSR support */
-#define SVM_FEATURE_VMCBCLEAN      0x00000020 /* VMCB clean bits support */
-#define SVM_FEATURE_FLUSHBYASID    0x00000040 /* TLB flush by ASID support */
-#define SVM_FEATURE_DECODEASSISTS  0x00000080 /* Decode assists support */
-#define SVM_FEATURE_PAUSEFILTER    0x00000400 /* Pause intercept filter */
-
-        /* Pass 1: Only passthrough SVM features which are
-         * available in hw and which are implemented
-         */
-        regs[3] &= (SVM_FEATURE_NPT | SVM_FEATURE_LBRV | \
-            SVM_FEATURE_NRIPS | SVM_FEATURE_PAUSEFILTER | \
-            SVM_FEATURE_DECODEASSISTS);
-
-        /* Pass 2: Always enable SVM features which are emulated */
-        regs[3] |= SVM_FEATURE_VMCBCLEAN | SVM_FEATURE_TSCRATEMSR;
-        break;
-    }
-
-    }
-}
-
-static void intel_xc_cpuid_policy(const struct cpuid_domain_info *info,
-                                  const unsigned int *input, unsigned int *regs)
-{
-    switch ( input[0] )
-    {
-    case 0x00000004:
-        /*
-         * EAX[31:26] is Maximum Cores Per Package (minus one).
-         * Update to reflect vLAPIC_ID = vCPU_ID * 2.
-         */
-        regs[0] = (((regs[0] & 0x7c000000u) << 1) | 0x04000000u |
-                   (regs[0] & 0x3ffu));
-        regs[3] &= 0x3ffu;
-        break;
-
-    case 0x80000000:
-        if ( regs[0] > DEF_MAX_INTELEXT )
-            regs[0] = DEF_MAX_INTELEXT;
-        break;
-
-    case 0x80000005:
-        regs[0] = regs[1] = regs[2] = 0;
-        break;
-
-    case 0x80000008:
-        /* Mask AMD Number of Cores information. */
-        regs[2] = 0;
-        break;
-    }
-}
-
-static void xc_cpuid_hvm_policy(const struct cpuid_domain_info *info,
-                                const unsigned int *input, unsigned int *regs)
-{
-    switch ( input[0] )
-    {
-    case 0x00000000:
-        if ( regs[0] > DEF_MAX_BASE )
-            regs[0] = DEF_MAX_BASE;
-        break;
-
-    case 0x00000001:
-        /*
-         * EBX[23:16] is Maximum Logical Processors Per Package.
-         * Update to reflect vLAPIC_ID = vCPU_ID * 2.
-         */
-        regs[1] = (regs[1] & 0x0000ffffu) | ((regs[1] & 0x007f0000u) << 1);
-
-        regs[2] = info->featureset[featureword_of(X86_FEATURE_SSE3)];
-        regs[3] = (info->featureset[featureword_of(X86_FEATURE_FPU)] |
-                   bitmaskof(X86_FEATURE_HTT));
-        break;
-
-    case 0x00000007: /* Intel-defined CPU features */
-        if ( input[1] == 0 )
-        {
-            regs[1] = info->featureset[featureword_of(X86_FEATURE_FSGSBASE)];
-            regs[2] = info->featureset[featureword_of(X86_FEATURE_PREFETCHWT1)];
-            regs[3] = info->featureset[featureword_of(X86_FEATURE_AVX512_4VNNIW)];
-        }
-        else
-        {
-            regs[1] = 0;
-            regs[2] = 0;
-            regs[3] = 0;
-        }
-        regs[0] = 0;
-        break;
-
-    case 0x0000000d: /* Xen automatically calculates almost everything. */
-        if ( input[1] == 1 )
-            regs[0] = info->featureset[featureword_of(X86_FEATURE_XSAVEOPT)];
-        else
-            regs[0] = 0;
-        regs[1] = regs[2] = regs[3] = 0;
-        break;
-
-    case 0x80000000:
-        /* Passthrough to cpu vendor specific functions */
-        break;
-
-    case 0x80000001:
-        regs[2] = (info->featureset[featureword_of(X86_FEATURE_LAHF_LM)] &
-                   ~bitmaskof(X86_FEATURE_CMP_LEGACY));
-        regs[3] = info->featureset[featureword_of(X86_FEATURE_SYSCALL)];
-        break;
-
-    case 0x80000007:
-        /*
-         * Keep only TSCInvariant. This may be cleared by the hypervisor
-         * depending on guest TSC and migration settings.
-         */
-        regs[0] = regs[1] = regs[2] = 0;
-        regs[3] &= 1u<<8;
-        break;
-
-    case 0x80000008:
-        regs[0] &= 0x0000ffffu;
-        regs[1] = info->featureset[featureword_of(X86_FEATURE_CLZERO)];
-        /* regs[2] handled in the per-vendor logic. */
-        regs[3] = 0;
-        break;
-
-    case 0x00000002: /* Intel cache info (dumped by AMD policy) */
-    case 0x00000004: /* Intel cache info (dumped by AMD policy) */
-    case 0x0000000a: /* Architectural Performance Monitor Features */
-    case 0x80000002: /* Processor name string */
-    case 0x80000003: /* ... continued         */
-    case 0x80000004: /* ... continued         */
-    case 0x80000005: /* AMD L1 cache/TLB info (dumped by Intel policy) */
-    case 0x80000006: /* AMD L2/3 cache/TLB info ; Intel L2 cache features */
-    case 0x8000000a: /* AMD SVM feature bits */
-    case 0x80000019: /* AMD 1G TLB */
-    case 0x8000001a: /* AMD perf hints */
-    case 0x8000001c: /* AMD lightweight profiling */
-        break;
-
-    default:
-        regs[0] = regs[1] = regs[2] = regs[3] = 0;
-        break;
-    }
-
-    if ( info->vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON) )
-        amd_xc_cpuid_policy(info, input, regs);
-    else
-        intel_xc_cpuid_policy(info, input, regs);
-}
-
-static void xc_cpuid_pv_policy(const struct cpuid_domain_info *info,
-                               const unsigned int *input, unsigned int *regs)
-{
-    switch ( input[0] )
-    {
-    case 0x00000000:
-        if ( regs[0] > DEF_MAX_BASE )
-            regs[0] = DEF_MAX_BASE;
-        break;
-
-    case 0x00000001:
-    {
-        /* Host topology exposed to PV guest.  Provide host value. */
-        bool host_htt = regs[3] & bitmaskof(X86_FEATURE_HTT);
-
-        /*
-         * Don't pick host's Initial APIC ID which can change from run
-         * to run.
-         */
-        regs[1] &= 0x00ffffffu;
-
-        regs[2] = info->featureset[featureword_of(X86_FEATURE_SSE3)];
-        regs[3] = (info->featureset[featureword_of(X86_FEATURE_FPU)] &
-                   ~bitmaskof(X86_FEATURE_HTT));
-
-        if ( host_htt )
-            regs[3] |= bitmaskof(X86_FEATURE_HTT);
-        break;
-    }
-
-    case 0x00000007:
-        if ( input[1] == 0 )
-        {
-            regs[1] = info->featureset[featureword_of(X86_FEATURE_FSGSBASE)];
-            regs[2] = info->featureset[featureword_of(X86_FEATURE_PREFETCHWT1)];
-            regs[3] = info->featureset[featureword_of(X86_FEATURE_AVX512_4VNNIW)];
-        }
-        else
-        {
-            regs[1] = 0;
-            regs[2] = 0;
-            regs[3] = 0;
-        }
-        regs[0] = 0;
-        break;
-
-    case 0x0000000d: /* Xen automatically calculates almost everything. */
-        if ( input[1] == 1 )
-            regs[0] = info->featureset[featureword_of(X86_FEATURE_XSAVEOPT)];
-        else
-            regs[0] = 0;
-        regs[1] = regs[2] = regs[3] = 0;
-        break;
-
-    case 0x80000000:
-    {
-        unsigned int max = (info->vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON))
-            ? DEF_MAX_AMDEXT : DEF_MAX_INTELEXT;
-
-        if ( regs[0] > max )
-            regs[0] = max;
-        break;
-    }
-
-    case 0x80000001:
-    {
-        /* Host topology exposed to PV guest.  Provide host CMP_LEGACY value. */
-        bool host_cmp_legacy = regs[2] & bitmaskof(X86_FEATURE_CMP_LEGACY);
-
-        regs[2] = (info->featureset[featureword_of(X86_FEATURE_LAHF_LM)] &
-                   ~bitmaskof(X86_FEATURE_CMP_LEGACY));
-        regs[3] = info->featureset[featureword_of(X86_FEATURE_SYSCALL)];
-
-        if ( host_cmp_legacy )
-            regs[2] |= bitmaskof(X86_FEATURE_CMP_LEGACY);
-
-        break;
-    }
-
-    case 0x80000008:
-        regs[0] &= 0x0000ffffu;
-        regs[1] = info->featureset[featureword_of(X86_FEATURE_CLZERO)];
-        regs[2] = regs[3] = 0;
-        break;
-
-    case 0x00000005: /* MONITOR/MWAIT */
-    case 0x0000000b: /* Extended Topology Enumeration */
-    case 0x8000000a: /* SVM revision and features */
-    case 0x8000001b: /* Instruction Based Sampling */
-    case 0x8000001c: /* Light Weight Profiling */
-    case 0x8000001e: /* Extended topology reporting */
-        regs[0] = regs[1] = regs[2] = regs[3] = 0;
-        break;
-    }
-}
-
-static void xc_cpuid_policy(const struct cpuid_domain_info *info,
-                            const unsigned int *input, unsigned int *regs)
-{
-    /*
-     * For hypervisor leaves (0x4000XXXX) only 0x4000xx00.EAX[7:0] bits (max
-     * number of leaves) can be set by user. Hypervisor will enforce this so
-     * all other bits are don't-care and we can set them to zero.
-     */
-    if ( (input[0] & 0xffff0000) == 0x40000000 )
-    {
-        regs[0] = regs[1] = regs[2] = regs[3] = 0;
-        return;
-    }
-
-    if ( info->hvm )
-        xc_cpuid_hvm_policy(info, input, regs);
-    else
-        xc_cpuid_pv_policy(info, input, regs);
-}
-
-static int xc_cpuid_do_domctl(
-    xc_interface *xch, uint32_t domid,
-    const unsigned int *input, const unsigned int *regs)
-{
-    DECLARE_DOMCTL;
-
-    memset(&domctl, 0, sizeof (domctl));
-    domctl.domain = domid;
-    domctl.cmd = XEN_DOMCTL_set_cpuid;
-    domctl.u.cpuid.input[0] = input[0];
-    domctl.u.cpuid.input[1] = input[1];
-    domctl.u.cpuid.eax = regs[0];
-    domctl.u.cpuid.ebx = regs[1];
-    domctl.u.cpuid.ecx = regs[2];
-    domctl.u.cpuid.edx = regs[3];
-
-    return do_domctl(xch, &domctl);
-}
-
-static void sanitise_featureset(struct cpuid_domain_info *info)
-{
-    const uint32_t fs_size = xc_get_cpu_featureset_size();
-    uint32_t disabled_features[fs_size];
-    static const uint32_t deep_features[] = INIT_DEEP_FEATURES;
-    unsigned int i, b;
-
-    if ( info->hvm )
-    {
-        /* HVM or PVH Guest */
-
-        if ( !info->pae )
-            clear_bit(X86_FEATURE_PAE, info->featureset);
-
-        if ( !info->nestedhvm )
-        {
-            clear_bit(X86_FEATURE_SVM, info->featureset);
-            clear_bit(X86_FEATURE_VMX, info->featureset);
-        }
-    }
-    else
-    {
-        /* PV Guest */
-
-        if ( !info->pv64 )
-        {
-            clear_bit(X86_FEATURE_LM, info->featureset);
-            if ( !(info->vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON)) )
-                clear_bit(X86_FEATURE_SYSCALL, info->featureset);
-        }
-
-        clear_bit(X86_FEATURE_PSE, info->featureset);
-        clear_bit(X86_FEATURE_PSE36, info->featureset);
-        clear_bit(X86_FEATURE_PGE, info->featureset);
-        clear_bit(X86_FEATURE_PAGE1GB, info->featureset);
-    }
-
-    if ( info->xfeature_mask == 0 )
-        clear_bit(X86_FEATURE_XSAVE, info->featureset);
-
-    /* Disable deep dependencies of disabled features. */
-    for ( i = 0; i < ARRAY_SIZE(disabled_features); ++i )
-        disabled_features[i] = ~info->featureset[i] & deep_features[i];
-
-    for ( b = 0; b < sizeof(disabled_features) * CHAR_BIT; ++b )
-    {
-        const uint32_t *dfs;
-
-        if ( !test_bit(b, disabled_features) ||
-             !(dfs = x86_cpuid_lookup_deep_deps(b)) )
-             continue;
-
-        for ( i = 0; i < ARRAY_SIZE(disabled_features); ++i )
-        {
-            info->featureset[i] &= ~dfs[i];
-            disabled_features[i] &= ~dfs[i];
-        }
-    }
-}
-
-int xc_cpuid_apply_policy(xc_interface *xch, uint32_t domid,
-                          const uint32_t *featureset, unsigned int nr_features)
-{
-    struct cpuid_domain_info info = {};
-    unsigned int input[2] = { 0, 0 }, regs[4];
-    unsigned int base_max, ext_max;
-    int rc;
-
-    rc = get_cpuid_domain_info(xch, domid, &info, featureset, nr_features);
-    if ( rc )
-        goto out;
-
-    cpuid(input, regs);
-    base_max = (regs[0] <= DEF_MAX_BASE) ? regs[0] : DEF_MAX_BASE;
-    input[0] = 0x80000000;
-    cpuid(input, regs);
-
-    if ( info.vendor == X86_VENDOR_AMD || info.vendor == X86_VENDOR_HYGON )
-        ext_max = (regs[0] <= DEF_MAX_AMDEXT) ? regs[0] : DEF_MAX_AMDEXT;
-    else
-        ext_max = (regs[0] <= DEF_MAX_INTELEXT) ? regs[0] : DEF_MAX_INTELEXT;
-
-    sanitise_featureset(&info);
-
-    input[0] = 0;
-    input[1] = XEN_CPUID_INPUT_UNUSED;
-    for ( ; ; )
-    {
-        cpuid(input, regs);
-        xc_cpuid_policy(&info, input, regs);
-
-        if ( regs[0] || regs[1] || regs[2] || regs[3] )
-        {
-            rc = xc_cpuid_do_domctl(xch, domid, input, regs);
-            if ( rc )
-                goto out;
-        }
-
-        /* Intel cache descriptor leaves. */
-        if ( input[0] == 4 )
-        {
-            input[1]++;
-            /* More to do? Then loop keeping %%eax==0x00000004. */
-            if ( (regs[0] & 0x1f) != 0 )
-                continue;
-        }
-        /* Extended Topology leaves. */
-        else if ( input[0] == 0xb )
-        {
-            uint8_t level_type = regs[2] >> 8;
-
-            input[1]++;
-            if ( level_type >= 1 && level_type <= 2 )
-                continue;
-        }
-
-        input[0]++;
-        if ( !(input[0] & 0x80000000u) && (input[0] > base_max ) )
-            input[0] = 0x80000000u;
-
-        input[1] = XEN_CPUID_INPUT_UNUSED;
-        if ( (input[0] == 4) || (input[0] == 7) || (input[0] == 0xb) )
-            input[1] = 0;
-        else if ( input[0] == 0xd )
-            input[1] = 1; /* Xen automatically calculates almost everything. */
-
-        if ( (input[0] & 0x80000000u) && (input[0] > ext_max) )
-            break;
-    }
-
- out:
-    free_cpuid_domain_info(&info);
-    return rc;
-}
-
 /*
  * Configure a single input with the informatiom from config.
  *
@@ -1057,3 +449,191 @@  int xc_cpuid_set(
 
     return rc;
 }
+
+int xc_cpuid_apply_policy(xc_interface *xch, uint32_t domid,
+                          const uint32_t *featureset, unsigned int nr_features)
+{
+    int rc;
+    xc_dominfo_t di;
+    unsigned int i, nr_leaves, nr_msrs;
+    xen_cpuid_leaf_t *leaves = NULL;
+    struct cpuid_policy *p = NULL;
+    uint32_t err_leaf = -1, err_subleaf = -1, err_msr = -1;
+
+    if ( xc_domain_getinfo(xch, domid, 1, &di) != 1 ||
+         di.domid != domid )
+    {
+        ERROR("Failed to obtain d%d info", domid);
+        rc = -ESRCH;
+        goto out;
+    }
+
+    rc = xc_get_cpu_policy_size(xch, &nr_leaves, &nr_msrs);
+    if ( rc )
+    {
+        PERROR("Failed to obtain policy info size");
+        rc = -errno;
+        goto out;
+    }
+
+    rc = -ENOMEM;
+    if ( (leaves = calloc(nr_leaves, sizeof(*leaves))) == NULL ||
+         (p = calloc(1, sizeof(*p))) == NULL )
+        goto out;
+
+    nr_msrs = 0;
+    rc = xc_get_domain_cpu_policy(xch, domid, &nr_leaves, leaves,
+                                  &nr_msrs, NULL);
+    if ( rc )
+    {
+        PERROR("Failed to obtain d%d's policy", domid);
+        rc = -errno;
+        goto out;
+    }
+
+    rc = x86_cpuid_copy_from_buffer(p, leaves, nr_leaves,
+                                    &err_leaf, &err_subleaf);
+    if ( rc )
+    {
+        ERROR("Failed to deserialise CPUID (err leaf %#x, subleaf %#x) (%d = %s)",
+              err_leaf, err_subleaf, -rc, strerror(-rc));
+        goto out;
+    }
+
+    if ( featureset )
+    {
+        uint32_t disabled_features[FEATURESET_NR_ENTRIES],
+            feat[FEATURESET_NR_ENTRIES] = {};
+        static const uint32_t deep_features[] = INIT_DEEP_FEATURES;
+        unsigned int i, b;
+
+        /*
+         * The user supplied featureset may be shorter or longer than
+         * FEATURESET_NR_ENTRIES.  Shorter is fine, and we will zero-extend.
+         * Longer is fine, so long as it only padded with zeros.
+         */
+        unsigned int user_len = min(FEATURESET_NR_ENTRIES + 0u, nr_features);
+
+        /* Check for truncated set bits. */
+        rc = -EOPNOTSUPP;
+        for ( i = user_len; i < nr_features; ++i )
+            if ( featureset[i] != 0 )
+                goto out;
+
+        memcpy(feat, featureset, sizeof(*featureset) * user_len);
+
+        /* Disable deep dependencies of disabled features. */
+        for ( i = 0; i < ARRAY_SIZE(disabled_features); ++i )
+            disabled_features[i] = ~feat[i] & deep_features[i];
+
+        for ( b = 0; b < sizeof(disabled_features) * CHAR_BIT; ++b )
+        {
+            const uint32_t *dfs;
+
+            if ( !test_bit(b, disabled_features) ||
+                 !(dfs = x86_cpuid_lookup_deep_deps(b)) )
+                continue;
+
+            for ( i = 0; i < ARRAY_SIZE(disabled_features); ++i )
+            {
+                feat[i] &= ~dfs[i];
+                disabled_features[i] &= ~dfs[i];
+            }
+        }
+
+        cpuid_featureset_to_policy(feat, p);
+    }
+
+    if ( !di.hvm )
+    {
+        uint32_t host_featureset[FEATURESET_NR_ENTRIES];
+        uint32_t len = ARRAY_SIZE(host_featureset);
+
+        rc = xc_get_cpu_featureset(xch, XEN_SYSCTL_cpu_featureset_host,
+                                   &len, host_featureset);
+        if ( rc )
+        {
+            /* Tolerate "buffer too small", as we've got the bits we need. */
+            if ( errno == ENOBUFS )
+                rc = 0;
+            else
+            {
+                PERROR("Failed to obtain host featureset");
+                rc = -errno;
+                goto out;
+            }
+        }
+
+        /*
+         * On hardware without CPUID Faulting, PV guests see real topology.
+         * As a consequence, they also need to see the host htt/cmp fields.
+         */
+        p->basic.htt       = test_bit(X86_FEATURE_HTT, host_featureset);
+        p->extd.cmp_legacy = test_bit(X86_FEATURE_CMP_LEGACY, host_featureset);
+    }
+    else
+    {
+        /*
+         * Topology for HVM guests is entirely controlled by Xen.  For now, we
+         * hardcode APIC_ID = vcpu_id * 2 to give the illusion of no SMT.
+         */
+        p->basic.htt = true;
+        p->extd.cmp_legacy = false;
+
+        p->basic.lppp *= 2;
+
+        switch ( p->x86_vendor )
+        {
+        case X86_VENDOR_INTEL:
+            for ( i = 0; (p->cache.subleaf[i].type &&
+                          i < ARRAY_SIZE(p->cache.raw)); ++i )
+            {
+                p->cache.subleaf[i].cores_per_package =
+                    (p->cache.subleaf[i].cores_per_package << 1) | 1;
+                p->cache.subleaf[i].threads_per_cache = 0;
+            }
+            break;
+
+        case X86_VENDOR_AMD:
+        case X86_VENDOR_HYGON:
+            p->extd.nc = (p->extd.nc << 1) | 1;
+            p->extd.apic_id_size++;
+            break;
+        }
+
+        /*
+         * These settings are necessary to cause earlier HVM_PARAM_NESTEDHVM /
+         * XEN_DOMCTL_disable_migrate settings to be reflected correctly in
+         * CPUID.  Xen will discard these bits if configuration hasn't been
+         * set for the domain.
+         */
+        p->extd.itsc = true;
+        p->basic.vmx = true;
+        p->extd.svm = true;
+    }
+
+    rc = x86_cpuid_copy_to_buffer(p, leaves, &nr_leaves);
+    if ( rc )
+    {
+        ERROR("Failed to serialise CPUID (%d = %s)", -rc, strerror(-rc));
+        goto out;
+    }
+
+    rc = xc_set_domain_cpu_policy(xch, domid, nr_leaves, leaves, 0, NULL,
+                                  &err_leaf, &err_subleaf, &err_msr);
+    if ( rc )
+    {
+        PERROR("Failed to set d%d's policy (err leaf %#x, subleaf %#x, msr %#x)",
+               domid, err_leaf, err_subleaf, err_msr);
+        rc = -errno;
+        goto out;
+    }
+
+    rc = 0;
+
+out:
+    free(p);
+    free(leaves);
+
+    return rc;
+}
diff --git a/xen/include/xen/lib/x86/cpuid.h b/xen/include/xen/lib/x86/cpuid.h
index df5946b6b1..6c86c1a0d0 100644
--- a/xen/include/xen/lib/x86/cpuid.h
+++ b/xen/include/xen/lib/x86/cpuid.h
@@ -154,8 +154,12 @@  struct cpuid_policy
     union {
         struct cpuid_leaf raw[CPUID_GUEST_NR_CACHE];
         struct cpuid_cache_leaf {
-            uint32_t type:5,
-                :27, :32, :32, :32;
+            uint32_t /* a */ type:5, level:3;
+            bool self_init:1, fully_assoc:1;
+            uint32_t :4, threads_per_cache:12, cores_per_package:6;
+            uint32_t /* b */ line_size:12, partitions:10, ways:10;
+            uint32_t /* c */ sets;
+            bool /* d */ wbinvd:1, inclusive:1, complex:1;
         } subleaf[CPUID_GUEST_NR_CACHE];
     } cache;
 
@@ -259,7 +263,8 @@  struct cpuid_policy
                 uint32_t e8b;
                 struct { DECL_BITFIELD(e8b); };
             };
-            uint32_t /* c */:32, /* d */:32;
+            uint32_t nc:8, :4, apic_id_size:4, :16;
+            uint32_t /* d */:32;
         };
     } extd;