@@ -5,6 +5,7 @@
#include <asm/hvm/hvm.h>
#include <asm/hvm/vmx/vmcs.h>
#include <asm/processor.h>
+#include <asm/xstate.h>
const uint32_t known_features[] = INIT_KNOWN_FEATURES;
const uint32_t special_features[] = INIT_SPECIAL_FEATURES;
@@ -336,6 +337,529 @@ int init_domain_cpuid_policy(struct domain *d)
return 0;
}
+static void pv_cpuid(struct cpu_user_regs *regs)
+{
+ uint32_t leaf, subleaf, a, b, c, d;
+ struct vcpu *curr = current;
+ struct domain *currd = curr->domain;
+ const struct cpuid_policy *p = currd->arch.cpuid;
+
+ leaf = a = regs->_eax;
+ b = regs->_ebx;
+ subleaf = c = regs->_ecx;
+ d = regs->_edx;
+
+ if ( !is_control_domain(currd) && !is_hardware_domain(currd) )
+ domain_cpuid(currd, leaf, subleaf, &a, &b, &c, &d);
+ else
+ cpuid_count(leaf, subleaf, &a, &b, &c, &d);
+
+ switch ( leaf )
+ {
+ uint32_t tmp;
+
+ case 0x00000001:
+ c = p->basic._1c;
+ d = p->basic._1d;
+
+ if ( !is_pvh_domain(currd) )
+ {
+ /*
+ * Delete the PVH condition when HVMLite formally replaces PVH,
+ * and HVM guests no longer enter a PV codepath.
+ */
+
+ /*
+ * !!! OSXSAVE handling for PV guests is non-architectural !!!
+ *
+ * Architecturally, the correct code here is simply:
+ *
+ * if ( curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE )
+ * c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
+ *
+ * However, because of bugs in Xen (before c/s bd19080b, Nov 2010,
+ * the XSAVE cpuid flag leaked into guests despite the feature not
+ * being available for use), buggy workarounds were introduced into
+ * Linux (c/s 947ccf9c, also Nov 2010) which relied on the fact
+ * that Xen also incorrectly leaked OSXSAVE into the guest.
+ *
+ * Furthermore, providing architectural OSXSAVE behaviour to many
+ * Linux PV guests triggered a further kernel bug when the
+ * fpu code observes that XSAVEOPT is available, assumes that
+ * xsave state had been set up for the task, and follows a wild
+ * pointer.
+ *
+ * Older Linux PVOPS kernels however do require architectural
+ * behaviour. They observe Xen's leaked OSXSAVE and assume they
+ * can already use XSETBV, dying with a #UD because the shadowed
+ * CR4.OSXSAVE is clear. This behaviour has been adjusted in all
+ * observed cases via stable backports of the above changeset.
+ *
+ * Therefore, the leaking of Xen's OSXSAVE setting has become a
+ * de facto part of the PV ABI and can't reasonably be corrected.
+ * It can however be restricted to only the enlightened CPUID
+ * view, as seen by the guest kernel.
+ *
+ * The following situations and logic now apply:
+ *
+ * - Hardware without CPUID faulting support and native CPUID:
+ * There is nothing Xen can do here. The host's XSAVE flag will
+ * leak through and Xen's OSXSAVE choice will leak through.
+ *
+ * In the case that the guest kernel has not set up OSXSAVE, only
+ * SSE will be set in xcr0, and guest userspace can't do too much
+ * damage itself.
+ *
+ * - Enlightened CPUID or CPUID faulting available:
+ * Xen can fully control what is seen here. Guest kernels need
+ * to see the leaked OSXSAVE via the enlightened path, but
+ * guest userspace and native CPUID are given architectural
+ * behaviour.
+ *
+ * Emulated vs Faulted CPUID is distinguished based on whether a
+ * #UD or #GP is currently being serviced.
+ */
+ /* OSXSAVE clear in policy. Fast-forward CR4 back in. */
+ if ( (curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) ||
+ (regs->entry_vector == TRAP_invalid_op &&
+ guest_kernel_mode(curr, regs) &&
+ (read_cr4() & X86_CR4_OSXSAVE)) )
+ c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
+
+ /*
+ * At the time of writing, a PV domain is the only viable option
+ * for Dom0. Several interactions between dom0 and Xen for real
+ * hardware setup have unfortunately been implemented based on
+ * state which incorrectly leaked into dom0.
+ *
+ * These leaks are retained for backwards compatibility, but
+ * restricted to the hardware domain's kernel only.
+ */
+ if ( is_hardware_domain(currd) && guest_kernel_mode(curr, regs) )
+ {
+ /*
+ * MTRR used to unconditionally leak into PV guests. They cannot
+ * use the MTRR infrastructure at all, and shouldn't be able to
+ * see the feature.
+ *
+ * Modern PVOPS Linux self-clobbers the MTRR feature, to avoid
+ * trying to use the associated MSRs. Xenolinux-based PV dom0s,
+ * however, use the MTRR feature as an indication of the presence
+ * of the XENPF_{add,del,read}_memtype hypercalls.
+ */
+ if ( cpu_has_mtrr )
+ d |= cpufeat_mask(X86_FEATURE_MTRR);
+
+ /*
+ * MONITOR never leaked into PV guests, as PV guests cannot
+ * use the MONITOR/MWAIT instructions. As such, they require
+ * the feature to not be present in emulated CPUID.
+ *
+ * Modern PVOPS Linux try to be cunning and use native CPUID
+ * to see if the hardware actually supports MONITOR, and by
+ * extension, deep C states.
+ *
+ * If the feature is seen, deep-C state information is
+ * obtained from the DSDT and handed back to Xen via the
+ * XENPF_set_processor_pminfo hypercall.
+ *
+ * This mechanism is incompatible with an HVM-based hardware
+ * domain, and also with CPUID Faulting.
+ *
+ * Luckily, Xen can be just as 'cunning', and distinguish an
+ * emulated CPUID from a faulted CPUID by whether a #UD or #GP
+ * fault is currently being serviced. Yuck...
+ */
+ if ( cpu_has_monitor && regs->entry_vector == TRAP_gp_fault )
+ c |= cpufeat_mask(X86_FEATURE_MONITOR);
+
+ /*
+ * While MONITOR never leaked into PV guests, EIST always used
+ * to.
+ *
+ * Modern PVOPS will only parse P state information from the
+ * DSDT and return it to Xen if EIST is seen in the emulated
+ * CPUID information.
+ */
+ if ( cpu_has_eist )
+ c |= cpufeat_mask(X86_FEATURE_EIST);
+ }
+ }
+
+ if ( vpmu_enabled(curr) &&
+ vpmu_is_set(vcpu_vpmu(curr), VPMU_CPU_HAS_DS) )
+ {
+ d |= cpufeat_mask(X86_FEATURE_DS);
+ if ( cpu_has(&current_cpu_data, X86_FEATURE_DTES64) )
+ c |= cpufeat_mask(X86_FEATURE_DTES64);
+ if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
+ c |= cpufeat_mask(X86_FEATURE_DSCPL);
+ }
+ break;
+
+ case 0x0000000a: /* Architectural Performance Monitor Features (Intel) */
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ !vpmu_enabled(curr) )
+ goto unsupported;
+
+ /* Report at most version 3 since that's all we currently emulate. */
+ if ( (a & 0xff) > 3 )
+ a = (a & ~0xff) | 3;
+ break;
+
+ case XSTATE_CPUID:
+ if ( !p->basic.xsave || subleaf >= 63 )
+ goto unsupported;
+ switch ( subleaf )
+ {
+ case 0:
+ {
+ uint64_t xfeature_mask = XSTATE_FP_SSE;
+ uint32_t xstate_size = XSTATE_AREA_MIN_SIZE;
+
+ if ( p->basic.avx )
+ {
+ xfeature_mask |= XSTATE_YMM;
+ xstate_size = (xstate_offsets[_XSTATE_YMM] +
+ xstate_sizes[_XSTATE_YMM]);
+ }
+
+ if ( p->feat.avx512f )
+ {
+ xfeature_mask |= XSTATE_OPMASK | XSTATE_ZMM | XSTATE_HI_ZMM;
+ xstate_size = max(xstate_size,
+ xstate_offsets[_XSTATE_OPMASK] +
+ xstate_sizes[_XSTATE_OPMASK]);
+ xstate_size = max(xstate_size,
+ xstate_offsets[_XSTATE_ZMM] +
+ xstate_sizes[_XSTATE_ZMM]);
+ xstate_size = max(xstate_size,
+ xstate_offsets[_XSTATE_HI_ZMM] +
+ xstate_sizes[_XSTATE_HI_ZMM]);
+ }
+
+ a = (uint32_t)xfeature_mask;
+ d = (uint32_t)(xfeature_mask >> 32);
+ c = xstate_size;
+
+ /*
+ * Always read CPUID.0xD[ECX=0].EBX from hardware, rather than
+ * domain policy. It varies with enabled xstate, and the correct
+ * xcr0 is in context.
+ */
+ cpuid_count(leaf, subleaf, &tmp, &b, &tmp, &tmp);
+ break;
+ }
+
+ case 1:
+ a = p->xstate.Da1;
+ b = c = d = 0;
+ break;
+ }
+ break;
+
+ case 0x80000001:
+ c = p->extd.e1c;
+ d = p->extd.e1d;
+
+ /* If not emulating AMD, clear the duplicated features in e1d. */
+ if ( currd->arch.x86_vendor != X86_VENDOR_AMD )
+ d &= ~CPUID_COMMON_1D_FEATURES;
+
+ /*
+ * MTRR used to unconditionally leak into PV guests. They cannot use the
+ * MTRR infrastructure at all, and shouldn't be able to see the feature.
+ *
+ * Modern PVOPS Linux self-clobbers the MTRR feature, to avoid trying
+ * to use the associated MSRs. Xenolinux-based PV dom0s, however, use
+ * the MTRR feature as an indication of the presence of the
+ * XENPF_{add,del,read}_memtype hypercalls.
+ */
+ if ( is_hardware_domain(currd) && guest_kernel_mode(curr, regs) &&
+ cpu_has_mtrr )
+ d |= cpufeat_mask(X86_FEATURE_MTRR);
+ break;
+
+ case 0x80000007:
+ d = p->extd.e7d;
+ break;
+
+ case 0x80000008:
+ a = paddr_bits | (vaddr_bits << 8);
+ b = p->extd.e8b;
+ break;
+
+ case 0x00000005: /* MONITOR/MWAIT */
+ case 0x0000000b: /* Extended Topology Enumeration */
+ case 0x8000000a: /* SVM revision and features */
+ case 0x8000001b: /* Instruction Based Sampling */
+ case 0x8000001c: /* Light Weight Profiling */
+ case 0x8000001e: /* Extended topology reporting */
+ unsupported:
+ a = b = c = d = 0;
+ break;
+
+ case 0x7:
+ ASSERT_UNREACHABLE();
+ /* Now handled in guest_cpuid(). */
+ }
+
+ regs->rax = a;
+ regs->rbx = b;
+ regs->rcx = c;
+ regs->rdx = d;
+}
+
+static void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ struct vcpu *v = current;
+ struct domain *d = v->domain;
+ const struct cpuid_policy *p = d->arch.cpuid;
+ unsigned int count, dummy = 0;
+
+ if ( !eax )
+ eax = &dummy;
+ if ( !ebx )
+ ebx = &dummy;
+ if ( !ecx )
+ ecx = &dummy;
+ count = *ecx;
+ if ( !edx )
+ edx = &dummy;
+
+ domain_cpuid(d, input, count, eax, ebx, ecx, edx);
+
+ switch ( input )
+ {
+ case 0x1:
+ /* Fix up VLAPIC details. */
+ *ebx &= 0x00FFFFFFu;
+ *ebx |= (v->vcpu_id * 2) << 24;
+
+ *ecx = p->basic._1c;
+ *edx = p->basic._1d;
+
+ /* APIC exposed to guests, but Fast-forward MSR_APIC_BASE.EN back in. */
+ if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
+ *edx &= ~cpufeat_bit(X86_FEATURE_APIC);
+
+ /* OSXSAVE clear in policy. Fast-forward CR4 back in. */
+ if ( v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSXSAVE )
+ *ecx |= cpufeat_mask(X86_FEATURE_OSXSAVE);
+
+ /*
+ * PSE36 is not supported in shadow mode. This bit should be
+ * unilaterally cleared.
+ *
+ * However, an unspecified version of Hyper-V from 2011 refuses
+ * to start as the "cpu does not provide required hw features" if
+ * it can't see PSE36.
+ *
+ * As a workaround, leak the toolstack-provided PSE36 value into a
+ * shadow guest if the guest is already using PAE paging (and won't
+ * care about reverting back to PSE paging). Otherwise, nobble it, so
+ * a 32bit guest doesn't get the impression that it could try to use
+ * PSE36 paging.
+ */
+ if ( !hap_enabled(d) && !(hvm_pae_enabled(v) || hvm_long_mode_enabled(v)) )
+ *edx &= ~cpufeat_mask(X86_FEATURE_PSE36);
+
+ if ( vpmu_enabled(v) &&
+ vpmu_is_set(vcpu_vpmu(v), VPMU_CPU_HAS_DS) )
+ {
+ *edx |= cpufeat_mask(X86_FEATURE_DS);
+ if ( cpu_has(&current_cpu_data, X86_FEATURE_DTES64) )
+ *ecx |= cpufeat_mask(X86_FEATURE_DTES64);
+ if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
+ *ecx |= cpufeat_mask(X86_FEATURE_DSCPL);
+ }
+
+ break;
+
+ case 0xb:
+ /* Fix the x2APIC identifier. */
+ *edx = v->vcpu_id * 2;
+ break;
+
+ case XSTATE_CPUID:
+ if ( !p->basic.xsave || count >= 63 )
+ {
+ *eax = *ebx = *ecx = *edx = 0;
+ break;
+ }
+ switch ( count )
+ {
+ case 0:
+ {
+ uint64_t xfeature_mask = XSTATE_FP_SSE;
+ uint32_t xstate_size = XSTATE_AREA_MIN_SIZE;
+
+ if ( p->basic.avx )
+ {
+ xfeature_mask |= XSTATE_YMM;
+ xstate_size = max(xstate_size,
+ xstate_offsets[_XSTATE_YMM] +
+ xstate_sizes[_XSTATE_YMM]);
+ }
+
+ if ( p->feat.mpx )
+ {
+ xfeature_mask |= XSTATE_BNDREGS | XSTATE_BNDCSR;
+ xstate_size = max(xstate_size,
+ xstate_offsets[_XSTATE_BNDCSR] +
+ xstate_sizes[_XSTATE_BNDCSR]);
+ }
+
+ if ( p->feat.avx512f )
+ {
+ xfeature_mask |= XSTATE_OPMASK | XSTATE_ZMM | XSTATE_HI_ZMM;
+ xstate_size = max(xstate_size,
+ xstate_offsets[_XSTATE_OPMASK] +
+ xstate_sizes[_XSTATE_OPMASK]);
+ xstate_size = max(xstate_size,
+ xstate_offsets[_XSTATE_ZMM] +
+ xstate_sizes[_XSTATE_ZMM]);
+ xstate_size = max(xstate_size,
+ xstate_offsets[_XSTATE_HI_ZMM] +
+ xstate_sizes[_XSTATE_HI_ZMM]);
+ }
+
+ if ( p->feat.pku )
+ {
+ xfeature_mask |= XSTATE_PKRU;
+ xstate_size = max(xstate_size,
+ xstate_offsets[_XSTATE_PKRU] +
+ xstate_sizes[_XSTATE_PKRU]);
+ }
+
+ if ( p->extd.lwp )
+ {
+ xfeature_mask |= XSTATE_LWP;
+ xstate_size = max(xstate_size,
+ xstate_offsets[_XSTATE_LWP] +
+ xstate_sizes[_XSTATE_LWP]);
+ }
+
+ *eax = (uint32_t)xfeature_mask;
+ *edx = (uint32_t)(xfeature_mask >> 32);
+ *ecx = xstate_size;
+
+ /*
+ * Always read CPUID[0xD,0].EBX from hardware, rather than domain
+ * policy. It varies with enabled xstate, and the correct xcr0 is
+ * in context.
+ */
+ cpuid_count(input, count, &dummy, ebx, &dummy, &dummy);
+ break;
+ }
+
+ case 1:
+ *eax = p->xstate.Da1;
+
+ if ( p->xstate.xsaves )
+ {
+ /*
+ * Always read CPUID[0xD,1].EBX from hardware, rather than
+ * domain policy. It varies with enabled xstate, and the
+ * correct xcr0/xss are in context.
+ */
+ cpuid_count(input, count, &dummy, ebx, &dummy, &dummy);
+ }
+ else
+ *ebx = 0;
+
+ *ecx = *edx = 0;
+ break;
+ }
+ break;
+
+ case 0x0000000a: /* Architectural Performance Monitor Features (Intel) */
+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || !vpmu_enabled(v) )
+ {
+ *eax = *ebx = *ecx = *edx = 0;
+ break;
+ }
+
+ /* Report at most version 3 since that's all we currently emulate */
+ if ( (*eax & 0xff) > 3 )
+ *eax = (*eax & ~0xff) | 3;
+ break;
+
+ case 0x80000001:
+ *ecx = p->extd.e1c;
+ *edx = p->extd.e1d;
+
+ /* If not emulating AMD, clear the duplicated features in e1d. */
+ if ( d->arch.x86_vendor != X86_VENDOR_AMD )
+ *edx &= ~CPUID_COMMON_1D_FEATURES;
+ /* fast-forward MSR_APIC_BASE.EN if it hasn't already been clobbered. */
+ else if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
+ *edx &= ~cpufeat_bit(X86_FEATURE_APIC);
+
+ /*
+ * PSE36 is not supported in shadow mode. This bit should be
+ * unilaterally cleared.
+ *
+ * However, an unspecified version of Hyper-V from 2011 refuses
+ * to start as the "cpu does not provide required hw features" if
+ * it can't see PSE36.
+ *
+ * As a workaround, leak the toolstack-provided PSE36 value into a
+ * shadow guest if the guest is already using PAE paging (and won't
+ * care about reverting back to PSE paging). Otherwise, nobble it, so
+ * a 32bit guest doesn't get the impression that it could try to use
+ * PSE36 paging.
+ */
+ if ( !hap_enabled(d) && !(hvm_pae_enabled(v) || hvm_long_mode_enabled(v)) )
+ *edx &= ~cpufeat_mask(X86_FEATURE_PSE36);
+
+ /* SYSCALL is hidden outside of long mode on Intel. */
+ if ( d->arch.x86_vendor == X86_VENDOR_INTEL &&
+ !hvm_long_mode_enabled(v))
+ *edx &= ~cpufeat_mask(X86_FEATURE_SYSCALL);
+
+ break;
+
+ case 0x80000007:
+ *edx = p->extd.e7d;
+ break;
+
+ case 0x80000008:
+ *eax &= 0xff;
+ count = d->arch.paging.gfn_bits + PAGE_SHIFT;
+ if ( *eax > count )
+ *eax = count;
+
+ count = (p->basic.pae || p->basic.pse36) ? 36 : 32;
+ if ( *eax < count )
+ *eax = count;
+
+ *eax |= (p->extd.lm ? vaddr_bits : 32) << 8;
+
+ *ebx = p->extd.e8b;
+ break;
+
+ case 0x8000001c:
+ if ( !cpu_has_svm )
+ {
+ *eax = *ebx = *ecx = *edx = 0;
+ break;
+ }
+
+ if ( cpu_has_lwp && (v->arch.xcr0 & XSTATE_LWP) )
+ /* Turn on available bit and other features specified in lwp_cfg. */
+ *eax = (*edx & v->arch.hvm_svm.guest_lwp_cfg) | 1;
+ else
+ *eax = 0;
+ break;
+
+ case 0x7:
+ ASSERT_UNREACHABLE();
+ /* Now handled in guest_cpuid(). */
+ }
+}
+
void guest_cpuid(const struct vcpu *v, unsigned int leaf,
unsigned int subleaf, struct cpuid_leaf *res)
{
@@ -3287,256 +3287,6 @@ unsigned long copy_from_user_hvm(void *to, const void *from, unsigned len)
return rc ? len : 0; /* fake a copy_from_user() return code */
}
-void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
-{
- struct vcpu *v = current;
- struct domain *d = v->domain;
- const struct cpuid_policy *p = d->arch.cpuid;
- unsigned int count, dummy = 0;
-
- if ( !eax )
- eax = &dummy;
- if ( !ebx )
- ebx = &dummy;
- if ( !ecx )
- ecx = &dummy;
- count = *ecx;
- if ( !edx )
- edx = &dummy;
-
- domain_cpuid(d, input, count, eax, ebx, ecx, edx);
-
- switch ( input )
- {
- case 0x1:
- /* Fix up VLAPIC details. */
- *ebx &= 0x00FFFFFFu;
- *ebx |= (v->vcpu_id * 2) << 24;
-
- *ecx = p->basic._1c;
- *edx = p->basic._1d;
-
- /* APIC exposed to guests, but Fast-forward MSR_APIC_BASE.EN back in. */
- if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
- *edx &= ~cpufeat_bit(X86_FEATURE_APIC);
-
- /* OSXSAVE clear in policy. Fast-forward CR4 back in. */
- if ( v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSXSAVE )
- *ecx |= cpufeat_mask(X86_FEATURE_OSXSAVE);
-
- /*
- * PSE36 is not supported in shadow mode. This bit should be
- * unilaterally cleared.
- *
- * However, an unspecified version of Hyper-V from 2011 refuses
- * to start as the "cpu does not provide required hw features" if
- * it can't see PSE36.
- *
- * As a workaround, leak the toolstack-provided PSE36 value into a
- * shadow guest if the guest is already using PAE paging (and won't
- * care about reverting back to PSE paging). Otherwise, knoble it, so
- * a 32bit guest doesn't get the impression that it could try to use
- * PSE36 paging.
- */
- if ( !hap_enabled(d) && !(hvm_pae_enabled(v) || hvm_long_mode_enabled(v)) )
- *edx &= ~cpufeat_mask(X86_FEATURE_PSE36);
-
- if ( vpmu_enabled(v) &&
- vpmu_is_set(vcpu_vpmu(v), VPMU_CPU_HAS_DS) )
- {
- *edx |= cpufeat_mask(X86_FEATURE_DS);
- if ( cpu_has(&current_cpu_data, X86_FEATURE_DTES64) )
- *ecx |= cpufeat_mask(X86_FEATURE_DTES64);
- if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
- *ecx |= cpufeat_mask(X86_FEATURE_DSCPL);
- }
-
- break;
-
- case 0xb:
- /* Fix the x2APIC identifier. */
- *edx = v->vcpu_id * 2;
- break;
-
- case XSTATE_CPUID:
- if ( !p->basic.xsave || count >= 63 )
- {
- *eax = *ebx = *ecx = *edx = 0;
- break;
- }
- switch ( count )
- {
- case 0:
- {
- uint64_t xfeature_mask = XSTATE_FP_SSE;
- uint32_t xstate_size = XSTATE_AREA_MIN_SIZE;
-
- if ( p->basic.avx )
- {
- xfeature_mask |= XSTATE_YMM;
- xstate_size = max(xstate_size,
- xstate_offsets[_XSTATE_YMM] +
- xstate_sizes[_XSTATE_YMM]);
- }
-
- if ( p->feat.mpx )
- {
- xfeature_mask |= XSTATE_BNDREGS | XSTATE_BNDCSR;
- xstate_size = max(xstate_size,
- xstate_offsets[_XSTATE_BNDCSR] +
- xstate_sizes[_XSTATE_BNDCSR]);
- }
-
- if ( p->feat.avx512f )
- {
- xfeature_mask |= XSTATE_OPMASK | XSTATE_ZMM | XSTATE_HI_ZMM;
- xstate_size = max(xstate_size,
- xstate_offsets[_XSTATE_OPMASK] +
- xstate_sizes[_XSTATE_OPMASK]);
- xstate_size = max(xstate_size,
- xstate_offsets[_XSTATE_ZMM] +
- xstate_sizes[_XSTATE_ZMM]);
- xstate_size = max(xstate_size,
- xstate_offsets[_XSTATE_HI_ZMM] +
- xstate_sizes[_XSTATE_HI_ZMM]);
- }
-
- if ( p->feat.pku )
- {
- xfeature_mask |= XSTATE_PKRU;
- xstate_size = max(xstate_size,
- xstate_offsets[_XSTATE_PKRU] +
- xstate_sizes[_XSTATE_PKRU]);
- }
-
- if ( p->extd.lwp )
- {
- xfeature_mask |= XSTATE_LWP;
- xstate_size = max(xstate_size,
- xstate_offsets[_XSTATE_LWP] +
- xstate_sizes[_XSTATE_LWP]);
- }
-
- *eax = (uint32_t)xfeature_mask;
- *edx = (uint32_t)(xfeature_mask >> 32);
- *ecx = xstate_size;
-
- /*
- * Always read CPUID[0xD,0].EBX from hardware, rather than domain
- * policy. It varies with enabled xstate, and the correct xcr0 is
- * in context.
- */
- cpuid_count(input, count, &dummy, ebx, &dummy, &dummy);
- break;
- }
-
- case 1:
- *eax = p->xstate.Da1;
-
- if ( p->xstate.xsaves )
- {
- /*
- * Always read CPUID[0xD,1].EBX from hardware, rather than
- * domain policy. It varies with enabled xstate, and the
- * correct xcr0/xss are in context.
- */
- cpuid_count(input, count, &dummy, ebx, &dummy, &dummy);
- }
- else
- *ebx = 0;
-
- *ecx = *edx = 0;
- break;
- }
- break;
-
- case 0x0000000a: /* Architectural Performance Monitor Features (Intel) */
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || !vpmu_enabled(v) )
- {
- *eax = *ebx = *ecx = *edx = 0;
- break;
- }
-
- /* Report at most version 3 since that's all we currently emulate */
- if ( (*eax & 0xff) > 3 )
- *eax = (*eax & ~0xff) | 3;
- break;
-
- case 0x80000001:
- *ecx = p->extd.e1c;
- *edx = p->extd.e1d;
-
- /* If not emulating AMD, clear the duplicated features in e1d. */
- if ( d->arch.x86_vendor != X86_VENDOR_AMD )
- *edx &= ~CPUID_COMMON_1D_FEATURES;
- /* fast-forward MSR_APIC_BASE.EN if it hasn't already been clobbered. */
- else if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
- *edx &= ~cpufeat_bit(X86_FEATURE_APIC);
-
- /*
- * PSE36 is not supported in shadow mode. This bit should be
- * unilaterally cleared.
- *
- * However, an unspecified version of Hyper-V from 2011 refuses
- * to start as the "cpu does not provide required hw features" if
- * it can't see PSE36.
- *
- * As a workaround, leak the toolstack-provided PSE36 value into a
- * shadow guest if the guest is already using PAE paging (and won't
- * care about reverting back to PSE paging). Otherwise, knoble it, so
- * a 32bit guest doesn't get the impression that it could try to use
- * PSE36 paging.
- */
- if ( !hap_enabled(d) && !(hvm_pae_enabled(v) || hvm_long_mode_enabled(v)) )
- *edx &= ~cpufeat_mask(X86_FEATURE_PSE36);
-
- /* SYSCALL is hidden outside of long mode on Intel. */
- if ( d->arch.x86_vendor == X86_VENDOR_INTEL &&
- !hvm_long_mode_enabled(v))
- *edx &= ~cpufeat_mask(X86_FEATURE_SYSCALL);
-
- break;
-
- case 0x80000007:
- *edx = p->extd.e7d;
- break;
-
- case 0x80000008:
- *eax &= 0xff;
- count = d->arch.paging.gfn_bits + PAGE_SHIFT;
- if ( *eax > count )
- *eax = count;
-
- count = (p->basic.pae || p->basic.pse36) ? 36 : 32;
- if ( *eax < count )
- *eax = count;
-
- *eax |= (p->extd.lm ? vaddr_bits : 32) << 8;
-
- *ebx = p->extd.e8b;
- break;
-
- case 0x8000001c:
- if ( !cpu_has_svm )
- {
- *eax = *ebx = *ecx = *edx = 0;
- break;
- }
-
- if ( cpu_has_lwp && (v->arch.xcr0 & XSTATE_LWP) )
- /* Turn on available bit and other features specified in lwp_cfg. */
- *eax = (*edx & v->arch.hvm_svm.guest_lwp_cfg) | 1;
- else
- *eax = 0;
- break;
-
- case 0x7:
- ASSERT_UNREACHABLE();
- /* Now handled in guest_cpuid(). */
- }
-}
-
bool hvm_check_cpuid_faulting(struct vcpu *v)
{
if ( !v->arch.cpuid_faulting )
@@ -1020,279 +1020,6 @@ void cpuid_hypervisor_leaves(const struct vcpu *v, unsigned int leaf,
}
}
-void pv_cpuid(struct cpu_user_regs *regs)
-{
- uint32_t leaf, subleaf, a, b, c, d;
- struct vcpu *curr = current;
- struct domain *currd = curr->domain;
- const struct cpuid_policy *p = currd->arch.cpuid;
-
- leaf = a = regs->_eax;
- b = regs->_ebx;
- subleaf = c = regs->_ecx;
- d = regs->_edx;
-
- if ( !is_control_domain(currd) && !is_hardware_domain(currd) )
- domain_cpuid(currd, leaf, subleaf, &a, &b, &c, &d);
- else
- cpuid_count(leaf, subleaf, &a, &b, &c, &d);
-
- switch ( leaf )
- {
- uint32_t tmp;
-
- case 0x00000001:
- c = p->basic._1c;
- d = p->basic._1d;
-
- if ( !is_pvh_domain(currd) )
- {
- /*
- * Delete the PVH condition when HVMLite formally replaces PVH,
- * and HVM guests no longer enter a PV codepath.
- */
-
- /*
- * !!! OSXSAVE handling for PV guests is non-architectural !!!
- *
- * Architecturally, the correct code here is simply:
- *
- * if ( curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE )
- * c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
- *
- * However because of bugs in Xen (before c/s bd19080b, Nov 2010,
- * the XSAVE cpuid flag leaked into guests despite the feature not
- * being available for use), buggy workarounds where introduced to
- * Linux (c/s 947ccf9c, also Nov 2010) which relied on the fact
- * that Xen also incorrectly leaked OSXSAVE into the guest.
- *
- * Furthermore, providing architectural OSXSAVE behaviour to a
- * many Linux PV guests triggered a further kernel bug when the
- * fpu code observes that XSAVEOPT is available, assumes that
- * xsave state had been set up for the task, and follows a wild
- * pointer.
- *
- * Older Linux PVOPS kernels however do require architectural
- * behaviour. They observe Xen's leaked OSXSAVE and assume they
- * can already use XSETBV, dying with a #UD because the shadowed
- * CR4.OSXSAVE is clear. This behaviour has been adjusted in all
- * observed cases via stable backports of the above changeset.
- *
- * Therefore, the leaking of Xen's OSXSAVE setting has become a
- * defacto part of the PV ABI and can't reasonably be corrected.
- * It can however be restricted to only the enlightened CPUID
- * view, as seen by the guest kernel.
- *
- * The following situations and logic now applies:
- *
- * - Hardware without CPUID faulting support and native CPUID:
- * There is nothing Xen can do here. The hosts XSAVE flag will
- * leak through and Xen's OSXSAVE choice will leak through.
- *
- * In the case that the guest kernel has not set up OSXSAVE, only
- * SSE will be set in xcr0, and guest userspace can't do too much
- * damage itself.
- *
- * - Enlightened CPUID or CPUID faulting available:
- * Xen can fully control what is seen here. Guest kernels need
- * to see the leaked OSXSAVE via the enlightened path, but
- * guest userspace and the native is given architectural
- * behaviour.
- *
- * Emulated vs Faulted CPUID is distinguised based on whether a
- * #UD or #GP is currently being serviced.
- */
- /* OSXSAVE clear in policy. Fast-forward CR4 back in. */
- if ( (curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) ||
- (regs->entry_vector == TRAP_invalid_op &&
- guest_kernel_mode(curr, regs) &&
- (read_cr4() & X86_CR4_OSXSAVE)) )
- c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
-
- /*
- * At the time of writing, a PV domain is the only viable option
- * for Dom0. Several interactions between dom0 and Xen for real
- * hardware setup have unfortunately been implemented based on
- * state which incorrectly leaked into dom0.
- *
- * These leaks are retained for backwards compatibility, but
- * restricted to the hardware domains kernel only.
- */
- if ( is_hardware_domain(currd) && guest_kernel_mode(curr, regs) )
- {
- /*
- * MTRR used to unconditionally leak into PV guests. They
- * cannot MTRR infrastructure at all, and shouldn't be able to
- * see the feature.
- *
- * Modern PVOPS Linux self-clobbers the MTRR feature, to avoid
- * trying to use the associated MSRs. Xenolinux-based PV dom0's
- * however use the MTRR feature as an indication of the presence
- * of the XENPF_{add,del,read}_memtype hypercalls.
- */
- if ( cpu_has_mtrr )
- d |= cpufeat_mask(X86_FEATURE_MTRR);
-
- /*
- * MONITOR never leaked into PV guests, as PV guests cannot
- * use the MONITOR/MWAIT instructions. As such, they require
- * the feature to not being present in emulated CPUID.
- *
- * Modern PVOPS Linux try to be cunning and use native CPUID
- * to see if the hardware actually supports MONITOR, and by
- * extension, deep C states.
- *
- * If the feature is seen, deep-C state information is
- * obtained from the DSDT and handed back to Xen via the
- * XENPF_set_processor_pminfo hypercall.
- *
- * This mechanism is incompatible with an HVM-based hardware
- * domain, and also with CPUID Faulting.
- *
- * Luckily, Xen can be just as 'cunning', and distinguish an
- * emulated CPUID from a faulted CPUID by whether a #UD or #GP
- * fault is currently being serviced. Yuck...
- */
- if ( cpu_has_monitor && regs->entry_vector == TRAP_gp_fault )
- c |= cpufeat_mask(X86_FEATURE_MONITOR);
-
- /*
- * While MONITOR never leaked into PV guests, EIST always used
- * to.
- *
- * Modern PVOPS will only parse P state information from the
- * DSDT and return it to Xen if EIST is seen in the emulated
- * CPUID information.
- */
- if ( cpu_has_eist )
- c |= cpufeat_mask(X86_FEATURE_EIST);
- }
- }
-
- if ( vpmu_enabled(curr) &&
- vpmu_is_set(vcpu_vpmu(curr), VPMU_CPU_HAS_DS) )
- {
- d |= cpufeat_mask(X86_FEATURE_DS);
- if ( cpu_has(&current_cpu_data, X86_FEATURE_DTES64) )
- c |= cpufeat_mask(X86_FEATURE_DTES64);
- if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
- c |= cpufeat_mask(X86_FEATURE_DSCPL);
- }
- break;
-
- case 0x0000000a: /* Architectural Performance Monitor Features (Intel) */
- if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
- !vpmu_enabled(curr) )
- goto unsupported;
-
- /* Report at most version 3 since that's all we currently emulate. */
- if ( (a & 0xff) > 3 )
- a = (a & ~0xff) | 3;
- break;
-
- case XSTATE_CPUID:
- if ( !p->basic.xsave || subleaf >= 63 )
- goto unsupported;
- switch ( subleaf )
- {
- case 0:
- {
- uint64_t xfeature_mask = XSTATE_FP_SSE;
- uint32_t xstate_size = XSTATE_AREA_MIN_SIZE;
-
- if ( p->basic.avx )
- {
- xfeature_mask |= XSTATE_YMM;
- xstate_size = (xstate_offsets[_XSTATE_YMM] +
- xstate_sizes[_XSTATE_YMM]);
- }
-
- if ( p->feat.avx512f )
- {
- xfeature_mask |= XSTATE_OPMASK | XSTATE_ZMM | XSTATE_HI_ZMM;
- xstate_size = max(xstate_size,
- xstate_offsets[_XSTATE_OPMASK] +
- xstate_sizes[_XSTATE_OPMASK]);
- xstate_size = max(xstate_size,
- xstate_offsets[_XSTATE_ZMM] +
- xstate_sizes[_XSTATE_ZMM]);
- xstate_size = max(xstate_size,
- xstate_offsets[_XSTATE_HI_ZMM] +
- xstate_sizes[_XSTATE_HI_ZMM]);
- }
-
- a = (uint32_t)xfeature_mask;
- d = (uint32_t)(xfeature_mask >> 32);
- c = xstate_size;
-
- /*
- * Always read CPUID.0xD[ECX=0].EBX from hardware, rather than
- * domain policy. It varies with enabled xstate, and the correct
- * xcr0 is in context.
- */
- cpuid_count(leaf, subleaf, &tmp, &b, &tmp, &tmp);
- break;
- }
-
- case 1:
- a = p->xstate.Da1;
- b = c = d = 0;
- break;
- }
- break;
-
- case 0x80000001:
- c = p->extd.e1c;
- d = p->extd.e1d;
-
- /* If not emulating AMD, clear the duplicated features in e1d. */
- if ( currd->arch.x86_vendor != X86_VENDOR_AMD )
- d &= ~CPUID_COMMON_1D_FEATURES;
-
- /*
- * MTRR used to unconditionally leak into PV guests. They cannot MTRR
- * infrastructure at all, and shouldn't be able to see the feature.
- *
- * Modern PVOPS Linux self-clobbers the MTRR feature, to avoid trying
- * to use the associated MSRs. Xenolinux-based PV dom0's however use
- * the MTRR feature as an indication of the presence of the
- * XENPF_{add,del,read}_memtype hypercalls.
- */
- if ( is_hardware_domain(currd) && guest_kernel_mode(curr, regs) &&
- cpu_has_mtrr )
- d |= cpufeat_mask(X86_FEATURE_MTRR);
- break;
-
- case 0x80000007:
- d = p->extd.e7d;
- break;
-
- case 0x80000008:
- a = paddr_bits | (vaddr_bits << 8);
- b = p->extd.e8b;
- break;
-
- case 0x00000005: /* MONITOR/MWAIT */
- case 0x0000000b: /* Extended Topology Enumeration */
- case 0x8000000a: /* SVM revision and features */
- case 0x8000001b: /* Instruction Based Sampling */
- case 0x8000001c: /* Light Weight Profiling */
- case 0x8000001e: /* Extended topology reporting */
- unsupported:
- a = b = c = d = 0;
- break;
-
- case 0x7:
- ASSERT_UNREACHABLE();
- /* Now handled in guest_cpuid(). */
- }
-
- regs->rax = a;
- regs->rbx = b;
- regs->rcx = c;
- regs->rdx = d;
-}
-
static int emulate_invalid_rdtscp(struct cpu_user_regs *regs)
{
char opcode[3];
@@ -392,8 +392,6 @@ bool hvm_set_guest_bndcfgs(struct vcpu *v, u64 val);
#define has_viridian_apic_assist(d) \
(is_viridian_domain(d) && (viridian_feature_mask(d) & HVMPV_apic_assist))
-void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx);
bool hvm_check_cpuid_faulting(struct vcpu *v);
void hvm_migrate_timers(struct vcpu *v);
void hvm_do_resume(struct vcpu *v);
@@ -627,8 +627,6 @@ enum get_cpu_vendor {
int get_cpu_vendor(uint32_t b, uint32_t c, uint32_t d, enum get_cpu_vendor mode);
uint8_t get_cpu_family(uint32_t raw, uint8_t *model, uint8_t *stepping);
-void pv_cpuid(struct cpu_user_regs *regs);
-
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_X86_PROCESSOR_H */
All callers of pv_cpuid() and hvm_cpuid() (other than the guest_cpuid()
legacy path) have been removed from the codebase.

Move them into cpuid.c to avoid any further use, leaving guest_cpuid() as
the sole API to use.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
---
 xen/arch/x86/cpuid.c            | 524 ++++++++++++++++++++++++++++++++++++++++
 xen/arch/x86/hvm/hvm.c          | 250 -------------------------
 xen/arch/x86/traps.c            | 273 ---------------------------
 xen/include/asm-x86/hvm/hvm.h   |   2 -
 xen/include/asm-x86/processor.h |   2 -
 5 files changed, 524 insertions(+), 527 deletions(-)
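
Illustrative note (not part of the patch): a minimal sketch of how a
remaining caller would now be expected to query a leaf, assuming the
guest_cpuid() signature from the cpuid.c hunk above.  The struct
cpuid_leaf field names (a/b/c/d), the example leaf, and the printk are
assumptions for illustration only.

    struct cpuid_leaf res;

    /* Ask the current vCPU's policy for leaf 0x80000008, subleaf 0. */
    guest_cpuid(current, 0x80000008, 0, &res);

    /* Bits 0-7 of EAX hold the guest-visible physical address width. */
    printk("guest maxphysaddr: %u bits\n", res.a & 0xff);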