@@ -129,7 +129,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u64 tsc_offset;
u32 asid;
u8 tlb_ctl;
- u8 reserved_2[3];
+ u8 erap_ctl;
+ u8 reserved_2[2];
u32 int_ctl;
u32 int_vector;
u32 int_state;
@@ -175,6 +176,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define TLB_CONTROL_FLUSH_ASID 3
#define TLB_CONTROL_FLUSH_ASID_LOCAL 7
+#define ERAP_CONTROL_ALLOW_LARGER_RAP 0
+#define ERAP_CONTROL_FLUSH_RAP 1
+
#define V_TPR_MASK 0x0f
#define V_IRQ_SHIFT 8
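For illustration, the two new control bits are meant to be driven as in the sketch below. The helper name erap_enable_and_flush() is hypothetical and not part of this patch; BIT() is the kernel's <linux/bits.h> macro:

    /* Sketch only: program both ERAP controls on a VMCB. */
    static inline void erap_enable_and_flush(struct vmcb_control_area *c)
    {
        c->erap_ctl |= BIT(ERAP_CONTROL_ALLOW_LARGER_RAP); /* expose full hardware RSB */
        c->erap_ctl |= BIT(ERAP_CONTROL_FLUSH_RAP);        /* flush RSB on next VMRUN  */
    }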
@@ -797,6 +797,8 @@ void kvm_set_cpu_caps(void)
F(WRMSR_XX_BASE_NS)
);
+ if (tdp_enabled)
+ kvm_cpu_cap_check_and_set(X86_FEATURE_ERAPS);
kvm_cpu_cap_check_and_set(X86_FEATURE_SBPB);
kvm_cpu_cap_check_and_set(X86_FEATURE_IBPB_BRTYPE);
kvm_cpu_cap_check_and_set(X86_FEATURE_SRSO_NO);
@@ -1356,10 +1358,22 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
case 0x80000020:
entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
break;
- case 0x80000021:
- entry->ebx = entry->ecx = entry->edx = 0;
+ case 0x80000021: {
+ unsigned int ebx_mask = 0;
+
+ entry->ecx = entry->edx = 0;
cpuid_entry_override(entry, CPUID_8000_0021_EAX);
+
+ /*
+ * Bits 23:16 of EBX report the size of the RSB.
+ * Expose the hardware's value to the guest.
+ */
+ if (kvm_cpu_cap_has(X86_FEATURE_ERAPS))
+ ebx_mask |= GENMASK(23, 16);
+
+ entry->ebx &= ebx_mask;
break;
+ }
/* AMD Extended Performance Monitoring and Debug */
case 0x80000022: {
union cpuid_0x80000022_ebx ebx;
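As a guest-side illustration of what this hunk exposes (a sketch, not part of the patch; __cpuid() is the compiler helper from <cpuid.h>):

    #include <cpuid.h>

    /* Read the virtualized RSB size from CPUID 0x80000021 EBX[23:16]. */
    static unsigned int guest_rsb_size(void)
    {
        unsigned int eax, ebx, ecx, edx;

        __cpuid(0x80000021, eax, ebx, ecx, edx);
        return (ebx >> 16) & 0xff;
    }

On ERAPS hardware with this patch applied, the guest reads the hardware RSB size here; without X86_FEATURE_ERAPS the field reads as zero, since ebx_mask stays 0 above.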
@@ -1360,6 +1360,28 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+ /*
+ * If the hardware has a larger RSB, use it in the guest context as
+ * well.
+ *
+ * When running nested guests: the hardware tags host and guest RSB
+ * entries, but the entries are ASID-agnostic, so the hardware cannot
+ * tell L1 and L2 guests apart. To prevent L2->L1 RSB poisoning
+ * attacks in this case, the L0 hypervisor must set FLUSH_RAP_ON_VMRUN
+ * in L1's VMCB on a nested #VMEXIT to ensure the next VMRUN flushes
+ * the RSB.
+ *
+ * For the shadow paging (NPT disabled) case: the CPU's CR3 does not
+ * hold the CR3 of the running guest process, so intra-guest context
+ * switches do not cause a hardware TLB flush and therefore do not
+ * trigger the guest RSB flush that ERAPS provides. Do not expose
+ * ERAPS or the larger RSB to the guest in this case, so that the
+ * guest keeps its software mitigations and only sees the default
+ * 32-entry RSB.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_ERAPS) && npt_enabled)
+ vmcb_set_larger_rap(svm->vmcb);
+
if (kvm_vcpu_apicv_active(vcpu))
avic_init_vmcb(svm, vmcb);
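The guard above, distilled into a predicate for readability (a sketch; the helper name svm_can_use_larger_rap() is hypothetical and carries the same logic as the condition in init_vmcb()):

    /* Hypothetical distillation of the guard in init_vmcb(). */
    static inline bool svm_can_use_larger_rap(void)
    {
        /*
         * With shadow paging, hardware CR3 does not track the guest's
         * CR3, so intra-guest context switches never trigger the TLB
         * flush (and hence the RSB flush) that ERAPS relies on.
         */
        return cpu_feature_enabled(X86_FEATURE_ERAPS) && npt_enabled;
    }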
@@ -3393,6 +3415,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
pr_err("%-20s%d\n", "asid:", control->asid);
pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
+ pr_err("%-20s%d\n", "erap_ctl:", control->erap_ctl);
pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
pr_err("%-20s%08x\n", "int_state:", control->int_state);
@@ -3559,6 +3582,27 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
+ if (boot_cpu_has(X86_FEATURE_ERAPS) &&
+     vmcb_is_larger_rap(svm->vmcb01.ptr)) {
+ /*
+ * XXX: a few further optimizations are possible:
+ *
+ * 1. In pre_svm_run() we can reset this bit when a hardware
+ * TLB flush has already happened: any context switch on a
+ * pCPU flushes the TLB and thereby auto-flushes the RSB,
+ * e.g. when this vCPU is rescheduled onto a different pCPU
+ * (see the sketch below this hunk).
+ *
+ * 2. The flush is likewise unnecessary when the vCPU stays
+ * on the same pCPU but a context switch occurred between
+ * the #VMEXIT and the next VMRUN.
+ *
+ * 3. If the guest re-enters L2 after this #VMEXIT, there is
+ * no need to flush the RSB.
+ */
+ vmcb_set_flush_rap(svm->vmcb01.ptr);
+ }
+
vmexit = nested_svm_exit_special(svm);
if (vmexit == NESTED_EXIT_CONTINUE)
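Optimization 1 from the XXX list could look roughly like the sketch below. Both vmcb_clear_flush_rap() (a counterpart shown after the svm.h hunk) and the use of vcpu->cpu for pCPU tracking are assumptions, not part of this patch:

    /* Sketch: skip the explicit RSB flush after a pCPU migration. */
    static void svm_maybe_skip_rap_flush(struct vcpu_svm *svm, int cpu)
    {
        /*
         * A vCPU that moved to a different pCPU already had its RSB
         * flushed by the host context switch, so the pending
         * FLUSH_RAP request can be dropped before VMRUN.
         */
        if (svm->vcpu.cpu != cpu)
            vmcb_clear_flush_rap(svm->vmcb01.ptr);
    }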
@@ -500,6 +500,21 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
return vmcb_is_intercept(&svm->vmcb->control, bit);
}
+/*
+ * erap_ctl is a single u8 in the packed VMCB, so use plain bit ops:
+ * __set_bit()/test_bit() would do unsigned-long-wide accesses past it.
+ */
+static inline void vmcb_set_flush_rap(struct vmcb *vmcb)
+{
+ vmcb->control.erap_ctl |= BIT(ERAP_CONTROL_FLUSH_RAP);
+}
+
+static inline void vmcb_set_larger_rap(struct vmcb *vmcb)
+{
+ vmcb->control.erap_ctl |= BIT(ERAP_CONTROL_ALLOW_LARGER_RAP);
+}
+
+static inline bool vmcb_is_larger_rap(struct vmcb *vmcb)
+{
+ return vmcb->control.erap_ctl & BIT(ERAP_CONTROL_ALLOW_LARGER_RAP);
+}
+
static inline bool nested_vgif_enabled(struct vcpu_svm *svm)
{
return guest_can_use(&svm->vcpu, X86_FEATURE_VGIF) &&
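For symmetry with the setters above, a clear-side helper (assumed by the pre_svm_run() optimization sketched earlier, not part of this patch) would be:

    static inline void vmcb_clear_flush_rap(struct vmcb *vmcb)
    {
        vmcb->control.erap_ctl &= ~BIT(ERAP_CONTROL_FLUSH_RAP);
    }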