@@ -7909,6 +7909,25 @@ apply some other policy-based mitigation. When exiting to userspace, KVM sets
KVM_RUN_X86_BUS_LOCK in vcpu->run->flags, and conditionally sets the exit_reason
to KVM_EXIT_X86_BUS_LOCK.
+Note! On AMD CPUs, the Bus Lock Threshold feature is close enough to Intel's
+Bus Lock Detection VM-Exit that KVM_CAP_X86_BUS_LOCK_EXIT is also supported
+on AMD CPUs.
+
+The biggest difference between the two features is that Threshold (AMD CPUs) is
+fault-like, i.e. the bus lock exit to userspace occurs with RIP pointing at the
+offending instruction, whereas Detection (Intel CPUs) is trap-like, i.e. the
+bus lock exit to userspace occurs with RIP pointing at the instruction right
+after the guilty one.
+
+The Bus Lock Threshold feature on AMD CPUs provides a per-VMCB counter that is
+decremented every time a bus lock occurs, and a VM-Exit is triggered if and
+only if the counter is '0'.
+
+To provide Detection-like semantics for AMD CPUs, KVM initializes the bus lock
+counter to '0', i.e. exits on every bus lock, and when re-executing the guilty
+instruction, sets the counter to '1' to effectively step past the instruction.
+
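+Conceptually, the resulting flow on AMD CPUs is the following (illustrative
+pseudocode only; the helpers are stand-ins, not KVM's actual code)::
+
+  /* bus lock VM-Exit handler */
+  if (vmcb->control.bus_lock_counter == 0)
+          exit_to_userspace(KVM_EXIT_X86_BUS_LOCK);  /* RIP = guilty insn */
+
+  /* completion callback, once userspace re-runs the vCPU */
+  if (userspace_left_rip_unchanged(vcpu))
+          vmcb->control.bus_lock_counter = 1;  /* step past the bus lock */
+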
Note! Detected bus locks may be coincident with other exits to userspace, i.e.
KVM_RUN_X86_BUS_LOCK should be checked regardless of the primary exit reason if
userspace wants to take action on all detected bus locks.
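
For illustration, a userspace VMM could enable and consume bus lock exits
roughly as follows. This is a minimal sketch against the existing KVM uAPI;
throttle_vcpu() is a hypothetical policy hook and error handling is elided:

  #include <linux/kvm.h>
  #include <sys/ioctl.h>

  /* Placeholder for whatever rate-limiting policy userspace applies. */
  static void throttle_vcpu(void)
  {
  }

  /* Enable bus lock exits on a VM fd created via KVM_CREATE_VM. */
  static int enable_bus_lock_exit(int vm_fd)
  {
          struct kvm_enable_cap cap = {
                  .cap = KVM_CAP_X86_BUS_LOCK_EXIT,
                  .args[0] = KVM_BUS_LOCK_DETECTION_EXIT,
          };

          return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
  }

  /*
   * Per the note above, check the flag on every exit, not just
   * KVM_EXIT_X86_BUS_LOCK, as bus locks may coincide with other exits.
   */
  static void handle_exit(struct kvm_run *run)
  {
          if (run->flags & KVM_RUN_X86_BUS_LOCK)
                  throttle_vcpu();

          /* ... dispatch run->exit_reason as usual ... */
  }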
@@ -678,6 +678,36 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
+ /*
+ * Re-arm vmcb02's counter if the guest hasn't moved past the guilty
+ * instruction; otherwise, reset the counter to '0'.
+ *
+ * To detect whether L2 has made forward progress, track the RIP at which
+ * a bus lock occurred on a per-vmcb12 basis. If RIP has changed, the
+ * guest has clearly made forward progress even though bus_lock_counter
+ * is still '1', so reset the counter to '0'. E.g. in the scenario where
+ * a bus lock happened in L1 before VMRUN, the bus lock firmly happened
+ * on an instruction in the past. Even if vmcb01's counter is still '1'
+ * (because the guilty instruction got patched), the vCPU has clearly
+ * made forward progress, so KVM should reset vmcb02's counter to '0'.
+ *
+ * If RIP hasn't changed, keep the counter armed ('1') across nested
+ * VMRUN to prevent the same guilty instruction from re-triggering a
+ * VM-Exit. E.g. if userspace rate-limits the vCPU, it's entirely
+ * possible that L1's tick interrupt is pending by the time userspace
+ * re-runs the vCPU. If KVM unconditionally clears the counter on VMRUN,
+ * then when L1 re-enters L2, the same instruction will trigger a
+ * VM-Exit and the entire cycle starts over.
+ */
+ if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip)) {
+ vmcb02->control.bus_lock_counter = 1;
+ svm->bus_lock_rip = svm->nested.ctl.bus_lock_rip;
+ } else {
+ vmcb02->control.bus_lock_counter = 0;
+ }
+ svm->nested.ctl.bus_lock_rip = INVALID_GPA;
+
/* Done at vmrun: asid. */
/* Also overwritten later if necessary. */
@@ -1039,6 +1069,18 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
}
+ /*
+ * If bus_lock_counter is nonzero and the guest has not moved past the
+ * guilty instruction, save bus_lock_rip in svm_nested_state. This will
+ * help determine at nested VMRUN whether to re-arm vmcb02's counter or
+ * reset it to '0'.
+ */
+ if (vmcb02->control.bus_lock_counter &&
+ svm->bus_lock_rip == vmcb02->save.rip)
+ svm->nested.ctl.bus_lock_rip = svm->bus_lock_rip;
+ else
+ svm->nested.ctl.bus_lock_rip = INVALID_GPA;
+
nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
svm_switch_vmcb(svm, &svm->vmcb01);
@@ -3310,6 +3310,35 @@ static int invpcid_interception(struct kvm_vcpu *vcpu)
return kvm_handle_invpcid(vcpu, type, gva);
}
+static int complete_userspace_buslock(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * If userspace has NOT changed RIP, then KVM's ABI is to let the guest
+ * execute the bus-locking instruction. Set the bus lock counter to '1'
+ * to effectively step past the bus lock.
+ */
+ if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))
+ svm->vmcb->control.bus_lock_counter = 1;
+
+ return 1;
+}
+
+static int bus_lock_exit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
+ vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
+
+ vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
+ svm->bus_lock_rip = vcpu->arch.cui_linear_rip;
+ vcpu->arch.complete_userspace_io = complete_userspace_buslock;
+
+ return 0;
+}
+
static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_READ_CR0] = cr_interception,
[SVM_EXIT_READ_CR3] = cr_interception,
@@ -3379,6 +3408,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_INVPCID] = invpcid_interception,
[SVM_EXIT_IDLE_HLT] = kvm_emulate_halt,
[SVM_EXIT_NPF] = npf_interception,
+ [SVM_EXIT_BUS_LOCK] = bus_lock_exit,
[SVM_EXIT_RSM] = rsm_interception,
[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
[SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
@@ -169,6 +169,7 @@ struct vmcb_ctrl_area_cached {
u64 nested_cr3;
u64 virt_ext;
u32 clean;
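+ /* RIP at which L2's most recent bus lock occurred, or INVALID_GPA */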
+ u64 bus_lock_rip;
union {
#if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV)
struct hv_vmcb_enlightenments hv_enlightenments;
@@ -327,6 +328,7 @@ struct vcpu_svm {
/* Guest GIF value, used when vGIF is not enabled */
bool guest_gif;
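+
+ /* Linear RIP of the most recent bus lock VM-Exit */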
+ u64 bus_lock_rip;
};
struct svm_cpu_data {
Add support for KVM_CAP_X86_BUS_LOCK_EXIT on SVM CPUs with Bus Lock Threshold, which is close enough to VMX's Bus Lock Detection VM-Exit to allow reusing KVM_CAP_X86_BUS_LOCK_EXIT. The biggest difference between the two features is that Threshold is fault-like, whereas Detection is trap-like. To allow the guest to make forward progress, Threshold provides a per-VMCB counter which is decremented every time a bus lock occurs, and a VM-Exit is triggered if and only if the counter is '0'. To provide Detection-like semantics, initialize the counter to '0', i.e. exit on every bus lock, and when re-executing the guilty instruction, set the counter to '1' to effectively step past the instruction. Note, in the unlikely scenario that re-executing the instruction doesn't trigger a bus lock, e.g. because the guest has changed memory types or patched the guilty instruction, the bus lock counter will be left at '1', i.e. the guest will be able to do a bus lock on a different instruction. In a perfect world, KVM would ensure the counter is '0' if the guest has made forward progress, e.g. if RIP has changed. But trying to close that hole would incur non-trivial complexity, for marginal benefit; the intent of KVM_CAP_X86_BUS_LOCK_EXIT is to allow userspace rate-limit bus locks, not to allow for precise detection of problematic guest code. And, it's simply not feasible to fully close the hole, e.g. if an interrupt arrives before the original instruction can re-execute, the guest could step past a different bus lock. Suggested-by: Sean Christopherson <seanjc@google.com> Signed-off-by: Manali Shukla <manali.shukla@amd.com> --- Documentation/virt/kvm/api.rst | 19 +++++++++++++++ arch/x86/kvm/svm/nested.c | 42 ++++++++++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.c | 30 ++++++++++++++++++++++++ arch/x86/kvm/svm/svm.h | 2 ++ 4 files changed, 93 insertions(+)