
[v4,4/5] KVM: SVM: Add support for KVM_CAP_X86_BUS_LOCK_EXIT on SVM CPUs

Message ID 20250324130248.126036-5-manali.shukla@amd.com (mailing list archive)
State New
Series Add support for the Bus Lock Threshold

Commit Message

Manali Shukla March 24, 2025, 1:02 p.m. UTC
Add support for KVM_CAP_X86_BUS_LOCK_EXIT on SVM CPUs with Bus Lock
Threshold, which is close enough to VMX's Bus Lock Detection VM-Exit to
allow reusing KVM_CAP_X86_BUS_LOCK_EXIT.

The biggest difference between the two features is that Threshold is
fault-like, whereas Detection is trap-like.  To allow the guest to make
forward progress, Threshold provides a per-VMCB counter which is
decremented every time a bus lock occurs, and a VM-Exit is triggered if
and only if the counter is '0'.

To provide Detection-like semantics, initialize the counter to '0', i.e.
exit on every bus lock, and when re-executing the guilty instruction, set
the counter to '1' to effectively step past the instruction.
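Roughly, the hardware's per-bus-lock behavior under Bus Lock Threshold can be
sketched as the following pseudo-C (illustrative only; vmexit() is a stand-in
for the CPU raising the exit, not actual hardware or KVM code):

	/* Pseudo-C sketch of the CPU's check when a bus lock is attempted. */
	if (vmcb->control.bus_lock_counter == 0) {
		/* Fault-like: exit with RIP pointing at the guilty instruction. */
		vmexit(SVM_EXIT_BUS_LOCK);
	} else {
		/* Consume one allowed bus lock and let the instruction run. */
		vmcb->control.bus_lock_counter--;
	}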

Note, in the unlikely scenario that re-executing the instruction doesn't
trigger a bus lock, e.g. because the guest has changed memory types or
patched the guilty instruction, the bus lock counter will be left at '1',
i.e. the guest will be able to do a bus lock on a different instruction.
In a perfect world, KVM would ensure the counter is '0' if the guest has
made forward progress, e.g. if RIP has changed.  But trying to close that
hole would incur non-trivial complexity, for marginal benefit; the intent
of KVM_CAP_X86_BUS_LOCK_EXIT is to allow userspace to rate-limit bus locks,
not to allow for precise detection of problematic guest code.  And, it's
simply not feasible to fully close the hole, e.g. if an interrupt arrives
before the original instruction can re-execute, the guest could step past
a different bus lock.
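For reference, a minimal sketch of the userspace side (assumes a vcpu_fd and a
mmap'd struct kvm_run *run; the 1 ms stall is an arbitrary rate-limit policy,
not part of the ABI):

	#include <linux/kvm.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	static void run_vcpu(int vcpu_fd, struct kvm_run *run)
	{
		for (;;) {
			ioctl(vcpu_fd, KVM_RUN, NULL);

			/*
			 * Bus locks may be coincident with other exits, so check
			 * the flag regardless of the primary exit reason.
			 */
			if (run->flags & KVM_RUN_X86_BUS_LOCK)
				usleep(1000);	/* arbitrary rate limit */

			if (run->exit_reason == KVM_EXIT_X86_BUS_LOCK)
				continue;	/* re-enter; KVM steps past the bus lock */

			/* ... handle other exit reasons ... */
		}
	}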

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Manali Shukla <manali.shukla@amd.com>
---
 Documentation/virt/kvm/api.rst | 19 +++++++++++++++
 arch/x86/kvm/svm/nested.c      | 42 ++++++++++++++++++++++++++++++++++
 arch/x86/kvm/svm/svm.c         | 30 ++++++++++++++++++++++++
 arch/x86/kvm/svm/svm.h         |  2 ++
 4 files changed, 93 insertions(+)

Patch

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 5fe84f2427b5..f7c925aa0c4f 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -7909,6 +7909,25 @@  apply some other policy-based mitigation. When exiting to userspace, KVM sets
 KVM_RUN_X86_BUS_LOCK in vcpu->run->flags, and conditionally sets the exit_reason
 to KVM_EXIT_X86_BUS_LOCK.
 
+Note! KVM_CAP_X86_BUS_LOCK_EXIT on AMD CPUs with the Bus Lock Threshold is close
+enough to Intel's Bus Lock Detection VM-Exit to allow using
+KVM_CAP_X86_BUS_LOCK_EXIT for AMD CPUs.
+
+The biggest difference between the two features is that Threshold (AMD CPUs) is
+fault-like, i.e. the bus lock exit to user space occurs with RIP pointing at the
+offending instruction, whereas Detection (Intel CPUs) is trap-like, i.e. the bus
+lock exit to user space occurs with RIP pointing at the instruction right after
+the guilty one.
+
+The bus lock threshold on AMD CPUs provides a per-VMCB counter which is
+decremented every time a bus lock occurs, and a VM-Exit is triggered if and only
+if the bus lock counter is '0'.
+
+To provide Detection-like semantics for AMD CPUs, KVM initializes the bus lock
+counter to '0', i.e. exits on every bus lock, and when re-executing the
+guilty instruction, sets the counter to '1' to effectively step past the
+instruction.
+
 Note! Detected bus locks may be coincident with other exits to userspace, i.e.
 KVM_RUN_X86_BUS_LOCK should be checked regardless of the primary exit reason if
 userspace wants to take action on all detected bus locks.
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 834b67672d50..a42ef7dd9143 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -678,6 +678,36 @@  static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
 	vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
 	vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
 
+	/*
+	 * Stash vmcb02's counter if the guest hasn't moved past the guilty
+	 * instruction; otherwise, reset the counter to '0'.
+	 *
+	 * In order to detect if L2 has made forward progress or not, track the
+	 * RIP at which a bus lock has occurred on a per-vmcb12 basis.  If RIP
+	 * has changed, the guest has clearly made forward progress even if
+	 * bus_lock_counter remained '1', so reset bus_lock_counter to '0'.
+	 * E.g. in the scenario where a bus lock happened in L1 before VMRUN,
+	 * the bus lock firmly happened on an instruction in the past.  Even if
+	 * vmcb01's counter is still '1' (because the guilty instruction got
+	 * patched), the vCPU has clearly made forward progress and so KVM
+	 * should reset vmcb02's counter to '0'.
+	 *
+	 * If the RIP hasn't changed, stash the bus lock counter at nested
+	 * VMRUN to prevent the same guilty instruction from triggering a
+	 * VM-Exit.  E.g. if userspace rate-limits the vCPU, then it's entirely
+	 * possible that L1's tick interrupt is pending by the time userspace
+	 * re-runs the vCPU.  If KVM unconditionally clears the counter on
+	 * VMRUN, then when L1 re-enters L2, the same instruction will trigger
+	 * a VM-Exit and the entire cycle starts over.
+	 */
+	if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip)) {
+		vmcb02->control.bus_lock_counter = 1;
+		svm->bus_lock_rip = svm->nested.ctl.bus_lock_rip;
+	} else {
+		vmcb02->control.bus_lock_counter = 0;
+	}
+	svm->nested.ctl.bus_lock_rip = INVALID_GPA;
+
 	/* Done at vmrun: asid.  */
 
 	/* Also overwritten later if necessary.  */
@@ -1039,6 +1069,18 @@  int nested_svm_vmexit(struct vcpu_svm *svm)
 
 	}
 
+	/*
+	 * If bus_lock_counter is nonzero and the guest has not moved past the
+	 * guilty instruction, save bus_lock_rip in svm_nested_state. This will
+	 * help determine at nested VMRUN whether to stash vmcb02's counter or
+	 * reset it to '0'.
+	 */
+	if (vmcb02->control.bus_lock_counter &&
+	    svm->bus_lock_rip == vmcb02->save.rip)
+		svm->nested.ctl.bus_lock_rip = svm->bus_lock_rip;
+	else
+		svm->nested.ctl.bus_lock_rip = INVALID_GPA;
+
 	nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
 
 	svm_switch_vmcb(svm, &svm->vmcb01);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 244e099e7262..ea12e93ae983 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3310,6 +3310,35 @@  static int invpcid_interception(struct kvm_vcpu *vcpu)
 	return kvm_handle_invpcid(vcpu, type, gva);
 }
 
+static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	/*
+	 * If userspace has NOT changed RIP, then KVM's ABI is to let the guest
+	 * execute the bus-locking instruction.  Set the bus lock counter to '1'
+	 * to effectively step past the bus lock.
+	 */
+	if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))
+		svm->vmcb->control.bus_lock_counter = 1;
+
+	return 1;
+}
+
+static int bus_lock_exit(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
+	vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
+
+	vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
+	svm->bus_lock_rip = vcpu->arch.cui_linear_rip;
+	vcpu->arch.complete_userspace_io = complete_userspace_buslock;
+
+	return 0;
+}
+
 static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[SVM_EXIT_READ_CR0]			= cr_interception,
 	[SVM_EXIT_READ_CR3]			= cr_interception,
@@ -3379,6 +3408,7 @@  static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[SVM_EXIT_INVPCID]                      = invpcid_interception,
 	[SVM_EXIT_IDLE_HLT]			= kvm_emulate_halt,
 	[SVM_EXIT_NPF]				= npf_interception,
+	[SVM_EXIT_BUS_LOCK]                     = bus_lock_exit,
 	[SVM_EXIT_RSM]                          = rsm_interception,
 	[SVM_EXIT_AVIC_INCOMPLETE_IPI]		= avic_incomplete_ipi_interception,
 	[SVM_EXIT_AVIC_UNACCELERATED_ACCESS]	= avic_unaccelerated_access_interception,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index d4490eaed55d..7a4c5848c952 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -169,6 +169,7 @@  struct vmcb_ctrl_area_cached {
 	u64 nested_cr3;
 	u64 virt_ext;
 	u32 clean;
+	u64 bus_lock_rip;
 	union {
 #if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV)
 		struct hv_vmcb_enlightenments hv_enlightenments;
@@ -327,6 +328,7 @@  struct vcpu_svm {
 
 	/* Guest GIF value, used when vGIF is not enabled */
 	bool guest_gif;
+	u64 bus_lock_rip;
 };
 
 struct svm_cpu_data {