@@ -28,6 +28,8 @@
#include "irq.h"
#include "svm.h"
+#include "../../../drivers/iommu/amd/amd_iommu_types.h"
+
/*
* Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that
* KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't
@@ -141,11 +143,7 @@ static void avic_deactivate_vmcb(struct vcpu_svm *svm)
svm_set_x2apic_msr_interception(svm, true);
}
-/* Note:
- * This function is called from IOMMU driver to notify
- * SVM to schedule in a particular vCPU of a particular VM.
- */
-int avic_ga_log_notifier(u32 ga_tag)
+static struct kvm_vcpu *avic_ga_log_get_vcpu(u32 ga_tag)
{
unsigned long flags;
struct kvm_svm *kvm_svm;
@@ -165,6 +163,17 @@ int avic_ga_log_notifier(u32 ga_tag)
}
spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+ return vcpu;
+}
+
+/* Note:
+ * This function is called from IOMMU driver to notify
+ * SVM to schedule in a particular vCPU of a particular VM.
+ */
+int avic_ga_log_notifier(u32 ga_tag)
+{
+ struct kvm_vcpu *vcpu = avic_ga_log_get_vcpu(ga_tag);
+
/* Note:
* At this point, the IOMMU should have already set the pending
* bit in the vAPIC backing page. So, we just need to schedule
@@ -750,6 +759,8 @@ static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
}
+extern struct amd_iommu_pi_data amd_iommu_fake_irte;
+
int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
unsigned int host_irq, uint32_t guest_irq,
struct kvm_kernel_irq_routing_entry *new,
@@ -1055,6 +1066,58 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
avic_vcpu_load(vcpu, vcpu->cpu);
}
+/*
+ * Wakeup-vector handler used when IRQ posting is being emulated ("fudged")
+ * because the IOMMU lacks hardware vAPIC support.  The IOMMU side redirects
+ * the device interrupt to POSTED_INTR_WAKEUP_VECTOR and publishes the posted
+ * interrupt parameters in amd_iommu_fake_irte; this handler then emulates
+ * what real hardware would do: set IRR in the vAPIC backing page and either
+ * ring the doorbell (vCPU loaded) or raise a GA log event (vCPU blocking).
+ */
+static void avic_pi_handler(void)
+{
+	struct amd_iommu_pi_data pi;
+	struct kvm_vcpu *vcpu;
+
+	/*
+	 * Snapshot the fake IRTE locklessly to find the target vCPU; the tag
+	 * is re-validated under ir_list_lock below before acting on it.
+	 */
+	memcpy(&pi, &amd_iommu_fake_irte, sizeof(pi));
+
+	if (!pi.is_guest_mode) {
+		/* Ratelimited: this runs at device-interrupt rate. */
+		pr_warn_ratelimited("IRQ %u arrived with !is_guest_mode\n", pi.vector);
+		return;
+	}
+
+	vcpu = avic_ga_log_get_vcpu(pi.ga_tag);
+	if (!vcpu) {
+		pr_warn_ratelimited("No vCPU for IRQ %u\n", pi.vector);
+		return;
+	}
+
+	/* vapic_addr presumably holds backing page PFN — TODO confirm shift. */
+	WARN_ON_ONCE(pi.vapic_addr << 12 != avic_get_backing_page_address(to_svm(vcpu)));
+
+	/*
+	 * When updating a vCPU's IRTE, the fake posted IRQ can race with the
+	 * IRTE update.  Take ir_list_lock so that the IRQ can be processed
+	 * atomically.  In real hardware, the IOMMU will complete IRQ delivery
+	 * before accepting the new IRTE.
+	 */
+	guard(spinlock_irqsave)(&to_svm(vcpu)->ir_list_lock);
+
+	/* The fake IRTE was rewritten while acquiring the lock; drop the IRQ. */
+	if (amd_iommu_fake_irte.ga_tag != pi.ga_tag) {
+		WARN_ON_ONCE(amd_iommu_fake_irte.is_guest_mode);
+		return;
+	}
+
+	/* Re-read the IRTE now that it is stable (lock held). */
+	memcpy(&pi, &amd_iommu_fake_irte, sizeof(pi));
+
+	if (!pi.is_guest_mode)
+		return;
+
+	/* Emulate the IOMMU setting the IRR bit in the vAPIC backing page. */
+	kvm_lapic_set_irr(pi.vector, vcpu->arch.apic);
+	smp_mb__after_atomic();
+
+	/*
+	 * cpu >= 0 presumably means the vCPU is loaded: notify it via the
+	 * AVIC doorbell.  Otherwise fall back to the GA log notifier to wake
+	 * the blocking vCPU, if GA log interrupts are enabled for it.
+	 */
+	if (pi.cpu >= 0)
+		avic_ring_doorbell(vcpu);
+	else if (pi.ga_log_intr)
+		avic_ga_log_notifier(pi.ga_tag);
+}
+
/*
* Note:
* - The module param avic enable both xAPIC and x2APIC mode.
@@ -1107,5 +1170,8 @@ bool avic_hardware_setup(void)
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+ pr_warn("Register AVIC PI wakeup handler\n");
+ kvm_set_posted_intr_wakeup_handler(avic_pi_handler);
+
return true;
}
@@ -1122,6 +1122,8 @@ static void svm_hardware_unsetup(void)
{
int cpu;
+ kvm_set_posted_intr_wakeup_handler(NULL);
+
sev_hardware_unsetup();
for_each_possible_cpu(cpu)
@@ -2863,8 +2863,12 @@ static void enable_iommus_vapic(void)
return;
}
- if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) &&
- !check_feature(FEATURE_GAM_VAPIC)) {
+ if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
+ return;
+
+ if (!check_feature(FEATURE_GAM_VAPIC)) {
+ pr_warn("IOMMU lacks GAM_VAPIC, fudging IRQ posting\n");
+ amd_iommu_irq_ops.capability |= (1 << IRQ_POSTING_CAP);
amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY_GA;
return;
}
@@ -3775,6 +3775,15 @@ static const struct irq_domain_ops amd_ir_domain_ops = {
.deactivate = irq_remapping_deactivate,
};
+/*
+ * Single shared "fake IRTE": when IRQ posting is emulated, the posted
+ * interrupt parameters are published here for KVM's wakeup handler to
+ * consume instead of being programmed into a hardware guest-mode IRTE.
+ * Exported so the KVM SVM module can read it.
+ */
+struct amd_iommu_pi_data amd_iommu_fake_irte;
+EXPORT_SYMBOL_GPL(amd_iommu_fake_irte);
+
+/*
+ * True when IRQ posting capability was advertised (IRQ_POSTING_CAP forced on
+ * in enable_iommus_vapic()) even though guest-mode vAPIC interrupt remapping
+ * is not in use, i.e. posted interrupts are being emulated ("fudged").
+ */
+static bool amd_iommu_fudge_pi(void)
+{
+	return irq_remapping_cap(IRQ_POSTING_CAP) &&
+	       !AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir);
+}
+
static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu,
bool ga_log_intr)
{
@@ -3796,6 +3805,12 @@ int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr)
struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
+ if (amd_iommu_fudge_pi()) {
+ amd_iommu_fake_irte.cpu = cpu;
+ amd_iommu_fake_irte.ga_log_intr = ga_log_intr;
+ return 0;
+ }
+
if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)))
return -EINVAL;
@@ -3818,6 +3833,26 @@ int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr)
struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
u64 valid;
+ if (amd_iommu_fudge_pi()) {
+ if (WARN_ON_ONCE(!entry->lo.fields_remap.valid))
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(entry->lo.fields_remap.int_type != APIC_DELIVERY_MODE_FIXED))
+ return -EINVAL;
+
+ amd_iommu_fake_irte.cpu = cpu;
+ amd_iommu_fake_irte.vapic_addr = ir_data->ga_root_ptr;
+ amd_iommu_fake_irte.vector = ir_data->ga_vector;
+ amd_iommu_fake_irte.ga_tag = ir_data->ga_tag;
+ amd_iommu_fake_irte.ga_log_intr = ga_log_intr;
+ amd_iommu_fake_irte.is_guest_mode = true;
+
+ entry->hi.fields.vector = POSTED_INTR_WAKEUP_VECTOR;
+
+ return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
+ ir_data->irq_2_irte.index, entry);
+ }
+
if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)))
return -EINVAL;
@@ -3849,12 +3884,18 @@ int amd_iommu_deactivate_guest_mode(void *data)
struct irq_cfg *cfg = ir_data->cfg;
u64 valid;
+ if (amd_iommu_fudge_pi() && entry) {
+ memset(&amd_iommu_fake_irte, 0, sizeof(amd_iommu_fake_irte));
+ goto fudge;
+ }
+
if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)))
return -EINVAL;
if (!entry || !entry->lo.fields_vapic.guest_mode)
return 0;
+fudge:
valid = entry->lo.fields_remap.valid;
entry->lo.val = 0;
@@ -3891,12 +3932,19 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *info)
* This device has never been set up for guest mode.
* we should not modify the IRTE
*/
- if (!dev_data || !dev_data->use_vapic)
+ if (!dev_data)
+ return -EINVAL;
+
+ if (amd_iommu_fudge_pi())
+ goto fudge;
+
+ if (!dev_data->use_vapic)
return -EINVAL;
if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)))
return -EINVAL;
+fudge:
ir_data->cfg = irqd_cfg(data);
if (pi_data) {