@@ -1112,6 +1112,14 @@ following flags are specified:
/* Depends on KVM_CAP_IOMMU */
#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
+/* The following two depend on KVM_CAP_PCI_2_3 */
+#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
+#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
+
+If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts
+via the PCI-2.3-compliant device-level mask, but only if IRQ sharing with other
+assigned or host devices requires it. KVM_DEV_ASSIGN_MASK_INTX specifies the
+guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details.
4.48 KVM_DEASSIGN_PCI_DEVICE
@@ -1263,6 +1271,25 @@ struct kvm_assigned_msix_entry {
__u16 padding[3];
};
+4.54 KVM_ASSIGN_SET_INTX_MASK
+
+Capability: KVM_CAP_PCI_2_3
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_assigned_pci_dev (in)
+Returns: 0 on success, -1 on error
+
+Informs the kernel about the guest's view on the INTx mask. As long as the
+guest masks the legacy INTx, the kernel will refrain from unmasking it at
+hardware level and will not assert the guest's IRQ line. User space is still
+responsible for applying this state to the assigned device's real config space.
+To avoid that the kernel overwrites the state user space wants to set,
+KVM_ASSIGN_SET_INTX_MASK has to be called prior to updating the config space.
+
+See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified
+by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is
+evaluated.
+
5. The kvm_run structure
Application code obtains a pointer to the kvm_run structure by
@@ -1965,6 +1965,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_X86_ROBUST_SINGLESTEP:
case KVM_CAP_XSAVE:
case KVM_CAP_ASYNC_PF:
+ case KVM_CAP_PCI_2_3:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -541,6 +541,7 @@ struct kvm_ppc_pvinfo {
#define KVM_CAP_PPC_GET_PVINFO 57
#define KVM_CAP_PPC_IRQ_LEVEL 58
#define KVM_CAP_ASYNC_PF 59
+#define KVM_CAP_PCI_2_3 60
#ifdef KVM_CAP_IRQ_ROUTING
@@ -677,6 +678,9 @@ struct kvm_clock_data {
#define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2)
/* Available with KVM_CAP_PPC_GET_PVINFO */
#define KVM_PPC_GET_PVINFO _IOW(KVMIO, 0xa1, struct kvm_ppc_pvinfo)
+/* Available with KVM_CAP_PCI_2_3 */
+#define KVM_ASSIGN_SET_INTX_MASK _IOW(KVMIO, 0xa2, \
+ struct kvm_assigned_pci_dev)
/*
* ioctls for vcpu fds
@@ -742,6 +746,8 @@ struct kvm_clock_data {
#define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs)
#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
+#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
+#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
struct kvm_assigned_pci_dev {
__u32 assigned_dev_id;
@@ -477,6 +477,12 @@ struct kvm_irq_ack_notifier {
void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
};
+enum kvm_intx_state {
+ KVM_INTX_ENABLED,
+ KVM_INTX_LINE_DISABLED,
+ KVM_INTX_DEVICE_DISABLED,
+};
+
struct kvm_assigned_dev_kernel {
struct kvm_irq_ack_notifier ack_notifier;
struct list_head list;
@@ -486,7 +492,7 @@ struct kvm_assigned_dev_kernel {
int host_devfn;
unsigned int entries_nr;
int host_irq;
- bool host_irq_disabled;
+ unsigned long last_irq_status;
struct msix_entry *host_msix_entries;
int guest_irq;
struct msix_entry *guest_msix_entries;
@@ -496,6 +502,8 @@ struct kvm_assigned_dev_kernel {
struct pci_dev *dev;
struct kvm *kvm;
spinlock_t intx_lock;
+ spinlock_t intx_mask_lock;
+ enum kvm_intx_state intx_state;
char irq_name[32];
};
@@ -55,22 +55,141 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
return index;
}
-static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
+static bool
+pci_2_3_set_irq_mask(struct pci_dev *dev, bool mask, bool check_status)
+{
+ u32 cmd_status_dword;
+ u16 origcmd, newcmd;
+ bool mask_updated = true;
+
+ /*
+ * We do a single dword read to retrieve both command and status.
+ * Document assumptions that make this possible.
+ */
+ BUILD_BUG_ON(PCI_COMMAND % 4);
+ BUILD_BUG_ON(PCI_COMMAND + 2 != PCI_STATUS);
+
+ pci_block_user_cfg_access(dev);
+
+ /*
+ * Read both command and status registers in a single 32-bit operation.
+ * Note: we could cache the value for command and move the status read
+ * out of the lock if there was a way to get notified of user changes
+ * to command register through sysfs. Should be good for shared irqs.
+ */
+ pci_read_config_dword(dev, PCI_COMMAND, &cmd_status_dword);
+
+ if (check_status) {
+ bool irq_pending =
+ (cmd_status_dword >> 16) & PCI_STATUS_INTERRUPT;
+
+ /*
+ * Check interrupt status register to see whether our device
+ * triggered the interrupt (when masking) or the next IRQ is
+ * already pending (when unmasking).
+ */
+ if (mask != irq_pending) {
+ mask_updated = false;
+ goto done;
+ }
+ }
+
+ origcmd = cmd_status_dword;
+ newcmd = origcmd & ~PCI_COMMAND_INTX_DISABLE;
+ if (mask)
+ newcmd |= PCI_COMMAND_INTX_DISABLE;
+ if (newcmd != origcmd)
+ pci_write_config_word(dev, PCI_COMMAND, newcmd);
+
+done:
+ pci_unblock_user_cfg_access(dev);
+ return mask_updated;
+}
+
+static void pci_2_3_irq_mask(struct pci_dev *dev)
+{
+ pci_2_3_set_irq_mask(dev, true, false);
+}
+
+static bool pci_2_3_irq_check_and_mask(struct pci_dev *dev)
+{
+ return pci_2_3_set_irq_mask(dev, true, true);
+}
+
+static void pci_2_3_irq_unmask(struct pci_dev *dev)
+{
+ pci_2_3_set_irq_mask(dev, false, false);
+}
+
+static bool pci_2_3_irq_check_and_unmask(struct pci_dev *dev)
+{
+ return pci_2_3_set_irq_mask(dev, false, true);
+}
+
+static irqreturn_t kvm_assigned_dev_intr_intx(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+ unsigned long irq_status = get_irq_status(irq);
+ int ret;
+
+ assigned_dev->last_irq_status = irq_status;
+
+ if (!(irq_status & (IRQS_SHARED | IRQS_MAKE_SHAREABLE)))
+ return IRQ_WAKE_THREAD;
+
+ spin_lock(&assigned_dev->intx_lock);
+
+ if (irq_status & IRQS_MAKE_SHAREABLE) {
+ if (assigned_dev->intx_state == KVM_INTX_LINE_DISABLED) {
+ pci_2_3_irq_mask(assigned_dev->dev);
+ enable_irq(irq);
+ assigned_dev->intx_state = KVM_INTX_DEVICE_DISABLED;
+ }
+ ret = IRQ_HANDLED;
+ } else if (pci_2_3_irq_check_and_mask(assigned_dev->dev)) {
+ assigned_dev->intx_state = KVM_INTX_DEVICE_DISABLED;
+ ret = IRQ_WAKE_THREAD;
+ } else
+ ret =IRQ_NONE;
+
+ spin_unlock(&assigned_dev->intx_lock);
+
+ return ret;
+}
+
+static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
{
struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
- if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
- spin_lock(&assigned_dev->intx_lock);
- disable_irq_nosync(irq);
- assigned_dev->host_irq_disabled = true;
- spin_unlock(&assigned_dev->intx_lock);
+ if (!(assigned_dev->last_irq_status & IRQS_SHARED)) {
+ spin_lock_irq(&assigned_dev->intx_lock);
+ if (assigned_dev->intx_state == KVM_INTX_ENABLED) {
+ disable_irq_nosync(irq);
+ assigned_dev->intx_state = KVM_INTX_LINE_DISABLED;
+ }
+ spin_unlock_irq(&assigned_dev->intx_lock);
}
+ spin_lock(&assigned_dev->intx_mask_lock);
+ if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
+ kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+ assigned_dev->guest_irq, 1);
+ spin_unlock(&assigned_dev->intx_mask_lock);
+
+ return IRQ_HANDLED;
+}
+
+#ifdef __KVM_HAVE_MSI
+static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+
kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
assigned_dev->guest_irq, 1);
return IRQ_HANDLED;
}
+#endif
#ifdef __KVM_HAVE_MSIX
static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
@@ -102,15 +221,36 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
- /* The guest irq may be shared so this ack may be
- * from another device.
- */
- spin_lock(&dev->intx_lock);
- if (dev->host_irq_disabled) {
- enable_irq(dev->host_irq);
- dev->host_irq_disabled = false;
+ if (likely(!(dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX)))
+ return;
+
+ spin_lock(&dev->intx_mask_lock);
+
+ if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
+ bool reassert = false;
+
+ spin_lock_irq(&dev->intx_lock);
+ /*
+ * The guest IRQ may be shared so this ack can come from an
+ * IRQ for another guest device.
+ */
+ if (dev->intx_state == KVM_INTX_LINE_DISABLED) {
+ enable_irq(dev->host_irq);
+ dev->intx_state = KVM_INTX_ENABLED;
+ } else if (dev->intx_state == KVM_INTX_DEVICE_DISABLED) {
+ if (pci_2_3_irq_check_and_unmask(dev->dev))
+ dev->intx_state = KVM_INTX_ENABLED;
+ else
+ reassert = true;
+ }
+ spin_unlock_irq(&dev->intx_lock);
+
+ if (reassert)
+ kvm_set_irq(dev->kvm, dev->irq_source_id,
+ dev->guest_irq, 1);
}
- spin_unlock(&dev->intx_lock);
+
+ spin_unlock(&dev->intx_mask_lock);
}
static void deassign_guest_irq(struct kvm *kvm,
@@ -155,14 +295,21 @@ static void deassign_host_irq(struct kvm *kvm,
kfree(assigned_dev->host_msix_entries);
kfree(assigned_dev->guest_msix_entries);
pci_disable_msix(assigned_dev->dev);
+ } else if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) {
+ free_irq(assigned_dev->host_irq, assigned_dev);
+ pci_disable_msi(assigned_dev->dev);
} else {
- /* Deal with MSI and INTx */
- disable_irq(assigned_dev->host_irq);
+ if (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+ spin_lock_irq(&assigned_dev->intx_lock);
+ pci_2_3_irq_mask(assigned_dev->dev);
+ /* prevent re-enabling by kvm_assigned_dev_ack_irq */
+ assigned_dev->intx_state = KVM_INTX_ENABLED;
+ spin_unlock_irq(&assigned_dev->intx_lock);
+ synchronize_irq(assigned_dev->host_irq);
+ } else
+ disable_irq(assigned_dev->host_irq);
free_irq(assigned_dev->host_irq, assigned_dev);
-
- if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
- pci_disable_msi(assigned_dev->dev);
}
assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
@@ -231,16 +378,41 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
static int assigned_device_enable_host_intx(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev)
{
+ irq_handler_t handler;
+ unsigned long flags;
+ int err;
+
dev->host_irq = dev->dev->irq;
- /* Even though this is PCI, we don't want to use shared
- * interrupts. Sharing host devices with guest-assigned devices
- * on the same interrupt line is not a happy situation: there
- * are going to be long delays in accepting, acking, etc.
+ dev->intx_state = KVM_INTX_ENABLED;
+ dev->last_irq_status = 0;
+
+ /*
+ * We can only share the IRQ line with other host devices if we are
+ * able to disable the IRQ source at device-level - independently of
+ * the guest driver. Otherwise host devices may suffer from unbounded
+ * IRQ latencies when the guest keeps the line asserted.
+ * If PCI 2.3 support is available, we can instal a sharing notifier
+ * and apply the required disabling pattern on demand.
*/
- if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
- IRQF_ONESHOT, dev->irq_name, dev))
- return -EIO;
- return 0;
+ if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+ handler = kvm_assigned_dev_intr_intx;
+ flags = IRQF_SHARED | IRQF_ADAPTIVE | IRQF_COND_ONESHOT;
+ } else {
+ handler = NULL;
+ flags = IRQF_ONESHOT;
+ }
+
+ err = request_threaded_irq(dev->host_irq, handler,
+ kvm_assigned_dev_thread_intx, flags,
+ dev->irq_name, dev);
+
+ if (!err && dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+ spin_lock_irq(&dev->intx_lock);
+ pci_2_3_irq_unmask(dev->dev);
+ spin_unlock_irq(&dev->intx_lock);
+ }
+
+ return err;
}
#ifdef __KVM_HAVE_MSI
@@ -256,8 +428,9 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
}
dev->host_irq = dev->dev->irq;
- if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
- 0, dev->irq_name, dev)) {
+ if (request_threaded_irq(dev->host_irq, NULL,
+ kvm_assigned_dev_thread_msi, 0,
+ dev->irq_name, dev)) {
pci_disable_msi(dev->dev);
return -EIO;
}
@@ -296,7 +469,6 @@ err:
pci_disable_msix(dev->dev);
return r;
}
-
#endif
static int assigned_device_enable_guest_intx(struct kvm *kvm,
@@ -315,7 +487,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
{
dev->guest_irq = irq->guest_irq;
dev->ack_notifier.gsi = -1;
- dev->host_irq_disabled = false;
return 0;
}
#endif
@@ -327,7 +498,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
{
dev->guest_irq = irq->guest_irq;
dev->ack_notifier.gsi = -1;
- dev->host_irq_disabled = false;
return 0;
}
#endif
@@ -461,6 +631,7 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
{
int r = -ENODEV;
struct kvm_assigned_dev_kernel *match;
+ unsigned long irq_type;
mutex_lock(&kvm->lock);
@@ -469,12 +640,51 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
if (!match)
goto out;
- r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
+ irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
+ KVM_DEV_IRQ_GUEST_MASK);
+ r = kvm_deassign_irq(kvm, match, irq_type);
out:
mutex_unlock(&kvm->lock);
return r;
}
+/*
+ * Verify that the device supports Interrupt Disable bit in command register,
+ * per PCI 2.3, by flipping this bit and reading it back: this bit was readonly
+ * in PCI 2.2.
+ */
+static bool pci_2_3_supported(struct pci_dev *pdev)
+{
+ u16 orig, new;
+
+ pci_block_user_cfg_access(pdev);
+
+ pci_read_config_word(pdev, PCI_COMMAND, &orig);
+ pci_write_config_word(pdev, PCI_COMMAND,
+ orig ^ PCI_COMMAND_INTX_DISABLE);
+ pci_read_config_word(pdev, PCI_COMMAND, &new);
+ pci_write_config_word(pdev, PCI_COMMAND, orig);
+
+ pci_unblock_user_cfg_access(pdev);
+
+ /*
+ * There's no way to protect against
+ * hardware bugs or detect them reliably, but as long as we know
+ * what the value should be, let's go ahead and check it.
+ */
+ if ((new ^ orig) & ~PCI_COMMAND_INTX_DISABLE) {
+ dev_err(&pdev->dev, "Command changed from 0x%x to 0x%x: "
+ "driver or HW bug?\n", orig, new);
+ return false;
+ }
+ if (!((new ^ orig) & PCI_COMMAND_INTX_DISABLE)) {
+ dev_warn(&pdev->dev, "Device does not support disabling "
+ "interrupts, IRQ sharing impossible.\n");
+ return false;
+ }
+ return true;
+}
+
static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
struct kvm_assigned_pci_dev *assigned_dev)
{
@@ -523,6 +733,9 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
pci_reset_function(dev);
pci_save_state(dev);
+ if (!pci_2_3_supported(dev))
+ assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
+
match->assigned_dev_id = assigned_dev->assigned_dev_id;
match->host_segnr = assigned_dev->segnr;
match->host_busnr = assigned_dev->busnr;
@@ -530,6 +743,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
match->flags = assigned_dev->flags;
match->dev = dev;
spin_lock_init(&match->intx_lock);
+ spin_lock_init(&match->intx_mask_lock);
match->irq_source_id = -1;
match->kvm = kvm;
match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
@@ -676,6 +890,53 @@ msix_entry_out:
}
#endif
+static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
+ struct kvm_assigned_pci_dev *assigned_dev)
+{
+ int r = 0;
+ struct kvm_assigned_dev_kernel *match;
+
+ mutex_lock(&kvm->lock);
+
+ match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ assigned_dev->assigned_dev_id);
+ if (!match) {
+ r = -ENODEV;
+ goto out;
+ }
+
+ spin_lock(&match->intx_mask_lock);
+
+ match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
+ match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
+
+ if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
+ kvm_set_irq(match->kvm, match->irq_source_id,
+ match->guest_irq, 0);
+ /*
+ * Masking at hardware-level is performed on demand, i.e. when
+ * an IRQ actually arrives at the host.
+ */
+ } else {
+ /*
+ * Unmask the IRQ line. It may have been masked meanwhile if
+ * we aren't using PCI 2.3 INTx masking on the host side.
+ */
+ spin_lock_irq(&match->intx_lock);
+ if (match->intx_state == KVM_INTX_LINE_DISABLED) {
+ enable_irq(match->host_irq);
+ match->intx_state = KVM_INTX_ENABLED;
+ }
+ spin_unlock_irq(&match->intx_lock);
+ }
+
+ spin_unlock(&match->intx_mask_lock);
+
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
unsigned long arg)
{
@@ -783,6 +1044,15 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
break;
}
#endif
+ case KVM_ASSIGN_SET_INTX_MASK: {
+ struct kvm_assigned_pci_dev assigned_dev;
+
+ r = -EFAULT;
+ if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+ goto out;
+ r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
+ break;
+ }
default:
r = -ENOTTY;
break;