diff mbox series

[V3,13/33] x86/apic/vector: Provide MSI parent domain

Message ID 20221124232326.034672592@linutronix.de (mailing list archive)
State Accepted
Commit b6d5fc3a5245c65f7c83440460a1566d09cc9038
Headers show
Series [V3,01/33] genirq/msi: Rearrange MSI domain flags | expand

Commit Message

Thomas Gleixner Nov. 24, 2022, 11:26 p.m. UTC
Enable MSI parent domain support in the x86 vector domain and fix up the
checks in the iommu implementations to check whether device::msi::domain is
the default MSI parent domain. That keeps the existing logic to protect
e.g. devices behind VMD working.

The interrupt remap PCI/MSI code still works because the underlying vector
domain still provides the same functionality.

None of the other x86 PCI/MSI implementations, e.g. XEN and HyperV, are
affected either. They still work the same way both at the low level and in
the PCI/MSI implementations they provide.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
V2: Fix kernel doc (robot)
---
 arch/x86/include/asm/msi.h          |    6 +
 arch/x86/include/asm/pci.h          |    1 
 arch/x86/kernel/apic/msi.c          |  176 ++++++++++++++++++++++++++----------
 drivers/iommu/amd/iommu.c           |    2 
 drivers/iommu/intel/irq_remapping.c |    2 
 5 files changed, 138 insertions(+), 49 deletions(-)

Comments

Jason Gunthorpe Jan. 4, 2023, 12:34 p.m. UTC | #1
On Fri, Nov 25, 2022 at 12:26:05AM +0100, Thomas Gleixner wrote:
> Enable MSI parent domain support in the x86 vector domain and fixup the
> checks in the iommu implementations to check whether device::msi::domain is
> the default MSI parent domain. That keeps the existing logic to protect
> e.g. devices behind VMD working.
> 
> The interrupt remap PCI/MSI code still works because the underlying vector
> domain still provides the same functionality.
> 
> None of the other x86 PCI/MSI, e.g. XEN and HyperV, implementations are
> affected either. They still work the same way both at the low level and the
> PCI/MSI implementations they provide.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
> V2: Fix kernel doc (robot)
> ---
>  arch/x86/include/asm/msi.h          |    6 +
>  arch/x86/include/asm/pci.h          |    1 
>  arch/x86/kernel/apic/msi.c          |  176 ++++++++++++++++++++++++++----------
>  drivers/iommu/amd/iommu.c           |    2 
>  drivers/iommu/intel/irq_remapping.c |    2
>  5 files changed, 138 insertions(+), 49 deletions(-)

Our test team has discovered some kmemleak complaints on rc1 and
bisected them to this patch.

I don't see an obvious way that fwnode gets destroyed here. So maybe
it should be like this?

diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 955267bbc2be63..cbbcb7fd2bd00d 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -1000,7 +1000,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
 fail:
 	msi_unlock_descs(dev);
 free_fwnode:
-	kfree(fwnode);
+	irq_domain_free_fwnode(fwnode); // ???
 free_bundle:
 	kfree(bundle);
 	return false;
@@ -1013,6 +1013,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
  */
 void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)
 {
+	struct fwnode_handle *fwnode = NULL;
 	struct msi_domain_info *info;
 	struct irq_domain *domain;
 
@@ -1025,7 +1026,10 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)
 
 	dev->msi.data->__domains[domid].domain = NULL;
 	info = domain->host_data;
+	if (domain->flags & IRQ_DOMAIN_FLAG_MSI_DEVICE)
+		fwnode = domain->fwnode;
 	irq_domain_remove(domain);
+	irq_domain_free_fwnode(fwnode);
 	kfree(container_of(info, struct msi_domain_template, info));
 
 unlock:

Thanks,
Jason

kmemleak trace
unreferenced object 0xffff888120ba9a00 (size 96):
  comm "systemd-modules", pid 221, jiffies 4294893411 (age 635.732s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 e0 19 8b 83 ff ff ff ff  ................
    00 00 00 00 00 00 00 00 18 9a ba 20 81 88 ff ff  ........... ....
  backtrace:
    [<00000000bcb7f3b1>] kmalloc_trace+0x27/0x110
    [<000000008cdbc98d>] __irq_domain_alloc_fwnode+0x51/0x2b0
    [<00000000c57acf9d>] msi_create_device_irq_domain+0x283/0x670
    [<000000009b567982>] __pci_enable_msix_range+0x49e/0xdb0
    [<0000000077cc1445>] pci_alloc_irq_vectors_affinity+0x11f/0x1c0
    [<00000000532e9ef5>] mlx5_irq_table_create+0x24c/0x940 [mlx5_core]
    [<00000000fabd2b80>] mlx5_load+0x1fa/0x680 [mlx5_core]
    [<000000006bb22ae4>] mlx5_init_one+0x485/0x670 [mlx5_core]
    [<00000000eaa5e1ad>] probe_one+0x4c2/0x720 [mlx5_core]
    [<00000000df8efb43>] local_pci_probe+0xd6/0x170
    [<0000000085cb9924>] pci_device_probe+0x231/0x6e0
    [<000000002671d86e>] really_probe+0x1cf/0xaa0
    [<000000002aeba218>] __driver_probe_device+0x18f/0x470
    [<000000002aec9527>] driver_probe_device+0x49/0x120
    [<000000005f45a989>] __driver_attach+0x1ff/0x4a0
    [<0000000000dcaab2>] bus_for_each_dev+0x11e/0x1a0
unreferenced object 0xffff888120baa800 (size 32):
  comm "systemd-modules", pid 221, jiffies 4294893411 (age 635.732s)
  hex dump (first 32 bytes):
    50 43 49 2d 4d 53 49 58 2d 30 30 30 30 3a 30 38  PCI-MSIX-0000:08
    3a 30 30 2e 30 00 ff ff 00 00 00 00 00 00 00 00  :00.0...........
  backtrace:
    [<00000000bef783eb>] __kmalloc_node_track_caller+0x4c/0x1b0
    [<00000000f16b54a8>] kvasprintf+0xb0/0x130
    [<0000000078634624>] kasprintf+0xa6/0xd0
    [<00000000f17eea1c>] __irq_domain_alloc_fwnode+0x1ce/0x2b0
    [<00000000c57acf9d>] msi_create_device_irq_domain+0x283/0x670
    [<000000009b567982>] __pci_enable_msix_range+0x49e/0xdb0
    [<0000000077cc1445>] pci_alloc_irq_vectors_affinity+0x11f/0x1c0
    [<00000000532e9ef5>] mlx5_irq_table_create+0x24c/0x940 [mlx5_core]
    [<00000000fabd2b80>] mlx5_load+0x1fa/0x680 [mlx5_core]
    [<000000006bb22ae4>] mlx5_init_one+0x485/0x670 [mlx5_core]
    [<00000000eaa5e1ad>] probe_one+0x4c2/0x720 [mlx5_core]
    [<00000000df8efb43>] local_pci_probe+0xd6/0x170
    [<0000000085cb9924>] pci_device_probe+0x231/0x6e0
    [<000000002671d86e>] really_probe+0x1cf/0xaa0
    [<000000002aeba218>] __driver_probe_device+0x18f/0x470
    [<000000002aec9527>] driver_probe_device+0x49/0x120
unreferenced object 0xffff88812bc8ca80 (size 96):
  comm "systemd-modules", pid 221, jiffies 4294893596 (age 634.996s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 e0 19 8b 83 ff ff ff ff  ................
    00 00 00 00 00 00 00 00 98 ca c8 2b 81 88 ff ff  ...........+....
  backtrace:
    [<00000000bcb7f3b1>] kmalloc_trace+0x27/0x110
    [<000000008cdbc98d>] __irq_domain_alloc_fwnode+0x51/0x2b0
    [<00000000c57acf9d>] msi_create_device_irq_domain+0x283/0x670
    [<000000009b567982>] __pci_enable_msix_range+0x49e/0xdb0
    [<0000000077cc1445>] pci_alloc_irq_vectors_affinity+0x11f/0x1c0
    [<00000000532e9ef5>] mlx5_irq_table_create+0x24c/0x940 [mlx5_core]
    [<00000000fabd2b80>] mlx5_load+0x1fa/0x680 [mlx5_core]
    [<000000006bb22ae4>] mlx5_init_one+0x485/0x670 [mlx5_core]
    [<00000000eaa5e1ad>] probe_one+0x4c2/0x720 [mlx5_core]
    [<00000000df8efb43>] local_pci_probe+0xd6/0x170
    [<0000000085cb9924>] pci_device_probe+0x231/0x6e0
    [<000000002671d86e>] really_probe+0x1cf/0xaa0
    [<000000002aeba218>] __driver_probe_device+0x18f/0x470
    [<000000002aec9527>] driver_probe_device+0x49/0x120
    [<000000005f45a989>] __driver_attach+0x1ff/0x4a0
    [<0000000000dcaab2>] bus_for_each_dev+0x11e/0x1a0
unreferenced object 0xffff88812bc8dcc0 (size 32):
  comm "systemd-modules", pid 221, jiffies 4294893596 (age 635.000s)
  hex dump (first 32 bytes):
    50 43 49 2d 4d 53 49 58 2d 30 30 30 30 3a 30 38  PCI-MSIX-0000:08
    3a 30 30 2e 31 00 ff ff 82 97 0b 00 00 00 00 00  :00.1...........
  backtrace:
    [<00000000bef783eb>] __kmalloc_node_track_caller+0x4c/0x1b0
    [<00000000f16b54a8>] kvasprintf+0xb0/0x130
    [<0000000078634624>] kasprintf+0xa6/0xd0
    [<00000000f17eea1c>] __irq_domain_alloc_fwnode+0x1ce/0x2b0
    [<00000000c57acf9d>] msi_create_device_irq_domain+0x283/0x670
    [<000000009b567982>] __pci_enable_msix_range+0x49e/0xdb0
    [<0000000077cc1445>] pci_alloc_irq_vectors_affinity+0x11f/0x1c0
    [<00000000532e9ef5>] mlx5_irq_table_create+0x24c/0x940 [mlx5_core]
    [<00000000fabd2b80>] mlx5_load+0x1fa/0x680 [mlx5_core]
    [<000000006bb22ae4>] mlx5_init_one+0x485/0x670 [mlx5_core]
    [<00000000eaa5e1ad>] probe_one+0x4c2/0x720 [mlx5_core]
    [<00000000df8efb43>] local_pci_probe+0xd6/0x170
    [<0000000085cb9924>] pci_device_probe+0x231/0x6e0
    [<000000002671d86e>] really_probe+0x1cf/0xaa0
    [<000000002aeba218>] __driver_probe_device+0x18f/0x470
    [<000000002aec9527>] driver_probe_device+0x49/0x120
unreferenced object 0xffff888108177580 (size 96):
  comm "sh", pid 9721, jiffies 4294943281 (age 436.568s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 e0 19 8b 83 ff ff ff ff  ................
    00 00 00 00 00 00 00 00 98 75 17 08 81 88 ff ff  .........u......
  backtrace:
    [<00000000bcb7f3b1>] kmalloc_trace+0x27/0x110
    [<000000008cdbc98d>] __irq_domain_alloc_fwnode+0x51/0x2b0
    [<00000000c57acf9d>] msi_create_device_irq_domain+0x283/0x670
    [<000000009b567982>] __pci_enable_msix_range+0x49e/0xdb0
    [<0000000077cc1445>] pci_alloc_irq_vectors_affinity+0x11f/0x1c0
    [<00000000532e9ef5>] mlx5_irq_table_create+0x24c/0x940 [mlx5_core]
    [<00000000fabd2b80>] mlx5_load+0x1fa/0x680 [mlx5_core]
    [<000000006bb22ae4>] mlx5_init_one+0x485/0x670 [mlx5_core]
    [<00000000eaa5e1ad>] probe_one+0x4c2/0x720 [mlx5_core]
    [<00000000df8efb43>] local_pci_probe+0xd6/0x170
    [<0000000085cb9924>] pci_device_probe+0x231/0x6e0
    [<000000002671d86e>] really_probe+0x1cf/0xaa0
    [<000000002aeba218>] __driver_probe_device+0x18f/0x470
    [<000000002aec9527>] driver_probe_device+0x49/0x120
    [<000000004aebbb6e>] __device_attach_driver+0x157/0x280
    [<00000000c3894808>] bus_for_each_drv+0x123/0x1a0
unreferenced object 0xffff8881525f1680 (size 32):
  comm "sh", pid 9721, jiffies 4294943281 (age 436.568s)
  hex dump (first 32 bytes):
    50 43 49 2d 4d 53 49 58 2d 30 30 30 30 3a 30 38  PCI-MSIX-0000:08
    3a 30 30 2e 32 00 ff ff 00 00 00 00 00 00 00 00  :00.2...........
  backtrace:
    [<00000000bef783eb>] __kmalloc_node_track_caller+0x4c/0x1b0
    [<00000000f16b54a8>] kvasprintf+0xb0/0x130
    [<0000000078634624>] kasprintf+0xa6/0xd0
    [<00000000f17eea1c>] __irq_domain_alloc_fwnode+0x1ce/0x2b0
    [<00000000c57acf9d>] msi_create_device_irq_domain+0x283/0x670
    [<000000009b567982>] __pci_enable_msix_range+0x49e/0xdb0
    [<0000000077cc1445>] pci_alloc_irq_vectors_affinity+0x11f/0x1c0
    [<00000000532e9ef5>] mlx5_irq_table_create+0x24c/0x940 [mlx5_core]
    [<00000000fabd2b80>] mlx5_load+0x1fa/0x680 [mlx5_core]
    [<000000006bb22ae4>] mlx5_init_one+0x485/0x670 [mlx5_core]
    [<00000000eaa5e1ad>] probe_one+0x4c2/0x720 [mlx5_core]
    [<00000000df8efb43>] local_pci_probe+0xd6/0x170
    [<0000000085cb9924>] pci_device_probe+0x231/0x6e0
    [<000000002671d86e>] really_probe+0x1cf/0xaa0
    [<000000002aeba218>] __driver_probe_device+0x18f/0x470
    [<000000002aec9527>] driver_probe_device+0x49/0x120
unreferenced object 0xffff888155ac9f00 (size 96):
  comm "sh", pid 9721, jiffies 4294943493 (age 435.768s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 e0 19 8b 83 ff ff ff ff  ................
    00 00 00 00 00 00 00 00 18 9f ac 55 81 88 ff ff  ...........U....
  backtrace:
    [<00000000bcb7f3b1>] kmalloc_trace+0x27/0x110
    [<000000008cdbc98d>] __irq_domain_alloc_fwnode+0x51/0x2b0
    [<00000000c57acf9d>] msi_create_device_irq_domain+0x283/0x670
    [<000000009b567982>] __pci_enable_msix_range+0x49e/0xdb0
    [<0000000077cc1445>] pci_alloc_irq_vectors_affinity+0x11f/0x1c0
    [<00000000532e9ef5>] mlx5_irq_table_create+0x24c/0x940 [mlx5_core]
    [<00000000fabd2b80>] mlx5_load+0x1fa/0x680 [mlx5_core]
    [<000000006bb22ae4>] mlx5_init_one+0x485/0x670 [mlx5_core]
    [<00000000eaa5e1ad>] probe_one+0x4c2/0x720 [mlx5_core]
    [<00000000df8efb43>] local_pci_probe+0xd6/0x170
    [<0000000085cb9924>] pci_device_probe+0x231/0x6e0
    [<000000002671d86e>] really_probe+0x1cf/0xaa0
    [<000000002aeba218>] __driver_probe_device+0x18f/0x470
    [<000000002aec9527>] driver_probe_device+0x49/0x120
    [<000000004aebbb6e>] __device_attach_driver+0x157/0x280
    [<00000000c3894808>] bus_for_each_drv+0x123/0x1a0
unreferenced object 0xffff88816b4dfd40 (size 32):
  comm "sh", pid 9721, jiffies 4294943493 (age 435.808s)
  hex dump (first 32 bytes):
    50 43 49 2d 4d 53 49 58 2d 30 30 30 30 3a 30 38  PCI-MSIX-0000:08
    3a 30 30 2e 33 00 ff ff 00 00 00 00 00 00 00 00  :00.3...........
  backtrace:
    [<00000000bef783eb>] __kmalloc_node_track_caller+0x4c/0x1b0
    [<00000000f16b54a8>] kvasprintf+0xb0/0x130
    [<0000000078634624>] kasprintf+0xa6/0xd0
    [<00000000f17eea1c>] __irq_domain_alloc_fwnode+0x1ce/0x2b0
    [<00000000c57acf9d>] msi_create_device_irq_domain+0x283/0x670
    [<000000009b567982>] __pci_enable_msix_range+0x49e/0xdb0
    [<0000000077cc1445>] pci_alloc_irq_vectors_affinity+0x11f/0x1c0
    [<00000000532e9ef5>] mlx5_irq_table_create+0x24c/0x940 [mlx5_core]
    [<00000000fabd2b80>] mlx5_load+0x1fa/0x680 [mlx5_core]
    [<000000006bb22ae4>] mlx5_init_one+0x485/0x670 [mlx5_core]
    [<00000000eaa5e1ad>] probe_one+0x4c2/0x720 [mlx5_core]
    [<00000000df8efb43>] local_pci_probe+0xd6/0x170
    [<0000000085cb9924>] pci_device_probe+0x231/0x6e0
    [<000000002671d86e>] really_probe+0x1cf/0xaa0
    [<000000002aeba218>] __driver_probe_device+0x18f/0x470
    [<000000002aec9527>] driver_probe_device+0x49/0x120
unreferenced object 0xffff88812e17e380 (size 96):
  comm "sh", pid 9828, jiffies 4294944405 (age 432.160s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 e0 19 8b 83 ff ff ff ff  ................
    00 00 00 00 00 00 00 00 98 e3 17 2e 81 88 ff ff  ................
  backtrace:
    [<00000000bcb7f3b1>] kmalloc_trace+0x27/0x110
    [<000000008cdbc98d>] __irq_domain_alloc_fwnode+0x51/0x2b0
    [<00000000c57acf9d>] msi_create_device_irq_domain+0x283/0x670
    [<000000009b567982>] __pci_enable_msix_range+0x49e/0xdb0
    [<0000000077cc1445>] pci_alloc_irq_vectors_affinity+0x11f/0x1c0
    [<00000000532e9ef5>] mlx5_irq_table_create+0x24c/0x940 [mlx5_core]
    [<00000000fabd2b80>] mlx5_load+0x1fa/0x680 [mlx5_core]
    [<000000006bb22ae4>] mlx5_init_one+0x485/0x670 [mlx5_core]
    [<00000000eaa5e1ad>] probe_one+0x4c2/0x720 [mlx5_core]
    [<00000000df8efb43>] local_pci_probe+0xd6/0x170
    [<0000000085cb9924>] pci_device_probe+0x231/0x6e0
    [<000000002671d86e>] really_probe+0x1cf/0xaa0
    [<000000002aeba218>] __driver_probe_device+0x18f/0x470
    [<000000002aec9527>] driver_probe_device+0x49/0x120
    [<000000004aebbb6e>] __device_attach_driver+0x157/0x280
    [<00000000c3894808>] bus_for_each_drv+0x123/0x1a0
unreferenced object 0xffff8881557a9bc0 (size 32):
  comm "sh", pid 9828, jiffies 4294944405 (age 432.160s)
  hex dump (first 32 bytes):
    50 43 49 2d 4d 53 49 58 2d 30 30 30 30 3a 30 38  PCI-MSIX-0000:08
    3a 30 30 2e 36 00 ff ff 00 00 00 00 00 00 00 00  :00.6...........
  backtrace:
    [<00000000bef783eb>] __kmalloc_node_track_caller+0x4c/0x1b0
    [<00000000f16b54a8>] kvasprintf+0xb0/0x130
    [<0000000078634624>] kasprintf+0xa6/0xd0
    [<00000000f17eea1c>] __irq_domain_alloc_fwnode+0x1ce/0x2b0
    [<00000000c57acf9d>] msi_create_device_irq_domain+0x283/0x670
    [<000000009b567982>] __pci_enable_msix_range+0x49e/0xdb0
    [<0000000077cc1445>] pci_alloc_irq_vectors_affinity+0x11f/0x1c0
    [<00000000532e9ef5>] mlx5_irq_table_create+0x24c/0x940 [mlx5_core]
    [<00000000fabd2b80>] mlx5_load+0x1fa/0x680 [mlx5_core]
    [<000000006bb22ae4>] mlx5_init_one+0x485/0x670 [mlx5_core]
    [<00000000eaa5e1ad>] probe_one+0x4c2/0x720 [mlx5_core]
    [<00000000df8efb43>] local_pci_probe+0xd6/0x170
    [<0000000085cb9924>] pci_device_probe+0x231/0x6e0
    [<000000002671d86e>] really_probe+0x1cf/0xaa0
    [<000000002aeba218>] __driver_probe_device+0x18f/0x470
    [<000000002aec9527>] driver_probe_device+0x49/0x120
unreferenced object 0xffff88813f624380 (size 96):
  comm "sh", pid 9828, jiffies 4294944654 (age 431.208s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 e0 19 8b 83 ff ff ff ff  ................
    00 00 00 00 00 00 00 00 98 43 62 3f 81 88 ff ff  .........Cb?....
  backtrace:
    [<00000000bcb7f3b1>] kmalloc_trace+0x27/0x110
    [<000000008cdbc98d>] __irq_domain_alloc_fwnode+0x51/0x2b0
    [<00000000c57acf9d>] msi_create_device_irq_domain+0x283/0x670
    [<000000009b567982>] __pci_enable_msix_range+0x49e/0xdb0
    [<0000000077cc1445>] pci_alloc_irq_vectors_affinity+0x11f/0x1c0
    [<00000000532e9ef5>] mlx5_irq_table_create+0x24c/0x940 [mlx5_core]
    [<00000000fabd2b80>] mlx5_load+0x1fa/0x680 [mlx5_core]
    [<000000006bb22ae4>] mlx5_init_one+0x485/0x670 [mlx5_core]
    [<00000000eaa5e1ad>] probe_one+0x4c2/0x720 [mlx5_core]
    [<00000000df8efb43>] local_pci_probe+0xd6/0x170
    [<0000000085cb9924>] pci_device_probe+0x231/0x6e0
    [<000000002671d86e>] really_probe+0x1cf/0xaa0
    [<000000002aeba218>] __driver_probe_device+0x18f/0x470
    [<000000002aec9527>] driver_probe_device+0x49/0x120
    [<000000004aebbb6e>] __device_attach_driver+0x157/0x280
    [<00000000c3894808>] bus_for_each_drv+0x123/0x1a0
unreferenced object 0xffff88813a95c440 (size 32):
  comm "sh", pid 9828, jiffies 4294944654 (age 431.208s)
  hex dump (first 32 bytes):
    50 43 49 2d 4d 53 49 58 2d 30 30 30 30 3a 30 38  PCI-MSIX-0000:08
    3a 30 30 2e 37 00 ff ff 2f 5f 5f 70 79 63 61 63  :00.7.../__pycac
  backtrace:
    [<00000000bef783eb>] __kmalloc_node_track_caller+0x4c/0x1b0
    [<00000000f16b54a8>] kvasprintf+0xb0/0x130
    [<0000000078634624>] kasprintf+0xa6/0xd0
    [<00000000f17eea1c>] __irq_domain_alloc_fwnode+0x1ce/0x2b0
    [<00000000c57acf9d>] msi_create_device_irq_domain+0x283/0x670
    [<000000009b567982>] __pci_enable_msix_range+0x49e/0xdb0
    [<0000000077cc1445>] pci_alloc_irq_vectors_affinity+0x11f/0x1c0
    [<00000000532e9ef5>] mlx5_irq_table_create+0x24c/0x940 [mlx5_core]
    [<00000000fabd2b80>] mlx5_load+0x1fa/0x680 [mlx5_core]
    [<000000006bb22ae4>] mlx5_init_one+0x485/0x670 [mlx5_core]
    [<00000000eaa5e1ad>] probe_one+0x4c2/0x720 [mlx5_core]
    [<00000000df8efb43>] local_pci_probe+0xd6/0x170
    [<0000000085cb9924>] pci_device_probe+0x231/0x6e0
    [<000000002671d86e>] really_probe+0x1cf/0xaa0
    [<000000002aeba218>] __driver_probe_device+0x18f/0x470
    [<000000002aec9527>] driver_probe_device+0x49/0x120
unreferenced object 0xffff88813aa3b880 (size 96):
  comm "sh", pid 10020, jiffies 4294950696 (age 407.044s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 e0 19 8b 83 ff ff ff ff  ................
    00 00 00 00 00 00 00 00 98 b8 a3 3a 81 88 ff ff  ...........:....
  backtrace:
    [<00000000bcb7f3b1>] kmalloc_trace+0x27/0x110
    [<000000008cdbc98d>] __irq_domain_alloc_fwnode+0x51/0x2b0
    [<00000000c57acf9d>] msi_create_device_irq_domain+0x283/0x670
    [<000000009b567982>] __pci_enable_msix_range+0x49e/0xdb0
    [<0000000077cc1445>] pci_alloc_irq_vectors_affinity+0x11f/0x1c0
    [<00000000532e9ef5>] mlx5_irq_table_create+0x24c/0x940 [mlx5_core]
    [<00000000fabd2b80>] mlx5_load+0x1fa/0x680 [mlx5_core]
    [<000000006bb22ae4>] mlx5_init_one+0x485/0x670 [mlx5_core]
    [<00000000eaa5e1ad>] probe_one+0x4c2/0x720 [mlx5_core]
    [<00000000df8efb43>] local_pci_probe+0xd6/0x170
    [<0000000085cb9924>] pci_device_probe+0x231/0x6e0
    [<000000002671d86e>] really_probe+0x1cf/0xaa0
    [<000000002aeba218>] __driver_probe_device+0x18f/0x470
    [<000000000d688957>] device_driver_attach+0xae/0x1b0
    [<00000000003e203b>] bind_store+0x150/0x1f0
    [<000000003b2d7ae5>] kernfs_fop_write_iter+0x348/0x520
unreferenced object 0xffff888142df4b80 (size 32):
  comm "sh", pid 10020, jiffies 4294950696 (age 407.088s)
  hex dump (first 32 bytes):
    50 43 49 2d 4d 53 49 58 2d 30 30 30 30 3a 30 38  PCI-MSIX-0000:08
    3a 30 30 2e 32 00 ff ff 00 b0 f4 60 01 00 00 00  :00.2......`....
  backtrace:
    [<00000000bef783eb>] __kmalloc_node_track_caller+0x4c/0x1b0
    [<00000000f16b54a8>] kvasprintf+0xb0/0x130
    [<0000000078634624>] kasprintf+0xa6/0xd0
    [<00000000f17eea1c>] __irq_domain_alloc_fwnode+0x1ce/0x2b0
    [<00000000c57acf9d>] msi_create_device_irq_domain+0x283/0x670
    [<000000009b567982>] __pci_enable_msix_range+0x49e/0xdb0
    [<0000000077cc1445>] pci_alloc_irq_vectors_affinity+0x11f/0x1c0
    [<00000000532e9ef5>] mlx5_irq_table_create+0x24c/0x940 [mlx5_core]
    [<00000000fabd2b80>] mlx5_load+0x1fa/0x680 [mlx5_core]
    [<000000006bb22ae4>] mlx5_init_one+0x485/0x670 [mlx5_core]
    [<00000000eaa5e1ad>] probe_one+0x4c2/0x720 [mlx5_core]
    [<00000000df8efb43>] local_pci_probe+0xd6/0x170
    [<0000000085cb9924>] pci_device_probe+0x231/0x6e0
    [<000000002671d86e>] really_probe+0x1cf/0xaa0
    [<000000002aeba218>] __driver_probe_device+0x18f/0x470
    [<000000000d688957>] device_driver_attach+0xae/0x1b0
unreferenced object 0xffff88816cd32780 (size 96):
  comm "sh", pid 10050, jiffies 4294950903 (age 406.300s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 e0 19 8b 83 ff ff ff ff  ................
    00 00 00 00 00 00 00 00 98 27 d3 6c 81 88 ff ff  .........'.l....
  backtrace:
    [<00000000bcb7f3b1>] kmalloc_trace+0x27/0x110
    [<000000008cdbc98d>] __irq_domain_alloc_fwnode+0x51/0x2b0
    [<00000000c57acf9d>] msi_create_device_irq_domain+0x283/0x670
    [<000000009b567982>] __pci_enable_msix_range+0x49e/0xdb0
    [<0000000077cc1445>] pci_alloc_irq_vectors_affinity+0x11f/0x1c0
    [<00000000532e9ef5>] mlx5_irq_table_create+0x24c/0x940 [mlx5_core]
    [<00000000fabd2b80>] mlx5_load+0x1fa/0x680 [mlx5_core]
    [<000000006bb22ae4>] mlx5_init_one+0x485/0x670 [mlx5_core]
    [<00000000eaa5e1ad>] probe_one+0x4c2/0x720 [mlx5_core]
    [<00000000df8efb43>] local_pci_probe+0xd6/0x170
    [<0000000085cb9924>] pci_device_probe+0x231/0x6e0
    [<000000002671d86e>] really_probe+0x1cf/0xaa0
    [<000000002aeba218>] __driver_probe_device+0x18f/0x470
    [<000000000d688957>] device_driver_attach+0xae/0x1b0
    [<00000000003e203b>] bind_store+0x150/0x1f0
    [<000000003b2d7ae5>] kernfs_fop_write_iter+0x348/0x520
unreferenced object 0xffff88816b1df980 (size 32):
  comm "sh", pid 10050, jiffies 4294950903 (age 406.300s)
  hex dump (first 32 bytes):
    50 43 49 2d 4d 53 49 58 2d 30 30 30 30 3a 30 38  PCI-MSIX-0000:08
    3a 30 30 2e 33 00 ff ff 00 00 00 00 00 00 00 00  :00.3...........
  backtrace:
    [<00000000bef783eb>] __kmalloc_node_track_caller+0x4c/0x1b0
    [<00000000f16b54a8>] kvasprintf+0xb0/0x130
    [<0000000078634624>] kasprintf+0xa6/0xd0
    [<00000000f17eea1c>] __irq_domain_alloc_fwnode+0x1ce/0x2b0
    [<00000000c57acf9d>] msi_create_device_irq_domain+0x283/0x670
    [<000000009b567982>] __pci_enable_msix_range+0x49e/0xdb0
    [<0000000077cc1445>] pci_alloc_irq_vectors_affinity+0x11f/0x1c0
    [<00000000532e9ef5>] mlx5_irq_table_create+0x24c/0x940 [mlx5_core]
    [<00000000fabd2b80>] mlx5_load+0x1fa/0x680 [mlx5_core]
    [<000000006bb22ae4>] mlx5_init_one+0x485/0x670 [mlx5_core]
    [<00000000eaa5e1ad>] probe_one+0x4c2/0x720 [mlx5_core]
    [<00000000df8efb43>] local_pci_probe+0xd6/0x170
    [<0000000085cb9924>] pci_device_probe+0x231/0x6e0
    [<000000002671d86e>] really_probe+0x1cf/0xaa0
    [<000000002aeba218>] __driver_probe_device+0x18f/0x470
    [<000000000d688957>] device_driver_attach+0xae/0x1b0
unreferenced object 0xffff8881620cd580 (size 96):
  comm "test-ovn-2-swit", pid 10619, jiffies 4294958587 (age 375.592s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 e0 19 8b 83 ff ff ff ff  ................
    00 00 00 00 00 00 00 00 98 d5 0c 62 81 88 ff ff  ...........b....
  backtrace:
    [<00000000bcb7f3b1>] kmalloc_trace+0x27/0x110
    [<000000008cdbc98d>] __irq_domain_alloc_fwnode+0x51/0x2b0
    [<00000000c57acf9d>] msi_create_device_irq_domain+0x283/0x670
    [<000000009b567982>] __pci_enable_msix_range+0x49e/0xdb0
    [<0000000077cc1445>] pci_alloc_irq_vectors_affinity+0x11f/0x1c0
    [<00000000532e9ef5>] mlx5_irq_table_create+0x24c/0x940 [mlx5_core]
    [<00000000fabd2b80>] mlx5_load+0x1fa/0x680 [mlx5_core]
    [<000000006bb22ae4>] mlx5_init_one+0x485/0x670 [mlx5_core]
    [<00000000eaa5e1ad>] probe_one+0x4c2/0x720 [mlx5_core]
    [<00000000df8efb43>] local_pci_probe+0xd6/0x170
    [<0000000085cb9924>] pci_device_probe+0x231/0x6e0
    [<000000002671d86e>] really_probe+0x1cf/0xaa0
    [<000000002aeba218>] __driver_probe_device+0x18f/0x470
    [<000000000d688957>] device_driver_attach+0xae/0x1b0
    [<00000000003e203b>] bind_store+0x150/0x1f0
    [<000000003b2d7ae5>] kernfs_fop_write_iter+0x348/0x520
unreferenced object 0xffff88815cd13700 (size 32):
  comm "test-ovn-2-swit", pid 10619, jiffies 4294958587 (age 375.636s)
  hex dump (first 32 bytes):
    50 43 49 2d 4d 53 49 58 2d 30 30 30 30 3a 30 38  PCI-MSIX-0000:08
    3a 30 30 2e 32 00 ff ff 80 55 5a 07 00 ea ff ff  :00.2....UZ.....
  backtrace:
    [<00000000bef783eb>] __kmalloc_node_track_caller+0x4c/0x1b0
    [<00000000f16b54a8>] kvasprintf+0xb0/0x130
    [<0000000078634624>] kasprintf+0xa6/0xd0
    [<00000000f17eea1c>] __irq_domain_alloc_fwnode+0x1ce/0x2b0
    [<00000000c57acf9d>] msi_create_device_irq_domain+0x283/0x670
    [<000000009b567982>] __pci_enable_msix_range+0x49e/0xdb0
    [<0000000077cc1445>] pci_alloc_irq_vectors_affinity+0x11f/0x1c0
    [<00000000532e9ef5>] mlx5_irq_table_create+0x24c/0x940 [mlx5_core]
    [<00000000fabd2b80>] mlx5_load+0x1fa/0x680 [mlx5_core]
    [<000000006bb22ae4>] mlx5_init_one+0x485/0x670 [mlx5_core]
    [<00000000eaa5e1ad>] probe_one+0x4c2/0x720 [mlx5_core]
    [<00000000df8efb43>] local_pci_probe+0xd6/0x170
    [<0000000085cb9924>] pci_device_probe+0x231/0x6e0
    [<000000002671d86e>] really_probe+0x1cf/0xaa0
    [<000000002aeba218>] __driver_probe_device+0x18f/0x470
    [<000000000d688957>] device_driver_attach+0xae/0x1b0
unreferenced object 0xffff88816c302400 (size 96):
  comm "test-ovn-2-swit", pid 10619, jiffies 4294958796 (age 374.800s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 e0 19 8b 83 ff ff ff ff  ................
    00 00 00 00 00 00 00 00 18 24 30 6c 81 88 ff ff  .........$0l....
  backtrace:
    [<00000000bcb7f3b1>] kmalloc_trace+0x27/0x110
    [<000000008cdbc98d>] __irq_domain_alloc_fwnode+0x51/0x2b0
    [<00000000c57acf9d>] msi_create_device_irq_domain+0x283/0x670
    [<000000009b567982>] __pci_enable_msix_range+0x49e/0xdb0
    [<0000000077cc1445>] pci_alloc_irq_vectors_affinity+0x11f/0x1c0
    [<00000000532e9ef5>] mlx5_irq_table_create+0x24c/0x940 [mlx5_core]
    [<00000000fabd2b80>] mlx5_load+0x1fa/0x680 [mlx5_core]
    [<000000006bb22ae4>] mlx5_init_one+0x485/0x670 [mlx5_core]
    [<00000000eaa5e1ad>] probe_one+0x4c2/0x720 [mlx5_core]
    [<00000000df8efb43>] local_pci_probe+0xd6/0x170
    [<0000000085cb9924>] pci_device_probe+0x231/0x6e0
    [<000000002671d86e>] really_probe+0x1cf/0xaa0
    [<000000002aeba218>] __driver_probe_device+0x18f/0x470
    [<000000000d688957>] device_driver_attach+0xae/0x1b0
    [<00000000003e203b>] bind_store+0x150/0x1f0
    [<000000003b2d7ae5>] kernfs_fop_write_iter+0x348/0x520
unreferenced object 0xffff88812cfd9180 (size 32):
  comm "test-ovn-2-swit", pid 10619, jiffies 4294958796 (age 374.800s)
  hex dump (first 32 bytes):
    50 43 49 2d 4d 53 49 58 2d 30 30 30 30 3a 30 38  PCI-MSIX-0000:08
    3a 30 30 2e 33 00 ff ff 73 00 00 00 00 00 00 00  :00.3...s.......
  backtrace:
    [<00000000bef783eb>] __kmalloc_node_track_caller+0x4c/0x1b0
    [<00000000f16b54a8>] kvasprintf+0xb0/0x130
    [<0000000078634624>] kasprintf+0xa6/0xd0
    [<00000000f17eea1c>] __irq_domain_alloc_fwnode+0x1ce/0x2b0
    [<00000000c57acf9d>] msi_create_device_irq_domain+0x283/0x670
    [<000000009b567982>] __pci_enable_msix_range+0x49e/0xdb0
    [<0000000077cc1445>] pci_alloc_irq_vectors_affinity+0x11f/0x1c0
    [<00000000532e9ef5>] mlx5_irq_table_create+0x24c/0x940 [mlx5_core]
    [<00000000fabd2b80>] mlx5_load+0x1fa/0x680 [mlx5_core]
    [<000000006bb22ae4>] mlx5_init_one+0x485/0x670 [mlx5_core]
    [<00000000eaa5e1ad>] probe_one+0x4c2/0x720 [mlx5_core]
    [<00000000df8efb43>] local_pci_probe+0xd6/0x170
    [<0000000085cb9924>] pci_device_probe+0x231/0x6e0
    [<000000002671d86e>] really_probe+0x1cf/0xaa0
    [<000000002aeba218>] __driver_probe_device+0x18f/0x470
    [<000000000d688957>] device_driver_attach+0xae/0x1b0
Thomas Gleixner Jan. 9, 2023, 8:32 p.m. UTC | #2
Jason!

On Wed, Jan 04 2023 at 08:34, Jason Gunthorpe wrote:
>
> Our test team has discovered some kmem leak complaints on rc1 and
> bisected it to this patch.
>
> I don't see an obvious way that fwnode gets destroyed here. So maybe
> it should be like this?

I'm back from vacation now. Will have a look tomorrow.

Thanks,

        tglx
Thomas Gleixner Jan. 10, 2023, 12:14 p.m. UTC | #3
Jason,

On Wed, Jan 04 2023 at 08:34, Jason Gunthorpe wrote:
> Our test team has discovered some kmem leak complaints on rc1 and
> bisected it to this patch.
>
> I don't see an obvious way that fwnode gets destroyed here. So maybe
> it should be like this?
>
> diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
> index 955267bbc2be63..cbbcb7fd2bd00d 100644
> --- a/kernel/irq/msi.c
> +++ b/kernel/irq/msi.c
> @@ -1000,7 +1000,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
>  fail:
>  	msi_unlock_descs(dev);
>  free_fwnode:
> -	kfree(fwnode);
> +	irq_domain_free_fwnode(fwnode); // ???

That's correct. kfree(fwnode) leaks fwnode->name

>  free_bundle:
>  	kfree(bundle);
>  	return false;
> @@ -1013,6 +1013,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
>   */
>  void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)
>  {
> +	struct fwnode_handle *fwnode = NULL;
>  	struct msi_domain_info *info;
>  	struct irq_domain *domain;
>  
> @@ -1025,7 +1026,10 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)
>  
>  	dev->msi.data->__domains[domid].domain = NULL;
>  	info = domain->host_data;
> +	if (domain->flags & IRQ_DOMAIN_FLAG_MSI_DEVICE)
> +		fwnode = domain->fwnode;

irq_domain_is_msi_device() ?

>  	irq_domain_remove(domain);
> +	irq_domain_free_fwnode(fwnode);

For some reason I thought the fwnode would be handled by
irq_domain_remove() but fwnode_handle_put() is a NOP for the named
fwnodes.

Care to send a proper patch with changelog?

Thanks,

        tglx
Jason Gunthorpe Jan. 10, 2023, 2:59 p.m. UTC | #4
On Tue, Jan 10, 2023 at 01:14:00PM +0100, Thomas Gleixner wrote:

> Care to send a proper patch with changelog?

Yes, I'll post it in a few days once the test team confirms it

Thanks,
Jason
Kalle Valo Jan. 11, 2023, 4:02 p.m. UTC | #5
Jason Gunthorpe <jgg@nvidia.com> writes:

> On Tue, Jan 10, 2023 at 01:14:00PM +0100, Thomas Gleixner wrote:
>
>> Care to send a proper patch with changelog?
>
> Yes, I'll post it in a few days once the test team confirms it

I think I'm seeing the same leak and it's spamming logs on my test box a
lot. Let me know if you need any help with testing, I can do that pretty
quickly.

unreferenced object 0xffff888113dc7520 (size 96):
comm "insmod", pid 50676, jiffies 4301551867 (age 1463.666s)
hex dump (first 32 bytes):
00 00 00 00 00 00 00 00 00 25 68 a5 ff ff ff ff  .........%h.....
00 00 00 00 00 00 00 00 38 75 dc 13 81 88 ff ff  ........8u......
backtrace:
[<ffffffffa3105532>] __kmem_cache_alloc_node+0x1d2/0x2b0
[<ffffffffa2fdfb45>] kmalloc_trace+0x25/0x60
[<ffffffffa2cb8b42>] __irq_domain_alloc_fwnode+0x52/0x2b0
[<ffffffffa2cc6add>] msi_create_device_irq_domain+0x27d/0x630
[<ffffffffa3aaf5a9>] pci_setup_msi_device_domain+0xe9/0x120
[<ffffffffa3aababd>] __pci_enable_msi_range+0x3fd/0x5a0
[<ffffffffa3aa8ac3>] pci_alloc_irq_vectors_affinity+0x153/0x200
[<ffffffffa3aa8b7c>] pci_alloc_irq_vectors+0xc/0x10
[<ffffffffc0b75287>] ath11k_pci_alloc_msi+0xb7/0x610 [ath11k_pci]
[<ffffffffc0b7696e>] ath11k_pci_probe+0x5be/0x1090 [ath11k_pci]
[<ffffffffa3a8d4e9>] local_pci_probe+0xd9/0x170
[<ffffffffa3a8f687>] pci_call_probe+0x167/0x440
[<ffffffffa3a919f6>] pci_device_probe+0xa6/0x100
[<ffffffffa43c2c09>] really_probe+0x1c9/0xa50
[<ffffffffa43c361a>] __driver_probe_device+0x18a/0x460
[<ffffffffa43c393a>] driver_probe_device+0x4a/0x120
Jason Gunthorpe Jan. 11, 2023, 4:35 p.m. UTC | #6
On Wed, Jan 11, 2023 at 06:02:13PM +0200, Kalle Valo wrote:
> Jason Gunthorpe <jgg@nvidia.com> writes:
> 
> > On Tue, Jan 10, 2023 at 01:14:00PM +0100, Thomas Gleixner wrote:
> >
> >> Care to send a proper patch with changelog?
> >
> > Yes, I'll post it in a few days once the test team confirms it
> 
> I think I'm seeing the same leak and it's spamming logs on my test box a
> lot. Let me know if you need any help with testing, I can do that pretty
> quickly.

https://github.com/jgunthorpe/linux/commits/msi_fwnode_leak

Jason
Kalle Valo Jan. 11, 2023, 5:07 p.m. UTC | #7
Jason Gunthorpe <jgg@nvidia.com> writes:

> On Wed, Jan 11, 2023 at 06:02:13PM +0200, Kalle Valo wrote:
>> Jason Gunthorpe <jgg@nvidia.com> writes:
>> 
>> > On Tue, Jan 10, 2023 at 01:14:00PM +0100, Thomas Gleixner wrote:
>> >
>> >> Care to send a proper patch with changelog?
>> >
>> > Yes, I'll post it in a few days once the test team confirms it
>> 
>> I think I'm seeing the same leak and it's spamming logs on my test box a
>> lot. Let me know if you need any help with testing, I can do that pretty
>> quickly.
>
> https://github.com/jgunthorpe/linux/commits/msi_fwnode_leak

Nice, this fixes the issue for me. I don't see memleaks anymore while
running my ath11k regression tests. Thanks!

Tested-by: Kalle Valo <kvalo@kernel.org>
diff mbox series

Patch

--- a/arch/x86/include/asm/msi.h
+++ b/arch/x86/include/asm/msi.h
@@ -62,4 +62,10 @@  typedef struct x86_msi_addr_hi {
 struct msi_msg;
 u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid);
 
+#define X86_VECTOR_MSI_FLAGS_SUPPORTED					\
+	(MSI_GENERIC_FLAGS_MASK | MSI_FLAG_PCI_MSIX)
+
+#define X86_VECTOR_MSI_FLAGS_REQUIRED					\
+	(MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS)
+
 #endif /* _ASM_X86_MSI_H */
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -92,6 +92,7 @@  void pcibios_scan_root(int bus);
 struct irq_routing_table *pcibios_get_irq_routing_table(void);
 int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
 
+bool pci_dev_has_default_msi_parent_domain(struct pci_dev *dev);
 
 #define HAVE_PCI_MMAP
 #define arch_can_pci_mmap_wc()	pat_enabled()
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -142,67 +142,131 @@  msi_set_affinity(struct irq_data *irqd,
 	return ret;
 }
 
-/*
- * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
- * which implement the MSI or MSI-X Capability Structure.
+/**
+ * pci_dev_has_default_msi_parent_domain - Check whether the device has the default
+ *					   MSI parent domain associated
+ * @dev:	Pointer to the PCI device
  */
-static struct irq_chip pci_msi_controller = {
-	.name			= "PCI-MSI",
-	.irq_unmask		= pci_msi_unmask_irq,
-	.irq_mask		= pci_msi_mask_irq,
-	.irq_ack		= irq_chip_ack_parent,
-	.irq_retrigger		= irq_chip_retrigger_hierarchy,
-	.irq_set_affinity	= msi_set_affinity,
-	.flags			= IRQCHIP_SKIP_SET_WAKE |
-				  IRQCHIP_AFFINITY_PRE_STARTUP,
-};
+bool pci_dev_has_default_msi_parent_domain(struct pci_dev *dev)
+{
+	struct irq_domain *domain = dev_get_msi_domain(&dev->dev);
 
-int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
-		    msi_alloc_info_t *arg)
+	if (!domain)
+		domain = dev_get_msi_domain(&dev->bus->dev);
+	if (!domain)
+		return false;
+
+	return domain == x86_vector_domain;
+}
+
+/**
+ * x86_msi_prepare - Setup of msi_alloc_info_t for allocations
+ * @domain:	The domain for which this setup happens
+ * @dev:	The device for which interrupts are allocated
+ * @nvec:	The number of vectors to allocate
+ * @alloc:	The allocation info structure to initialize
+ *
+ * This function is to be used for all types of MSI domains above the x86
+ * vector domain and any intermediates. It is always invoked from the
+ * top level interrupt domain. The domain specific allocation
+ * functionality is determined via the @domain's bus token, which is
+ * mapped to the x86 specific allocation type.
+ */
+static int x86_msi_prepare(struct irq_domain *domain, struct device *dev,
+			   int nvec, msi_alloc_info_t *alloc)
 {
-	init_irq_alloc_info(arg, NULL);
-	if (to_pci_dev(dev)->msix_enabled)
-		arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX;
-	else
-		arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI;
+	struct msi_domain_info *info = domain->host_data;
 
-	return 0;
+	init_irq_alloc_info(alloc, NULL);
+
+	switch (info->bus_token) {
+	case DOMAIN_BUS_PCI_DEVICE_MSI:
+		alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSI;
+		return 0;
+	case DOMAIN_BUS_PCI_DEVICE_MSIX:
+		alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX;
+		return 0;
+	default:
+		return -EINVAL;
+	}
 }
-EXPORT_SYMBOL_GPL(pci_msi_prepare);
 
-static struct msi_domain_ops pci_msi_domain_ops = {
-	.msi_prepare	= pci_msi_prepare,
-};
+/**
+ * x86_init_dev_msi_info - Domain info setup for MSI domains
+ * @dev:		The device for which the domain should be created
+ * @domain:		The (root) domain providing this callback
+ * @real_parent:	The real parent domain of the domain to be initialized
+ * @info:		The domain info for the domain to be initialized
+ *
+ * This function is to be used for all types of MSI domains above the x86
+ * vector domain and any intermediates. The domain specific functionality
+ * is determined via the @real_parent.
+ */
+static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+				  struct irq_domain *real_parent, struct msi_domain_info *info)
+{
+	const struct msi_parent_ops *pops = real_parent->msi_parent_ops;
+
+	/* MSI parent domain specific settings */
+	switch (real_parent->bus_token) {
+	case DOMAIN_BUS_ANY:
+		/* Only the vector domain can have the ANY token */
+		if (WARN_ON_ONCE(domain != real_parent))
+			return false;
+		info->chip->irq_set_affinity = msi_set_affinity;
+		/* See msi_set_affinity() for the gory details */
+		info->flags |= MSI_FLAG_NOMASK_QUIRK;
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return false;
+	}
+
+	/* Is the target supported? */
+	switch(info->bus_token) {
+	case DOMAIN_BUS_PCI_DEVICE_MSI:
+	case DOMAIN_BUS_PCI_DEVICE_MSIX:
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return false;
+	}
+
+	/*
+	 * Mask out the domain specific MSI feature flags which are not
+	 * supported by the real parent.
+	 */
+	info->flags			&= pops->supported_flags;
+	/* Enforce the required flags */
+	info->flags			|= X86_VECTOR_MSI_FLAGS_REQUIRED;
+
+	/* This is always invoked from the top level MSI domain! */
+	info->ops->msi_prepare		= x86_msi_prepare;
+
+	info->chip->irq_ack		= irq_chip_ack_parent;
+	info->chip->irq_retrigger	= irq_chip_retrigger_hierarchy;
+	info->chip->flags		|= IRQCHIP_SKIP_SET_WAKE |
+					   IRQCHIP_AFFINITY_PRE_STARTUP;
 
-static struct msi_domain_info pci_msi_domain_info = {
-	.flags		= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
-			  MSI_FLAG_PCI_MSIX | MSI_FLAG_NOMASK_QUIRK,
-
-	.ops		= &pci_msi_domain_ops,
-	.chip		= &pci_msi_controller,
-	.handler	= handle_edge_irq,
-	.handler_name	= "edge",
+	info->handler			= handle_edge_irq;
+	info->handler_name		= "edge";
+
+	return true;
+}
+
+static const struct msi_parent_ops x86_vector_msi_parent_ops = {
+	.supported_flags	= X86_VECTOR_MSI_FLAGS_SUPPORTED,
+	.init_dev_msi_info	= x86_init_dev_msi_info,
 };
 
 struct irq_domain * __init native_create_pci_msi_domain(void)
 {
-	struct fwnode_handle *fn;
-	struct irq_domain *d;
-
 	if (disable_apic)
 		return NULL;
 
-	fn = irq_domain_alloc_named_fwnode("PCI-MSI");
-	if (!fn)
-		return NULL;
-
-	d = pci_msi_create_irq_domain(fn, &pci_msi_domain_info,
-				      x86_vector_domain);
-	if (!d) {
-		irq_domain_free_fwnode(fn);
-		pr_warn("Failed to initialize PCI-MSI irqdomain.\n");
-	}
-	return d;
+	x86_vector_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT;
+	x86_vector_domain->msi_parent_ops = &x86_vector_msi_parent_ops;
+	return x86_vector_domain;
 }
 
 void __init x86_create_pci_msi_domain(void)
@@ -210,7 +274,25 @@  void __init x86_create_pci_msi_domain(vo
 	x86_pci_msi_default_domain = x86_init.irqs.create_pci_msi_domain();
 }
 
+/* Keep around for hyperV and the remap code below */
+int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
+		    msi_alloc_info_t *arg)
+{
+	init_irq_alloc_info(arg, NULL);
+
+	if (to_pci_dev(dev)->msix_enabled)
+		arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX;
+	else
+		arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_msi_prepare);
+
 #ifdef CONFIG_IRQ_REMAP
+static struct msi_domain_ops pci_msi_domain_ops = {
+	.msi_prepare	= pci_msi_prepare,
+};
+
 static struct irq_chip pci_msi_ir_controller = {
 	.name			= "IR-PCI-MSI",
 	.irq_unmask		= pci_msi_unmask_irq,
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -812,7 +812,7 @@  static void
 amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu)
 {
 	if (!irq_remapping_enabled || !dev_is_pci(dev) ||
-	    pci_dev_has_special_msi_domain(to_pci_dev(dev)))
+	    !pci_dev_has_default_msi_parent_domain(to_pci_dev(dev)))
 		return;
 
 	dev_set_msi_domain(dev, iommu->msi_domain);
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -1107,7 +1107,7 @@  static int reenable_irq_remapping(int ei
  */
 void intel_irq_remap_add_device(struct dmar_pci_notify_info *info)
 {
-	if (!irq_remapping_enabled || pci_dev_has_special_msi_domain(info->dev))
+	if (!irq_remapping_enabled || !pci_dev_has_default_msi_parent_domain(info->dev))
 		return;
 
 	dev_set_msi_domain(&info->dev->dev, map_dev_to_ir(info->dev));