diff mbox series

[v9,5/7] PCI: mediatek-gen3: Add MSI support

Message ID 20210324030510.29177-6-jianjun.wang@mediatek.com (mailing list archive)
State Superseded
Delegated to: Lorenzo Pieralisi
Headers show
Series PCI: mediatek: Add new generation controller support | expand

Commit Message

Jianjun Wang (王建军) March 24, 2021, 3:05 a.m. UTC
Add MSI support for MediaTek Gen3 PCIe controller.

This PCIe controller supports up to 256 MSI vectors. The MSI hardware
block diagram is as follows:

                  +-----+
                  | GIC |
                  +-----+
                     ^
                     |
                 port->irq
                     |
             +-+-+-+-+-+-+-+-+
             |0|1|2|3|4|5|6|7| (PCIe intc)
             +-+-+-+-+-+-+-+-+
              ^ ^           ^
              | |    ...    |
      +-------+ +------+    +-----------+
      |                |                |
+-+-+---+--+--+  +-+-+---+--+--+  +-+-+---+--+--+
|0|1|...|30|31|  |0|1|...|30|31|  |0|1|...|30|31| (MSI sets)
+-+-+---+--+--+  +-+-+---+--+--+  +-+-+---+--+--+
 ^ ^      ^  ^    ^ ^      ^  ^    ^ ^      ^  ^
 | |      |  |    | |      |  |    | |      |  |  (MSI vectors)
 | |      |  |    | |      |  |    | |      |  |

  (MSI SET0)       (MSI SET1)  ...   (MSI SET7)

The 256 supported MSI vectors are organized into 8 sets. Each set has its
own address for MSI messages and supports 32 MSI vectors for generating
interrupts.

Signed-off-by: Jianjun Wang <jianjun.wang@mediatek.com>
Acked-by: Ryder Lee <ryder.lee@mediatek.com>
---
 drivers/pci/controller/pcie-mediatek-gen3.c | 276 ++++++++++++++++++++
 1 file changed, 276 insertions(+)

Comments

Marc Zyngier March 24, 2021, 4:18 p.m. UTC | #1
On Wed, 24 Mar 2021 03:05:08 +0000,
Jianjun Wang <jianjun.wang@mediatek.com> wrote:
> 
> Add MSI support for MediaTek Gen3 PCIe controller.
> 
> This PCIe controller supports up to 256 MSI vectors, the MSI hardware
> block diagram is as follows:
> 
>                   +-----+
>                   | GIC |
>                   +-----+
>                      ^
>                      |
>                  port->irq
>                      |
>              +-+-+-+-+-+-+-+-+
>              |0|1|2|3|4|5|6|7| (PCIe intc)
>              +-+-+-+-+-+-+-+-+
>               ^ ^           ^
>               | |    ...    |
>       +-------+ +------+    +-----------+
>       |                |                |
> +-+-+---+--+--+  +-+-+---+--+--+  +-+-+---+--+--+
> |0|1|...|30|31|  |0|1|...|30|31|  |0|1|...|30|31| (MSI sets)
> +-+-+---+--+--+  +-+-+---+--+--+  +-+-+---+--+--+
>  ^ ^      ^  ^    ^ ^      ^  ^    ^ ^      ^  ^
>  | |      |  |    | |      |  |    | |      |  |  (MSI vectors)
>  | |      |  |    | |      |  |    | |      |  |
> 
>   (MSI SET0)       (MSI SET1)  ...   (MSI SET7)
> 
> With 256 MSI vectors supported, the MSI vectors are composed of 8 sets,
> each set has its own address for MSI message, and supports 32 MSI vectors
> to generate interrupt.
> 
> Signed-off-by: Jianjun Wang <jianjun.wang@mediatek.com>
> Acked-by: Ryder Lee <ryder.lee@mediatek.com>

Reviewed-by: Marc Zyngier <maz@kernel.org>

	M.
Pali Rohár March 27, 2021, 7:28 p.m. UTC | #2
On Wednesday 24 March 2021 11:05:08 Jianjun Wang wrote:
> +static void mtk_pcie_msi_handler(struct mtk_pcie_port *port, int set_idx)
> +{
> +	struct mtk_msi_set *msi_set = &port->msi_sets[set_idx];
> +	unsigned long msi_enable, msi_status;
> +	unsigned int virq;
> +	irq_hw_number_t bit, hwirq;
> +
> +	msi_enable = readl_relaxed(msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
> +
> +	do {
> +		msi_status = readl_relaxed(msi_set->base +
> +					   PCIE_MSI_SET_STATUS_OFFSET);
> +		msi_status &= msi_enable;
> +		if (!msi_status)
> +			break;
> +
> +		for_each_set_bit(bit, &msi_status, PCIE_MSI_IRQS_PER_SET) {
> +			hwirq = bit + set_idx * PCIE_MSI_IRQS_PER_SET;
> +			virq = irq_find_mapping(port->msi_bottom_domain, hwirq);
> +			generic_handle_irq(virq);
> +		}
> +	} while (true);

Hello!

Just a question: can't this while-loop block the processing of other
interrupts?

I have done tests with different HW (aardvark) but with the same while(true)
loop logic. One XHCI PCIe controller was sending MSI interrupts too fast,
and the interrupt handler with this while(true) logic ended up in an infinite
loop. During a single IRQ it kept calling generic_handle_irq() without end,
as the HW kept feeding new MSI hwirqs into the status register.

But this is different HW, so it may behave differently and need not
suffer from the above issue.

I have just spotted same code pattern for processing MSI interrupts...

> +}
> +
>  static void mtk_pcie_irq_handler(struct irq_desc *desc)
>  {
>  	struct mtk_pcie_port *port = irq_desc_get_handler_data(desc);
> @@ -405,6 +673,14 @@ static void mtk_pcie_irq_handler(struct irq_desc *desc)
>  		generic_handle_irq(virq);
>  	}
>  
> +	irq_bit = PCIE_MSI_SHIFT;
> +	for_each_set_bit_from(irq_bit, &status, PCIE_MSI_SET_NUM +
> +			      PCIE_MSI_SHIFT) {
> +		mtk_pcie_msi_handler(port, irq_bit - PCIE_MSI_SHIFT);
> +
> +		writel_relaxed(BIT(irq_bit), port->base + PCIE_INT_STATUS_REG);
> +	}
> +
>  	chained_irq_exit(irqchip, desc);
>  }
>  
> -- 
> 2.25.1
>
Marc Zyngier March 27, 2021, 7:44 p.m. UTC | #3
On Sat, 27 Mar 2021 19:28:37 +0000,
Pali Rohár <pali@kernel.org> wrote:
> 
> On Wednesday 24 March 2021 11:05:08 Jianjun Wang wrote:
> > +static void mtk_pcie_msi_handler(struct mtk_pcie_port *port, int set_idx)
> > +{
> > +	struct mtk_msi_set *msi_set = &port->msi_sets[set_idx];
> > +	unsigned long msi_enable, msi_status;
> > +	unsigned int virq;
> > +	irq_hw_number_t bit, hwirq;
> > +
> > +	msi_enable = readl_relaxed(msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
> > +
> > +	do {
> > +		msi_status = readl_relaxed(msi_set->base +
> > +					   PCIE_MSI_SET_STATUS_OFFSET);
> > +		msi_status &= msi_enable;
> > +		if (!msi_status)
> > +			break;
> > +
> > +		for_each_set_bit(bit, &msi_status, PCIE_MSI_IRQS_PER_SET) {
> > +			hwirq = bit + set_idx * PCIE_MSI_IRQS_PER_SET;
> > +			virq = irq_find_mapping(port->msi_bottom_domain, hwirq);
> > +			generic_handle_irq(virq);
> > +		}
> > +	} while (true);
> 
> Hello!
> 
> Just a question, cannot this while-loop cause block of processing other
> interrupts?

This is a level interrupt. You don't have much choice but to handle it
immediately, although an alternative would be to mask it and deal with
it in a thread. And since Linux doesn't deal with interrupt priority,
a screaming interrupt is never a good thing.

> I have done tests with different HW (aardvark) but with same while(true)
> loop logic. One XHCI PCIe controller was sending MSI interrupts too fast
> and interrupt handler with this while(true) logic was in infinite loop.
> During one IRQ it was calling infinite many times generic_handle_irq()
> as HW was feeding new and new MSI hwirq into status register.

Define "too fast". If something in the system is able to program the
XHCI device in such a way that it causes a screaming interrupt, that's
the place to look for problems, and probably not in the interrupt
handling itself, which does what it is supposed to do.

> But this is different HW, so it can have different behavior and does not
> have to cause above issue.
> 
> I have just spotted same code pattern for processing MSI interrupts...

This is a common pattern that you will find in pretty much any
interrupt handling/demuxing, and is done this way when the cost of
taking the exception is high compared to that of handling it.

Which is pretty much any of the badly designed, level-driving,
DW-inspired, sorry excuse for MSI implementations that are popular on
low-end ARM SoCs.

Thanks,

	M.
Pali Rohár March 27, 2021, 8:29 p.m. UTC | #4
On Saturday 27 March 2021 19:44:30 Marc Zyngier wrote:
> On Sat, 27 Mar 2021 19:28:37 +0000,
> Pali Rohár <pali@kernel.org> wrote:
> > 
> > On Wednesday 24 March 2021 11:05:08 Jianjun Wang wrote:
> > > +static void mtk_pcie_msi_handler(struct mtk_pcie_port *port, int set_idx)
> > > +{
> > > +	struct mtk_msi_set *msi_set = &port->msi_sets[set_idx];
> > > +	unsigned long msi_enable, msi_status;
> > > +	unsigned int virq;
> > > +	irq_hw_number_t bit, hwirq;
> > > +
> > > +	msi_enable = readl_relaxed(msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
> > > +
> > > +	do {
> > > +		msi_status = readl_relaxed(msi_set->base +
> > > +					   PCIE_MSI_SET_STATUS_OFFSET);
> > > +		msi_status &= msi_enable;
> > > +		if (!msi_status)
> > > +			break;
> > > +
> > > +		for_each_set_bit(bit, &msi_status, PCIE_MSI_IRQS_PER_SET) {
> > > +			hwirq = bit + set_idx * PCIE_MSI_IRQS_PER_SET;
> > > +			virq = irq_find_mapping(port->msi_bottom_domain, hwirq);
> > > +			generic_handle_irq(virq);
> > > +		}
> > > +	} while (true);
> > 
> > Hello!
> > 
> > Just a question, cannot this while-loop cause block of processing other
> > interrupts?
> 
> This is a level interrupt. You don't have much choice but to handle it
> immediately, although an alternative would be to mask it and deal with
> it in a thread. And since Linux doesn't deal with interrupt priority,
> a screaming interrupt is never a good thing.

I see. Something like "interrupt priority" (which does not exist?) would
be needed to handle it.

> > I have done tests with different HW (aardvark) but with same while(true)
> > loop logic. One XHCI PCIe controller was sending MSI interrupts too fast
> > and interrupt handler with this while(true) logic was in infinite loop.
> > During one IRQ it was calling infinite many times generic_handle_irq()
> > as HW was feeding new and new MSI hwirq into status register.
> 
> Define "too fast".

Fast - the next interrupt arrives before the while(true)-loop checks whether it should stop.

> If something in the system is able to program the
> XHCI device in such a way that it causes a screaming interrupt, that's
> the place to look for problems, and probably not in the interrupt
> handling itself, which does what it is supposed to do.
> 
> > But this is different HW, so it can have different behavior and does not
> > have to cause above issue.
> > 
> > I have just spotted same code pattern for processing MSI interrupts...
> 
> This is a common pattern that you will find in pretty much any
> interrupt handling/demuxing, and is done this way when the cost of
> taking the exception is high compared to that of handling it.

And would it not help if the while(true)-loop were replaced by a loop with an
upper limit on iterations? Or if only a single iteration were run?

> Which is pretty much any of the badly designed, level-driving,
> DW-inspired, sorry excuse for MSI implementations that are popular on
> low-end ARM SoCs.

Ok. So thank you for information!

> Thanks,
> 
> 	M.
> 
> -- 
> Without deviation from the norm, progress is not possible.
Marc Zyngier March 27, 2021, 9:45 p.m. UTC | #5
On Sat, 27 Mar 2021 20:29:04 +0000,
Pali Rohár <pali@kernel.org> wrote:
> 
> On Saturday 27 March 2021 19:44:30 Marc Zyngier wrote:
> > On Sat, 27 Mar 2021 19:28:37 +0000,
> > Pali Rohár <pali@kernel.org> wrote:
> > > 
> > > On Wednesday 24 March 2021 11:05:08 Jianjun Wang wrote:
> > > > +static void mtk_pcie_msi_handler(struct mtk_pcie_port *port, int set_idx)
> > > > +{
> > > > +	struct mtk_msi_set *msi_set = &port->msi_sets[set_idx];
> > > > +	unsigned long msi_enable, msi_status;
> > > > +	unsigned int virq;
> > > > +	irq_hw_number_t bit, hwirq;
> > > > +
> > > > +	msi_enable = readl_relaxed(msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
> > > > +
> > > > +	do {
> > > > +		msi_status = readl_relaxed(msi_set->base +
> > > > +					   PCIE_MSI_SET_STATUS_OFFSET);
> > > > +		msi_status &= msi_enable;
> > > > +		if (!msi_status)
> > > > +			break;
> > > > +
> > > > +		for_each_set_bit(bit, &msi_status, PCIE_MSI_IRQS_PER_SET) {
> > > > +			hwirq = bit + set_idx * PCIE_MSI_IRQS_PER_SET;
> > > > +			virq = irq_find_mapping(port->msi_bottom_domain, hwirq);
> > > > +			generic_handle_irq(virq);
> > > > +		}
> > > > +	} while (true);
> > > 
> > > Hello!
> > > 
> > > Just a question, cannot this while-loop cause block of processing other
> > > interrupts?
> > 
> > This is a level interrupt. You don't have much choice but to handle it
> > immediately, although an alternative would be to mask it and deal with
> > it in a thread. And since Linux doesn't deal with interrupt priority,
> > a screaming interrupt is never a good thing.
> 
> I see. Something like "interrupt priority" (which does not exist?) would
> be needed to handle it.

Interrupt priorities definitely exist, but Linux doesn't use
them. Furthermore, this wouldn't be relevant here as you get a bunch
of MSIs multiplexed onto a single one. Where would you apply the
priority?

> 
> > > I have done tests with different HW (aardvark) but with same while(true)
> > > loop logic. One XHCI PCIe controller was sending MSI interrupts too fast
> > > and interrupt handler with this while(true) logic was in infinite loop.
> > > During one IRQ it was calling infinite many times generic_handle_irq()
> > > as HW was feeding new and new MSI hwirq into status register.
> > 
> > Define "too fast".
> 
> Fast - next interrupt comes prior checking if while(true)-loop should stop.

That's definitely not something you can easily fix at the interrupt
handling level. You need to prevent this from happening. That's
usually the result of a misprogramming or a HW bug.

> > If something in the system is able to program the
> > XHCI device in such a way that it causes a screaming interrupt, that's
> > the place to look for problems, and probably not in the interrupt
> > handling itself, which does what it is supposed to do.
> > 
> > > But this is different HW, so it can have different behavior and does not
> > > have to cause above issue.
> > > 
> > > I have just spotted same code pattern for processing MSI interrupts...
> > 
> > This is a common pattern that you will find in pretty much any
> > interrupt handling/demuxing, and is done this way when the cost of
> > taking the exception is high compared to that of handling it.
> 
> And would not help if while(true)-loop is replaced by loop with upper
> limit of iterations? Or just call only one iteration?

That wouldn't change much: you would still have the interrupt being
pending, and it would fire again at the earliest opportunity.

At best, the root interrupt controller is able to present you with
another interrupt before forcing you to deal with the one you have
ignored again. But you cannot rely on that either.

And to be honest, other interrupts are only a part of the problem you
are describing. With a screaming interrupt, you can't execute
userspace. This is as bad as it gets.

	M.
diff mbox series

Patch

diff --git a/drivers/pci/controller/pcie-mediatek-gen3.c b/drivers/pci/controller/pcie-mediatek-gen3.c
index ff91ad587461..ee1b51207d11 100644
--- a/drivers/pci/controller/pcie-mediatek-gen3.c
+++ b/drivers/pci/controller/pcie-mediatek-gen3.c
@@ -14,6 +14,7 @@ 
 #include <linux/irqdomain.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/msi.h>
 #include <linux/pci.h>
 #include <linux/phy/phy.h>
 #include <linux/platform_device.h>
@@ -48,12 +49,29 @@ 
 #define PCIE_LINK_STATUS_REG		0x154
 #define PCIE_PORT_LINKUP		BIT(8)
 
+#define PCIE_MSI_SET_NUM		8
+#define PCIE_MSI_IRQS_PER_SET		32
+#define PCIE_MSI_IRQS_NUM \
+	(PCIE_MSI_IRQS_PER_SET * PCIE_MSI_SET_NUM)
+
 #define PCIE_INT_ENABLE_REG		0x180
+#define PCIE_MSI_ENABLE			GENMASK(PCIE_MSI_SET_NUM + 8 - 1, 8)
+#define PCIE_MSI_SHIFT			8
 #define PCIE_INTX_SHIFT			24
 #define PCIE_INTX_ENABLE \
 	GENMASK(PCIE_INTX_SHIFT + PCI_NUM_INTX - 1, PCIE_INTX_SHIFT)
 
 #define PCIE_INT_STATUS_REG		0x184
+#define PCIE_MSI_SET_ENABLE_REG		0x190
+#define PCIE_MSI_SET_ENABLE		GENMASK(PCIE_MSI_SET_NUM - 1, 0)
+
+#define PCIE_MSI_SET_BASE_REG		0xc00
+#define PCIE_MSI_SET_OFFSET		0x10
+#define PCIE_MSI_SET_STATUS_OFFSET	0x04
+#define PCIE_MSI_SET_ENABLE_OFFSET	0x08
+
+#define PCIE_MSI_SET_ADDR_HI_BASE	0xc80
+#define PCIE_MSI_SET_ADDR_HI_OFFSET	0x04
 
 #define PCIE_TRANS_TABLE_BASE_REG	0x800
 #define PCIE_ATR_SRC_ADDR_MSB_OFFSET	0x4
@@ -73,6 +91,16 @@ 
 #define PCIE_ATR_TLP_TYPE_MEM		PCIE_ATR_TLP_TYPE(0)
 #define PCIE_ATR_TLP_TYPE_IO		PCIE_ATR_TLP_TYPE(2)
 
+/**
+ * struct mtk_msi_set - MSI information for each set
+ * @base: IO mapped register base
+ * @msg_addr: MSI message address
+ */
+struct mtk_msi_set {
+	void __iomem *base;
+	phys_addr_t msg_addr;
+};
+
 /**
  * struct mtk_pcie_port - PCIe port information
  * @dev: pointer to PCIe device
@@ -86,6 +114,11 @@ 
  * @irq: PCIe controller interrupt number
  * @irq_lock: lock protecting IRQ register access
  * @intx_domain: legacy INTx IRQ domain
+ * @msi_domain: MSI IRQ domain
+ * @msi_bottom_domain: MSI IRQ bottom domain
+ * @msi_sets: MSI sets information
+ * @lock: lock protecting IRQ bit map
+ * @msi_irq_in_use: bit map for assigned MSI IRQ
  */
 struct mtk_pcie_port {
 	struct device *dev;
@@ -100,6 +133,11 @@  struct mtk_pcie_port {
 	int irq;
 	raw_spinlock_t irq_lock;
 	struct irq_domain *intx_domain;
+	struct irq_domain *msi_domain;
+	struct irq_domain *msi_bottom_domain;
+	struct mtk_msi_set msi_sets[PCIE_MSI_SET_NUM];
+	struct mutex lock;
+	DECLARE_BITMAP(msi_irq_in_use, PCIE_MSI_IRQS_NUM);
 };
 
 /**
@@ -196,6 +234,35 @@  static int mtk_pcie_set_trans_table(struct mtk_pcie_port *port,
 	return 0;
 }
 
+static void mtk_pcie_enable_msi(struct mtk_pcie_port *port)
+{
+	int i;
+	u32 val;
+
+	for (i = 0; i < PCIE_MSI_SET_NUM; i++) {
+		struct mtk_msi_set *msi_set = &port->msi_sets[i];
+
+		msi_set->base = port->base + PCIE_MSI_SET_BASE_REG +
+				i * PCIE_MSI_SET_OFFSET;
+		msi_set->msg_addr = port->reg_base + PCIE_MSI_SET_BASE_REG +
+				    i * PCIE_MSI_SET_OFFSET;
+
+		/* Configure the MSI capture address */
+		writel_relaxed(lower_32_bits(msi_set->msg_addr), msi_set->base);
+		writel_relaxed(upper_32_bits(msi_set->msg_addr),
+			       port->base + PCIE_MSI_SET_ADDR_HI_BASE +
+			       i * PCIE_MSI_SET_ADDR_HI_OFFSET);
+	}
+
+	val = readl_relaxed(port->base + PCIE_MSI_SET_ENABLE_REG);
+	val |= PCIE_MSI_SET_ENABLE;
+	writel_relaxed(val, port->base + PCIE_MSI_SET_ENABLE_REG);
+
+	val = readl_relaxed(port->base + PCIE_INT_ENABLE_REG);
+	val |= PCIE_MSI_ENABLE;
+	writel_relaxed(val, port->base + PCIE_INT_ENABLE_REG);
+}
+
 static int mtk_pcie_startup_port(struct mtk_pcie_port *port)
 {
 	struct resource_entry *entry;
@@ -247,6 +314,8 @@  static int mtk_pcie_startup_port(struct mtk_pcie_port *port)
 		return err;
 	}
 
+	mtk_pcie_enable_msi(port);
+
 	/* Set PCIe translation windows */
 	resource_list_for_each_entry(entry, &host->windows) {
 		struct resource *res = entry->res;
@@ -289,6 +358,147 @@  static int mtk_pcie_set_affinity(struct irq_data *data,
 	return -EINVAL;
 }
 
+static void mtk_pcie_msi_irq_mask(struct irq_data *data)
+{
+	pci_msi_mask_irq(data);
+	irq_chip_mask_parent(data);
+}
+
+static void mtk_pcie_msi_irq_unmask(struct irq_data *data)
+{
+	pci_msi_unmask_irq(data);
+	irq_chip_unmask_parent(data);
+}
+
+static struct irq_chip mtk_msi_irq_chip = {
+	.irq_ack = irq_chip_ack_parent,
+	.irq_mask = mtk_pcie_msi_irq_mask,
+	.irq_unmask = mtk_pcie_msi_irq_unmask,
+	.name = "MSI",
+};
+
+static struct msi_domain_info mtk_msi_domain_info = {
+	.flags	= (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+		   MSI_FLAG_PCI_MSIX | MSI_FLAG_MULTI_PCI_MSI),
+	.chip	= &mtk_msi_irq_chip,
+};
+
+static void mtk_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+	struct mtk_msi_set *msi_set = irq_data_get_irq_chip_data(data);
+	struct mtk_pcie_port *port = data->domain->host_data;
+	unsigned long hwirq;
+
+	hwirq =	data->hwirq % PCIE_MSI_IRQS_PER_SET;
+
+	msg->address_hi = upper_32_bits(msi_set->msg_addr);
+	msg->address_lo = lower_32_bits(msi_set->msg_addr);
+	msg->data = hwirq;
+	dev_dbg(port->dev, "msi#%#lx address_hi %#x address_lo %#x data %d\n",
+		hwirq, msg->address_hi, msg->address_lo, msg->data);
+}
+
+static void mtk_msi_bottom_irq_ack(struct irq_data *data)
+{
+	struct mtk_msi_set *msi_set = irq_data_get_irq_chip_data(data);
+	unsigned long hwirq;
+
+	hwirq =	data->hwirq % PCIE_MSI_IRQS_PER_SET;
+
+	writel_relaxed(BIT(hwirq), msi_set->base + PCIE_MSI_SET_STATUS_OFFSET);
+}
+
+static void mtk_msi_bottom_irq_mask(struct irq_data *data)
+{
+	struct mtk_msi_set *msi_set = irq_data_get_irq_chip_data(data);
+	struct mtk_pcie_port *port = data->domain->host_data;
+	unsigned long hwirq, flags;
+	u32 val;
+
+	hwirq =	data->hwirq % PCIE_MSI_IRQS_PER_SET;
+
+	raw_spin_lock_irqsave(&port->irq_lock, flags);
+	val = readl_relaxed(msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
+	val &= ~BIT(hwirq);
+	writel_relaxed(val, msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
+	raw_spin_unlock_irqrestore(&port->irq_lock, flags);
+}
+
+static void mtk_msi_bottom_irq_unmask(struct irq_data *data)
+{
+	struct mtk_msi_set *msi_set = irq_data_get_irq_chip_data(data);
+	struct mtk_pcie_port *port = data->domain->host_data;
+	unsigned long hwirq, flags;
+	u32 val;
+
+	hwirq =	data->hwirq % PCIE_MSI_IRQS_PER_SET;
+
+	raw_spin_lock_irqsave(&port->irq_lock, flags);
+	val = readl_relaxed(msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
+	val |= BIT(hwirq);
+	writel_relaxed(val, msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
+	raw_spin_unlock_irqrestore(&port->irq_lock, flags);
+}
+
+static struct irq_chip mtk_msi_bottom_irq_chip = {
+	.irq_ack		= mtk_msi_bottom_irq_ack,
+	.irq_mask		= mtk_msi_bottom_irq_mask,
+	.irq_unmask		= mtk_msi_bottom_irq_unmask,
+	.irq_compose_msi_msg	= mtk_compose_msi_msg,
+	.irq_set_affinity	= mtk_pcie_set_affinity,
+	.name			= "MSI",
+};
+
+static int mtk_msi_bottom_domain_alloc(struct irq_domain *domain,
+				       unsigned int virq, unsigned int nr_irqs,
+				       void *arg)
+{
+	struct mtk_pcie_port *port = domain->host_data;
+	struct mtk_msi_set *msi_set;
+	int i, hwirq, set_idx;
+
+	mutex_lock(&port->lock);
+
+	hwirq = bitmap_find_free_region(port->msi_irq_in_use, PCIE_MSI_IRQS_NUM,
+					order_base_2(nr_irqs));
+
+	mutex_unlock(&port->lock);
+
+	if (hwirq < 0)
+		return -ENOSPC;
+
+	set_idx = hwirq / PCIE_MSI_IRQS_PER_SET;
+	msi_set = &port->msi_sets[set_idx];
+
+	for (i = 0; i < nr_irqs; i++)
+		irq_domain_set_info(domain, virq + i, hwirq + i,
+				    &mtk_msi_bottom_irq_chip, msi_set,
+				    handle_edge_irq, NULL, NULL);
+
+	return 0;
+}
+
+static void mtk_msi_bottom_domain_free(struct irq_domain *domain,
+				       unsigned int virq, unsigned int nr_irqs)
+{
+	struct mtk_pcie_port *port = domain->host_data;
+	struct irq_data *data = irq_domain_get_irq_data(domain, virq);
+
+	mutex_lock(&port->lock);
+
+	bitmap_release_region(port->msi_irq_in_use, data->hwirq,
+			      order_base_2(nr_irqs));
+
+	mutex_unlock(&port->lock);
+
+	irq_domain_free_irqs_common(domain, virq, nr_irqs);
+}
+
+static const struct irq_domain_ops mtk_msi_bottom_domain_ops = {
+	.alloc = mtk_msi_bottom_domain_alloc,
+	.free = mtk_msi_bottom_domain_free,
+};
+
 static void mtk_intx_mask(struct irq_data *data)
 {
 	struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data);
@@ -357,6 +567,7 @@  static int mtk_pcie_init_irq_domains(struct mtk_pcie_port *port)
 {
 	struct device *dev = port->dev;
 	struct device_node *intc_node, *node = dev->of_node;
+	int ret;
 
 	raw_spin_lock_init(&port->irq_lock);
 
@@ -374,7 +585,34 @@  static int mtk_pcie_init_irq_domains(struct mtk_pcie_port *port)
 		return -ENODEV;
 	}
 
+	/* Setup MSI */
+	mutex_init(&port->lock);
+
+	port->msi_bottom_domain = irq_domain_add_linear(node, PCIE_MSI_IRQS_NUM,
+				  &mtk_msi_bottom_domain_ops, port);
+	if (!port->msi_bottom_domain) {
+		dev_err(dev, "failed to create MSI bottom domain\n");
+		ret = -ENODEV;
+		goto err_msi_bottom_domain;
+	}
+
+	port->msi_domain = pci_msi_create_irq_domain(dev->fwnode,
+						     &mtk_msi_domain_info,
+						     port->msi_bottom_domain);
+	if (!port->msi_domain) {
+		dev_err(dev, "failed to create MSI domain\n");
+		ret = -ENODEV;
+		goto err_msi_domain;
+	}
+
 	return 0;
+
+err_msi_domain:
+	irq_domain_remove(port->msi_bottom_domain);
+err_msi_bottom_domain:
+	irq_domain_remove(port->intx_domain);
+
+	return ret;
 }
 
 static void mtk_pcie_irq_teardown(struct mtk_pcie_port *port)
@@ -384,9 +622,39 @@  static void mtk_pcie_irq_teardown(struct mtk_pcie_port *port)
 	if (port->intx_domain)
 		irq_domain_remove(port->intx_domain);
 
+	if (port->msi_domain)
+		irq_domain_remove(port->msi_domain);
+
+	if (port->msi_bottom_domain)
+		irq_domain_remove(port->msi_bottom_domain);
+
 	irq_dispose_mapping(port->irq);
 }
 
+static void mtk_pcie_msi_handler(struct mtk_pcie_port *port, int set_idx)
+{
+	struct mtk_msi_set *msi_set = &port->msi_sets[set_idx];
+	unsigned long msi_enable, msi_status;
+	unsigned int virq;
+	irq_hw_number_t bit, hwirq;
+
+	msi_enable = readl_relaxed(msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
+
+	do {
+		msi_status = readl_relaxed(msi_set->base +
+					   PCIE_MSI_SET_STATUS_OFFSET);
+		msi_status &= msi_enable;
+		if (!msi_status)
+			break;
+
+		for_each_set_bit(bit, &msi_status, PCIE_MSI_IRQS_PER_SET) {
+			hwirq = bit + set_idx * PCIE_MSI_IRQS_PER_SET;
+			virq = irq_find_mapping(port->msi_bottom_domain, hwirq);
+			generic_handle_irq(virq);
+		}
+	} while (true);
+}
+
 static void mtk_pcie_irq_handler(struct irq_desc *desc)
 {
 	struct mtk_pcie_port *port = irq_desc_get_handler_data(desc);
@@ -405,6 +673,14 @@  static void mtk_pcie_irq_handler(struct irq_desc *desc)
 		generic_handle_irq(virq);
 	}
 
+	irq_bit = PCIE_MSI_SHIFT;
+	for_each_set_bit_from(irq_bit, &status, PCIE_MSI_SET_NUM +
+			      PCIE_MSI_SHIFT) {
+		mtk_pcie_msi_handler(port, irq_bit - PCIE_MSI_SHIFT);
+
+		writel_relaxed(BIT(irq_bit), port->base + PCIE_INT_STATUS_REG);
+	}
+
 	chained_irq_exit(irqchip, desc);
 }