diff mbox series

[v8,5/7] PCI: mediatek-gen3: Add MSI support

Message ID 20210224061132.26526-6-jianjun.wang@mediatek.com (mailing list archive)
State New, archived
Headers show
Series PCI: mediatek: Add new generation controller support | expand

Commit Message

Jianjun Wang (王建军) Feb. 24, 2021, 6:11 a.m. UTC
Add MSI support for MediaTek Gen3 PCIe controller.

This PCIe controller supports up to 256 MSI vectors, the MSI hardware
block diagram is as follows:

                  +-----+
                  | GIC |
                  +-----+
                     ^
                     |
                 port->irq
                     |
             +-+-+-+-+-+-+-+-+
             |0|1|2|3|4|5|6|7| (PCIe intc)
             +-+-+-+-+-+-+-+-+
              ^ ^           ^
              | |    ...    |
      +-------+ +------+    +-----------+
      |                |                |
+-+-+---+--+--+  +-+-+---+--+--+  +-+-+---+--+--+
|0|1|...|30|31|  |0|1|...|30|31|  |0|1|...|30|31| (MSI sets)
+-+-+---+--+--+  +-+-+---+--+--+  +-+-+---+--+--+
 ^ ^      ^  ^    ^ ^      ^  ^    ^ ^      ^  ^
 | |      |  |    | |      |  |    | |      |  |  (MSI vectors)
 | |      |  |    | |      |  |    | |      |  |

  (MSI SET0)       (MSI SET1)  ...   (MSI SET7)

With 256 MSI vectors supported, the MSI vectors are composed of 8 sets,
each set has its own address for MSI message, and supports 32 MSI vectors
to generate interrupt.

Signed-off-by: Jianjun Wang <jianjun.wang@mediatek.com>
Acked-by: Ryder Lee <ryder.lee@mediatek.com>
---
 drivers/pci/controller/pcie-mediatek-gen3.c | 277 ++++++++++++++++++++
 1 file changed, 277 insertions(+)

Comments

Krzysztof Wilczyński Feb. 24, 2021, 2:31 p.m. UTC | #1
Hi Jianjun,

[...]
> +static struct irq_chip mtk_msi_irq_chip = {
> +	.name = "MSI",
> +	.irq_enable = mtk_pcie_irq_unmask,
> +	.irq_disable = mtk_pcie_irq_mask,
> +	.irq_ack = irq_chip_ack_parent,
> +	.irq_mask = mtk_pcie_irq_mask,
> +	.irq_unmask = mtk_pcie_irq_unmask,
> +};

For consistency's sake, what about aligning this the same way
struct mtk_msi_bottom_irq_chip is aligned?  See immediately below.

[...]
> +static struct irq_chip mtk_msi_bottom_irq_chip = {
> +	.irq_ack		= mtk_msi_bottom_irq_ack,
> +	.irq_mask		= mtk_msi_bottom_irq_mask,
> +	.irq_unmask		= mtk_msi_bottom_irq_unmask,
> +	.irq_compose_msi_msg	= mtk_compose_msi_msg,
> +	.irq_set_affinity	= mtk_pcie_set_affinity,
> +	.name			= "MSI",
> +};

Krzysztof
Jianjun Wang (王建军) Feb. 25, 2021, 3:09 a.m. UTC | #2
Hi Krzysztof,

Thanks for your review; I will fix it in the next version.

On Wed, 2021-02-24 at 15:31 +0100, Krzysztof Wilczyński wrote:
> Hi Jianjun,
> 
> [...]
> > +static struct irq_chip mtk_msi_irq_chip = {
> > +	.name = "MSI",
> > +	.irq_enable = mtk_pcie_irq_unmask,
> > +	.irq_disable = mtk_pcie_irq_mask,
> > +	.irq_ack = irq_chip_ack_parent,
> > +	.irq_mask = mtk_pcie_irq_mask,
> > +	.irq_unmask = mtk_pcie_irq_unmask,
> > +};
> 
> For consistency sake, what about aligning this like the
> struct mtk_msi_bottom_irq_chip has been?  See immediately below.
> 
> [...]
> > +static struct irq_chip mtk_msi_bottom_irq_chip = {
> > +	.irq_ack		= mtk_msi_bottom_irq_ack,
> > +	.irq_mask		= mtk_msi_bottom_irq_mask,
> > +	.irq_unmask		= mtk_msi_bottom_irq_unmask,
> > +	.irq_compose_msi_msg	= mtk_compose_msi_msg,
> > +	.irq_set_affinity	= mtk_pcie_set_affinity,
> > +	.name			= "MSI",
> > +};
> 
> Krzysztof

Thanks.
Marc Zyngier March 9, 2021, 11:23 a.m. UTC | #3
On Wed, 24 Feb 2021 06:11:30 +0000,
Jianjun Wang <jianjun.wang@mediatek.com> wrote:
> 
> Add MSI support for MediaTek Gen3 PCIe controller.
> 
> This PCIe controller supports up to 256 MSI vectors, the MSI hardware
> block diagram is as follows:
> 
>                   +-----+
>                   | GIC |
>                   +-----+
>                      ^
>                      |
>                  port->irq
>                      |
>              +-+-+-+-+-+-+-+-+
>              |0|1|2|3|4|5|6|7| (PCIe intc)
>              +-+-+-+-+-+-+-+-+
>               ^ ^           ^
>               | |    ...    |
>       +-------+ +------+    +-----------+
>       |                |                |
> +-+-+---+--+--+  +-+-+---+--+--+  +-+-+---+--+--+
> |0|1|...|30|31|  |0|1|...|30|31|  |0|1|...|30|31| (MSI sets)
> +-+-+---+--+--+  +-+-+---+--+--+  +-+-+---+--+--+
>  ^ ^      ^  ^    ^ ^      ^  ^    ^ ^      ^  ^
>  | |      |  |    | |      |  |    | |      |  |  (MSI vectors)
>  | |      |  |    | |      |  |    | |      |  |
> 
>   (MSI SET0)       (MSI SET1)  ...   (MSI SET7)
> 
> With 256 MSI vectors supported, the MSI vectors are composed of 8 sets,
> each set has its own address for MSI message, and supports 32 MSI vectors
> to generate interrupt.
> 
> Signed-off-by: Jianjun Wang <jianjun.wang@mediatek.com>
> Acked-by: Ryder Lee <ryder.lee@mediatek.com>
> ---
>  drivers/pci/controller/pcie-mediatek-gen3.c | 277 ++++++++++++++++++++
>  1 file changed, 277 insertions(+)
> 
> diff --git a/drivers/pci/controller/pcie-mediatek-gen3.c b/drivers/pci/controller/pcie-mediatek-gen3.c
> index 8b3b5f838b69..3cbec22ece0c 100644
> --- a/drivers/pci/controller/pcie-mediatek-gen3.c
> +++ b/drivers/pci/controller/pcie-mediatek-gen3.c
> @@ -14,6 +14,7 @@
>  #include <linux/irqdomain.h>
>  #include <linux/kernel.h>
>  #include <linux/module.h>
> +#include <linux/msi.h>
>  #include <linux/pci.h>
>  #include <linux/phy/phy.h>
>  #include <linux/platform_device.h>
> @@ -48,12 +49,29 @@
>  #define PCIE_LINK_STATUS_REG		0x154
>  #define PCIE_PORT_LINKUP		BIT(8)
>  
> +#define PCIE_MSI_SET_NUM		8
> +#define PCIE_MSI_IRQS_PER_SET		32
> +#define PCIE_MSI_IRQS_NUM \
> +	(PCIE_MSI_IRQS_PER_SET * PCIE_MSI_SET_NUM)
> +
>  #define PCIE_INT_ENABLE_REG		0x180
> +#define PCIE_MSI_ENABLE			GENMASK(PCIE_MSI_SET_NUM + 8 - 1, 8)
> +#define PCIE_MSI_SHIFT			8
>  #define PCIE_INTX_SHIFT			24
>  #define PCIE_INTX_ENABLE \
>  	GENMASK(PCIE_INTX_SHIFT + PCI_NUM_INTX - 1, PCIE_INTX_SHIFT)
>  
>  #define PCIE_INT_STATUS_REG		0x184
> +#define PCIE_MSI_SET_ENABLE_REG		0x190
> +#define PCIE_MSI_SET_ENABLE		GENMASK(PCIE_MSI_SET_NUM - 1, 0)
> +
> +#define PCIE_MSI_SET_BASE_REG		0xc00
> +#define PCIE_MSI_SET_OFFSET		0x10
> +#define PCIE_MSI_SET_STATUS_OFFSET	0x04
> +#define PCIE_MSI_SET_ENABLE_OFFSET	0x08
> +
> +#define PCIE_MSI_SET_ADDR_HI_BASE	0xc80
> +#define PCIE_MSI_SET_ADDR_HI_OFFSET	0x04
>  
>  #define PCIE_TRANS_TABLE_BASE_REG	0x800
>  #define PCIE_ATR_SRC_ADDR_MSB_OFFSET	0x4
> @@ -73,6 +91,16 @@
>  #define PCIE_ATR_TLP_TYPE_MEM		PCIE_ATR_TLP_TYPE(0)
>  #define PCIE_ATR_TLP_TYPE_IO		PCIE_ATR_TLP_TYPE(2)
>  
> +/**
> + * struct mtk_pcie_msi - MSI information for each set
> + * @base: IO mapped register base
> + * @msg_addr: MSI message address
> + */
> +struct mtk_msi_set {
> +	void __iomem *base;
> +	phys_addr_t msg_addr;
> +};
> +
>  /**
>   * struct mtk_pcie_port - PCIe port information
>   * @dev: pointer to PCIe device
> @@ -86,6 +114,11 @@
>   * @irq: PCIe controller interrupt number
>   * @irq_lock: lock protecting IRQ register access
>   * @intx_domain: legacy INTx IRQ domain
> + * @msi_domain: MSI IRQ domain
> + * @msi_bottom_domain: MSI IRQ bottom domain
> + * @msi_sets: MSI sets information
> + * @lock: lock protecting IRQ bit map
> + * @msi_irq_in_use: bit map for assigned MSI IRQ
>   */
>  struct mtk_pcie_port {
>  	struct device *dev;
> @@ -100,6 +133,11 @@ struct mtk_pcie_port {
>  	int irq;
>  	raw_spinlock_t irq_lock;
>  	struct irq_domain *intx_domain;
> +	struct irq_domain *msi_domain;
> +	struct irq_domain *msi_bottom_domain;
> +	struct mtk_msi_set msi_sets[PCIE_MSI_SET_NUM];
> +	struct mutex lock;
> +	DECLARE_BITMAP(msi_irq_in_use, PCIE_MSI_IRQS_NUM);
>  };
>  
>  /**
> @@ -197,6 +235,35 @@ static int mtk_pcie_set_trans_table(struct mtk_pcie_port *port,
>  	return 0;
>  }
>  
> +static void mtk_pcie_enable_msi(struct mtk_pcie_port *port)
> +{
> +	int i;
> +	u32 val;
> +
> +	val = readl_relaxed(port->base + PCIE_MSI_SET_ENABLE_REG);
> +	val |= PCIE_MSI_SET_ENABLE;
> +	writel_relaxed(val, port->base + PCIE_MSI_SET_ENABLE_REG);
> +
> +	val = readl_relaxed(port->base + PCIE_INT_ENABLE_REG);
> +	val |= PCIE_MSI_ENABLE;
> +	writel_relaxed(val, port->base + PCIE_INT_ENABLE_REG);

Shouldn't you configure the capture addresses *before* enabling
things? Is there any need for locking here, given that you are
modifying global registers?

> +
> +	for (i = 0; i < PCIE_MSI_SET_NUM; i++) {
> +		struct mtk_msi_set *msi_set = &port->msi_sets[i];
> +
> +		msi_set->base = port->base + PCIE_MSI_SET_BASE_REG +
> +				i * PCIE_MSI_SET_OFFSET;
> +		msi_set->msg_addr = port->reg_base + PCIE_MSI_SET_BASE_REG +
> +				    i * PCIE_MSI_SET_OFFSET;
> +
> +		/* Configure the MSI capture address */
> +		writel_relaxed(lower_32_bits(msi_set->msg_addr), msi_set->base);
> +		writel_relaxed(upper_32_bits(msi_set->msg_addr),
> +			       port->base + PCIE_MSI_SET_ADDR_HI_BASE +
> +			       i * PCIE_MSI_SET_ADDR_HI_OFFSET);
> +	}
> +}
> +
>  static int mtk_pcie_startup_port(struct mtk_pcie_port *port)
>  {
>  	struct resource_entry *entry;
> @@ -247,6 +314,8 @@ static int mtk_pcie_startup_port(struct mtk_pcie_port *port)
>  		return err;
>  	}
>  
> +	mtk_pcie_enable_msi(port);
> +
>  	/* Set PCIe translation windows */
>  	resource_list_for_each_entry(entry, &host->windows) {
>  		struct resource *res = entry->res;
> @@ -290,6 +359,148 @@ static int mtk_pcie_set_affinity(struct irq_data *data,
>  	return -EINVAL;
>  }
>  
> +static void mtk_pcie_irq_mask(struct irq_data *data)

It'd be good if you used _msi_ in function names that deal with MSIs.

> +{
> +	pci_msi_mask_irq(data);
> +	irq_chip_mask_parent(data);
> +}
> +
> +static void mtk_pcie_irq_unmask(struct irq_data *data)
> +{
> +	pci_msi_unmask_irq(data);
> +	irq_chip_unmask_parent(data);
> +}
> +
> +static struct irq_chip mtk_msi_irq_chip = {
> +	.name = "MSI",
> +	.irq_enable = mtk_pcie_irq_unmask,
> +	.irq_disable = mtk_pcie_irq_mask,

Same comment as for the previous patch: enable/disable serve no
purpose here.

> +	.irq_ack = irq_chip_ack_parent,
> +	.irq_mask = mtk_pcie_irq_mask,
> +	.irq_unmask = mtk_pcie_irq_unmask,
> +};
> +
> +static struct msi_domain_info mtk_msi_domain_info = {
> +	.flags	= (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_PCI_MSIX |
> +		   MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI),

minor nit: keep the *_OPS flag on one line, and the *_PCI_* flags on
another.

> +	.chip	= &mtk_msi_irq_chip,
> +};
> +
> +static void mtk_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
> +{
> +	struct mtk_msi_set *msi_set = irq_data_get_irq_chip_data(data);
> +	struct mtk_pcie_port *port = data->domain->host_data;
> +	unsigned long hwirq;
> +
> +	hwirq =	data->hwirq % PCIE_MSI_IRQS_PER_SET;
> +
> +	msg->address_hi = upper_32_bits(msi_set->msg_addr);
> +	msg->address_lo = lower_32_bits(msi_set->msg_addr);
> +	msg->data = hwirq;
> +	dev_dbg(port->dev, "msi#%#lx address_hi %#x address_lo %#x data %d\n",
> +		hwirq, msg->address_hi, msg->address_lo, msg->data);
> +}
> +
> +static void mtk_msi_bottom_irq_ack(struct irq_data *data)
> +{
> +	struct mtk_msi_set *msi_set = irq_data_get_irq_chip_data(data);
> +	unsigned long hwirq;
> +
> +	hwirq =	data->hwirq % PCIE_MSI_IRQS_PER_SET;
> +
> +	writel_relaxed(BIT(hwirq), msi_set->base + PCIE_MSI_SET_STATUS_OFFSET);
> +}
> +
> +static void mtk_msi_bottom_irq_mask(struct irq_data *data)
> +{
> +	struct mtk_msi_set *msi_set = irq_data_get_irq_chip_data(data);
> +	struct mtk_pcie_port *port = data->domain->host_data;
> +	unsigned long hwirq, flags;
> +	u32 val;
> +
> +	hwirq =	data->hwirq % PCIE_MSI_IRQS_PER_SET;
> +
> +	raw_spin_lock_irqsave(&port->irq_lock, flags);
> +	val = readl_relaxed(msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
> +	val &= ~BIT(hwirq);
> +	writel_relaxed(val, msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
> +	raw_spin_unlock_irqrestore(&port->irq_lock, flags);
> +}
> +
> +static void mtk_msi_bottom_irq_unmask(struct irq_data *data)
> +{
> +	struct mtk_msi_set *msi_set = irq_data_get_irq_chip_data(data);
> +	struct mtk_pcie_port *port = data->domain->host_data;
> +	unsigned long hwirq, flags;
> +	u32 val;
> +
> +	hwirq =	data->hwirq % PCIE_MSI_IRQS_PER_SET;
> +
> +	raw_spin_lock_irqsave(&port->irq_lock, flags);
> +	val = readl_relaxed(msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
> +	val |= BIT(hwirq);
> +	writel_relaxed(val, msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
> +	raw_spin_unlock_irqrestore(&port->irq_lock, flags);
> +}
> +
> +static struct irq_chip mtk_msi_bottom_irq_chip = {
> +	.irq_ack		= mtk_msi_bottom_irq_ack,
> +	.irq_mask		= mtk_msi_bottom_irq_mask,
> +	.irq_unmask		= mtk_msi_bottom_irq_unmask,
> +	.irq_compose_msi_msg	= mtk_compose_msi_msg,
> +	.irq_set_affinity	= mtk_pcie_set_affinity,
> +	.name			= "MSI",
> +};
> +
> +static int mtk_msi_bottom_domain_alloc(struct irq_domain *domain,
> +				       unsigned int virq, unsigned int nr_irqs,
> +				       void *arg)
> +{
> +	struct mtk_pcie_port *port = domain->host_data;
> +	struct mtk_msi_set *msi_set;
> +	int i, hwirq, set_idx;
> +
> +	mutex_lock(&port->lock);
> +
> +	hwirq = bitmap_find_free_region(port->msi_irq_in_use, PCIE_MSI_IRQS_NUM,
> +					order_base_2(nr_irqs));
> +
> +	mutex_unlock(&port->lock);
> +
> +	if (hwirq < 0)
> +		return -ENOSPC;
> +
> +	set_idx = hwirq / PCIE_MSI_IRQS_PER_SET;
> +	msi_set = &port->msi_sets[set_idx];
> +
> +	for (i = 0; i < nr_irqs; i++)
> +		irq_domain_set_info(domain, virq + i, hwirq + i,
> +				    &mtk_msi_bottom_irq_chip, msi_set,
> +				    handle_edge_irq, NULL, NULL);
> +
> +	return 0;
> +}
> +
> +static void mtk_msi_bottom_domain_free(struct irq_domain *domain,
> +				       unsigned int virq, unsigned int nr_irqs)
> +{
> +	struct mtk_pcie_port *port = domain->host_data;
> +	struct irq_data *data = irq_domain_get_irq_data(domain, virq);
> +
> +	mutex_lock(&port->lock);
> +
> +	bitmap_clear(port->msi_irq_in_use, data->hwirq, nr_irqs);
> +
> +	mutex_unlock(&port->lock);
> +
> +	irq_domain_free_irqs_common(domain, virq, nr_irqs);
> +}
> +
> +static const struct irq_domain_ops mtk_msi_bottom_domain_ops = {
> +	.alloc = mtk_msi_bottom_domain_alloc,
> +	.free = mtk_msi_bottom_domain_free,
> +};
> +
>  static void mtk_intx_mask(struct irq_data *data)
>  {
>  	struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data);
> @@ -360,6 +571,7 @@ static int mtk_pcie_init_irq_domains(struct mtk_pcie_port *port)
>  {
>  	struct device *dev = port->dev;
>  	struct device_node *intc_node, *node = dev->of_node;
> +	int ret;
>  
>  	raw_spin_lock_init(&port->irq_lock);
>  
> @@ -377,7 +589,34 @@ static int mtk_pcie_init_irq_domains(struct mtk_pcie_port *port)
>  		return -ENODEV;
>  	}
>  
> +	/* Setup MSI */
> +	mutex_init(&port->lock);
> +
> +	port->msi_bottom_domain = irq_domain_add_linear(node, PCIE_MSI_IRQS_NUM,
> +				  &mtk_msi_bottom_domain_ops, port);
> +	if (!port->msi_bottom_domain) {
> +		dev_info(dev, "failed to create MSI bottom domain\n");
> +		ret = -ENODEV;
> +		goto err_msi_bottom_domain;
> +	}
> +
> +	port->msi_domain = pci_msi_create_irq_domain(dev->fwnode,
> +						     &mtk_msi_domain_info,
> +						     port->msi_bottom_domain);
> +	if (!port->msi_domain) {
> +		dev_info(dev, "failed to create MSI domain\n");
> +		ret = -ENODEV;
> +		goto err_msi_domain;
> +	}
> +
>  	return 0;
> +
> +err_msi_domain:
> +	irq_domain_remove(port->msi_bottom_domain);
> +err_msi_bottom_domain:
> +	irq_domain_remove(port->intx_domain);
> +
> +	return ret;
>  }
>  
>  static void mtk_pcie_irq_teardown(struct mtk_pcie_port *port)
> @@ -387,9 +626,39 @@ static void mtk_pcie_irq_teardown(struct mtk_pcie_port *port)
>  	if (port->intx_domain)
>  		irq_domain_remove(port->intx_domain);
>  
> +	if (port->msi_domain)
> +		irq_domain_remove(port->msi_domain);
> +
> +	if (port->msi_bottom_domain)
> +		irq_domain_remove(port->msi_bottom_domain);
> +
>  	irq_dispose_mapping(port->irq);
>  }
>  
> +static void mtk_pcie_msi_handler(struct mtk_pcie_port *port, int set_idx)
> +{
> +	struct mtk_msi_set *msi_set = &port->msi_sets[set_idx];
> +	unsigned long msi_enable, msi_status;
> +	unsigned int virq;
> +	irq_hw_number_t bit, hwirq;
> +
> +	msi_enable = readl_relaxed(msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
> +
> +	do {
> +		msi_status = readl_relaxed(msi_set->base +
> +					   PCIE_MSI_SET_STATUS_OFFSET);
> +		msi_status &= msi_enable;
> +		if (!msi_status)
> +			break;
> +
> +		for_each_set_bit(bit, &msi_status, PCIE_MSI_IRQS_PER_SET) {
> +			hwirq = bit + set_idx * PCIE_MSI_IRQS_PER_SET;
> +			virq = irq_find_mapping(port->msi_bottom_domain, hwirq);
> +			generic_handle_irq(virq);
> +		}
> +	} while (true);
> +}
> +
>  static void mtk_pcie_irq_handler(struct irq_desc *desc)
>  {
>  	struct mtk_pcie_port *port = irq_desc_get_handler_data(desc);
> @@ -408,6 +677,14 @@ static void mtk_pcie_irq_handler(struct irq_desc *desc)
>  		generic_handle_irq(virq);
>  	}
>  
> +	irq_bit = PCIE_MSI_SHIFT;
> +	for_each_set_bit_from(irq_bit, &status, PCIE_MSI_SET_NUM +
> +			      PCIE_MSI_SHIFT) {
> +		mtk_pcie_msi_handler(port, irq_bit - PCIE_MSI_SHIFT);
> +
> +		writel_relaxed(BIT(irq_bit), port->base + PCIE_INT_STATUS_REG);

Isn't this write the same thing you have for EOI in the INTx case?
While I could understand your description in that case (this is a
resampling operation), I don't get what this does here. Either this is
also an EOI, but your initial description doesn't make sense, or it is
an Ack, and it should be moved to the right place.

Which one is it?

Thanks,

	M.
Jianjun Wang (王建军) March 10, 2021, 6:48 a.m. UTC | #4
Hi Marc,

Thanks for your review.

On Tue, 2021-03-09 at 11:23 +0000, Marc Zyngier wrote:
> On Wed, 24 Feb 2021 06:11:30 +0000,
> Jianjun Wang <jianjun.wang@mediatek.com> wrote:
> > 
> > Add MSI support for MediaTek Gen3 PCIe controller.
> > 
> > This PCIe controller supports up to 256 MSI vectors, the MSI hardware
> > block diagram is as follows:
> > 
> >                   +-----+
> >                   | GIC |
> >                   +-----+
> >                      ^
> >                      |
> >                  port->irq
> >                      |
> >              +-+-+-+-+-+-+-+-+
> >              |0|1|2|3|4|5|6|7| (PCIe intc)
> >              +-+-+-+-+-+-+-+-+
> >               ^ ^           ^
> >               | |    ...    |
> >       +-------+ +------+    +-----------+
> >       |                |                |
> > +-+-+---+--+--+  +-+-+---+--+--+  +-+-+---+--+--+
> > |0|1|...|30|31|  |0|1|...|30|31|  |0|1|...|30|31| (MSI sets)
> > +-+-+---+--+--+  +-+-+---+--+--+  +-+-+---+--+--+
> >  ^ ^      ^  ^    ^ ^      ^  ^    ^ ^      ^  ^
> >  | |      |  |    | |      |  |    | |      |  |  (MSI vectors)
> >  | |      |  |    | |      |  |    | |      |  |
> > 
> >   (MSI SET0)       (MSI SET1)  ...   (MSI SET7)
> > 
> > With 256 MSI vectors supported, the MSI vectors are composed of 8 sets,
> > each set has its own address for MSI message, and supports 32 MSI vectors
> > to generate interrupt.
> > 
> > Signed-off-by: Jianjun Wang <jianjun.wang@mediatek.com>
> > Acked-by: Ryder Lee <ryder.lee@mediatek.com>
> > ---
> >  drivers/pci/controller/pcie-mediatek-gen3.c | 277 ++++++++++++++++++++
> >  1 file changed, 277 insertions(+)
> > 
> > diff --git a/drivers/pci/controller/pcie-mediatek-gen3.c b/drivers/pci/controller/pcie-mediatek-gen3.c
> > index 8b3b5f838b69..3cbec22ece0c 100644
> > --- a/drivers/pci/controller/pcie-mediatek-gen3.c
> > +++ b/drivers/pci/controller/pcie-mediatek-gen3.c
> > @@ -14,6 +14,7 @@
> >  #include <linux/irqdomain.h>
> >  #include <linux/kernel.h>
> >  #include <linux/module.h>
> > +#include <linux/msi.h>
> >  #include <linux/pci.h>
> >  #include <linux/phy/phy.h>
> >  #include <linux/platform_device.h>
> > @@ -48,12 +49,29 @@
> >  #define PCIE_LINK_STATUS_REG		0x154
> >  #define PCIE_PORT_LINKUP		BIT(8)
> >  
> > +#define PCIE_MSI_SET_NUM		8
> > +#define PCIE_MSI_IRQS_PER_SET		32
> > +#define PCIE_MSI_IRQS_NUM \
> > +	(PCIE_MSI_IRQS_PER_SET * PCIE_MSI_SET_NUM)
> > +
> >  #define PCIE_INT_ENABLE_REG		0x180
> > +#define PCIE_MSI_ENABLE			GENMASK(PCIE_MSI_SET_NUM + 8 - 1, 8)
> > +#define PCIE_MSI_SHIFT			8
> >  #define PCIE_INTX_SHIFT			24
> >  #define PCIE_INTX_ENABLE \
> >  	GENMASK(PCIE_INTX_SHIFT + PCI_NUM_INTX - 1, PCIE_INTX_SHIFT)
> >  
> >  #define PCIE_INT_STATUS_REG		0x184
> > +#define PCIE_MSI_SET_ENABLE_REG		0x190
> > +#define PCIE_MSI_SET_ENABLE		GENMASK(PCIE_MSI_SET_NUM - 1, 0)
> > +
> > +#define PCIE_MSI_SET_BASE_REG		0xc00
> > +#define PCIE_MSI_SET_OFFSET		0x10
> > +#define PCIE_MSI_SET_STATUS_OFFSET	0x04
> > +#define PCIE_MSI_SET_ENABLE_OFFSET	0x08
> > +
> > +#define PCIE_MSI_SET_ADDR_HI_BASE	0xc80
> > +#define PCIE_MSI_SET_ADDR_HI_OFFSET	0x04
> >  
> >  #define PCIE_TRANS_TABLE_BASE_REG	0x800
> >  #define PCIE_ATR_SRC_ADDR_MSB_OFFSET	0x4
> > @@ -73,6 +91,16 @@
> >  #define PCIE_ATR_TLP_TYPE_MEM		PCIE_ATR_TLP_TYPE(0)
> >  #define PCIE_ATR_TLP_TYPE_IO		PCIE_ATR_TLP_TYPE(2)
> >  
> > +/**
> > + * struct mtk_pcie_msi - MSI information for each set
> > + * @base: IO mapped register base
> > + * @msg_addr: MSI message address
> > + */
> > +struct mtk_msi_set {
> > +	void __iomem *base;
> > +	phys_addr_t msg_addr;
> > +};
> > +
> >  /**
> >   * struct mtk_pcie_port - PCIe port information
> >   * @dev: pointer to PCIe device
> > @@ -86,6 +114,11 @@
> >   * @irq: PCIe controller interrupt number
> >   * @irq_lock: lock protecting IRQ register access
> >   * @intx_domain: legacy INTx IRQ domain
> > + * @msi_domain: MSI IRQ domain
> > + * @msi_bottom_domain: MSI IRQ bottom domain
> > + * @msi_sets: MSI sets information
> > + * @lock: lock protecting IRQ bit map
> > + * @msi_irq_in_use: bit map for assigned MSI IRQ
> >   */
> >  struct mtk_pcie_port {
> >  	struct device *dev;
> > @@ -100,6 +133,11 @@ struct mtk_pcie_port {
> >  	int irq;
> >  	raw_spinlock_t irq_lock;
> >  	struct irq_domain *intx_domain;
> > +	struct irq_domain *msi_domain;
> > +	struct irq_domain *msi_bottom_domain;
> > +	struct mtk_msi_set msi_sets[PCIE_MSI_SET_NUM];
> > +	struct mutex lock;
> > +	DECLARE_BITMAP(msi_irq_in_use, PCIE_MSI_IRQS_NUM);
> >  };
> >  
> >  /**
> > @@ -197,6 +235,35 @@ static int mtk_pcie_set_trans_table(struct mtk_pcie_port *port,
> >  	return 0;
> >  }
> >  
> > +static void mtk_pcie_enable_msi(struct mtk_pcie_port *port)
> > +{
> > +	int i;
> > +	u32 val;
> > +
> > +	val = readl_relaxed(port->base + PCIE_MSI_SET_ENABLE_REG);
> > +	val |= PCIE_MSI_SET_ENABLE;
> > +	writel_relaxed(val, port->base + PCIE_MSI_SET_ENABLE_REG);
> > +
> > +	val = readl_relaxed(port->base + PCIE_INT_ENABLE_REG);
> > +	val |= PCIE_MSI_ENABLE;
> > +	writel_relaxed(val, port->base + PCIE_INT_ENABLE_REG);
> 
> Shouldn't you configure the capture addresses *before* enabling
> things? Is there any need for locking here, given that you are
> modifying global registers?

Yes, in the next version I will move this code so that it runs after the
capture addresses have been configured.

I think the lock may not be needed because this function is only
executed once, when the driver probes.

> 
> > +
> > +	for (i = 0; i < PCIE_MSI_SET_NUM; i++) {
> > +		struct mtk_msi_set *msi_set = &port->msi_sets[i];
> > +
> > +		msi_set->base = port->base + PCIE_MSI_SET_BASE_REG +
> > +				i * PCIE_MSI_SET_OFFSET;
> > +		msi_set->msg_addr = port->reg_base + PCIE_MSI_SET_BASE_REG +
> > +				    i * PCIE_MSI_SET_OFFSET;
> > +
> > +		/* Configure the MSI capture address */
> > +		writel_relaxed(lower_32_bits(msi_set->msg_addr), msi_set->base);
> > +		writel_relaxed(upper_32_bits(msi_set->msg_addr),
> > +			       port->base + PCIE_MSI_SET_ADDR_HI_BASE +
> > +			       i * PCIE_MSI_SET_ADDR_HI_OFFSET);
> > +	}
> > +}
> > +
> >  static int mtk_pcie_startup_port(struct mtk_pcie_port *port)
> >  {
> >  	struct resource_entry *entry;
> > @@ -247,6 +314,8 @@ static int mtk_pcie_startup_port(struct mtk_pcie_port *port)
> >  		return err;
> >  	}
> >  
> > +	mtk_pcie_enable_msi(port);
> > +
> >  	/* Set PCIe translation windows */
> >  	resource_list_for_each_entry(entry, &host->windows) {
> >  		struct resource *res = entry->res;
> > @@ -290,6 +359,148 @@ static int mtk_pcie_set_affinity(struct irq_data *data,
> >  	return -EINVAL;
> >  }
> >  
> > +static void mtk_pcie_irq_mask(struct irq_data *data)
> 
> It'd be good if you used _msi_ in function names that deal with MSIs.

OK, I will fix it in the next version.

> 
> > +{
> > +	pci_msi_mask_irq(data);
> > +	irq_chip_mask_parent(data);
> > +}
> > +
> > +static void mtk_pcie_irq_unmask(struct irq_data *data)
> > +{
> > +	pci_msi_unmask_irq(data);
> > +	irq_chip_unmask_parent(data);
> > +}
> > +
> > +static struct irq_chip mtk_msi_irq_chip = {
> > +	.name = "MSI",
> > +	.irq_enable = mtk_pcie_irq_unmask,
> > +	.irq_disable = mtk_pcie_irq_mask,
> 
> Same comment as for the previous patch: enable/disable serve no
> purpose here.

As I replied on the previous patch, the enable/disable callbacks are used
when the system suspends/resumes.

> 
> > +	.irq_ack = irq_chip_ack_parent,
> > +	.irq_mask = mtk_pcie_irq_mask,
> > +	.irq_unmask = mtk_pcie_irq_unmask,
> > +};
> > +
> > +static struct msi_domain_info mtk_msi_domain_info = {
> > +	.flags	= (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_PCI_MSIX |
> > +		   MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI),
> 
> minor nit: keep the *_OPS flag on one line, and the *_PCI_* flags on
> another.

Sure, I will fix it in the next version.

> 
> > +	.chip	= &mtk_msi_irq_chip,
> > +};
> > +
> > +static void mtk_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
> > +{
> > +	struct mtk_msi_set *msi_set = irq_data_get_irq_chip_data(data);
> > +	struct mtk_pcie_port *port = data->domain->host_data;
> > +	unsigned long hwirq;
> > +
> > +	hwirq =	data->hwirq % PCIE_MSI_IRQS_PER_SET;
> > +
> > +	msg->address_hi = upper_32_bits(msi_set->msg_addr);
> > +	msg->address_lo = lower_32_bits(msi_set->msg_addr);
> > +	msg->data = hwirq;
> > +	dev_dbg(port->dev, "msi#%#lx address_hi %#x address_lo %#x data %d\n",
> > +		hwirq, msg->address_hi, msg->address_lo, msg->data);
> > +}
> > +
> > +static void mtk_msi_bottom_irq_ack(struct irq_data *data)
> > +{
> > +	struct mtk_msi_set *msi_set = irq_data_get_irq_chip_data(data);
> > +	unsigned long hwirq;
> > +
> > +	hwirq =	data->hwirq % PCIE_MSI_IRQS_PER_SET;
> > +
> > +	writel_relaxed(BIT(hwirq), msi_set->base + PCIE_MSI_SET_STATUS_OFFSET);
> > +}
> > +
> > +static void mtk_msi_bottom_irq_mask(struct irq_data *data)
> > +{
> > +	struct mtk_msi_set *msi_set = irq_data_get_irq_chip_data(data);
> > +	struct mtk_pcie_port *port = data->domain->host_data;
> > +	unsigned long hwirq, flags;
> > +	u32 val;
> > +
> > +	hwirq =	data->hwirq % PCIE_MSI_IRQS_PER_SET;
> > +
> > +	raw_spin_lock_irqsave(&port->irq_lock, flags);
> > +	val = readl_relaxed(msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
> > +	val &= ~BIT(hwirq);
> > +	writel_relaxed(val, msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
> > +	raw_spin_unlock_irqrestore(&port->irq_lock, flags);
> > +}
> > +
> > +static void mtk_msi_bottom_irq_unmask(struct irq_data *data)
> > +{
> > +	struct mtk_msi_set *msi_set = irq_data_get_irq_chip_data(data);
> > +	struct mtk_pcie_port *port = data->domain->host_data;
> > +	unsigned long hwirq, flags;
> > +	u32 val;
> > +
> > +	hwirq =	data->hwirq % PCIE_MSI_IRQS_PER_SET;
> > +
> > +	raw_spin_lock_irqsave(&port->irq_lock, flags);
> > +	val = readl_relaxed(msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
> > +	val |= BIT(hwirq);
> > +	writel_relaxed(val, msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
> > +	raw_spin_unlock_irqrestore(&port->irq_lock, flags);
> > +}
> > +
> > +static struct irq_chip mtk_msi_bottom_irq_chip = {
> > +	.irq_ack		= mtk_msi_bottom_irq_ack,
> > +	.irq_mask		= mtk_msi_bottom_irq_mask,
> > +	.irq_unmask		= mtk_msi_bottom_irq_unmask,
> > +	.irq_compose_msi_msg	= mtk_compose_msi_msg,
> > +	.irq_set_affinity	= mtk_pcie_set_affinity,
> > +	.name			= "MSI",
> > +};
> > +
> > +static int mtk_msi_bottom_domain_alloc(struct irq_domain *domain,
> > +				       unsigned int virq, unsigned int nr_irqs,
> > +				       void *arg)
> > +{
> > +	struct mtk_pcie_port *port = domain->host_data;
> > +	struct mtk_msi_set *msi_set;
> > +	int i, hwirq, set_idx;
> > +
> > +	mutex_lock(&port->lock);
> > +
> > +	hwirq = bitmap_find_free_region(port->msi_irq_in_use, PCIE_MSI_IRQS_NUM,
> > +					order_base_2(nr_irqs));
> > +
> > +	mutex_unlock(&port->lock);
> > +
> > +	if (hwirq < 0)
> > +		return -ENOSPC;
> > +
> > +	set_idx = hwirq / PCIE_MSI_IRQS_PER_SET;
> > +	msi_set = &port->msi_sets[set_idx];
> > +
> > +	for (i = 0; i < nr_irqs; i++)
> > +		irq_domain_set_info(domain, virq + i, hwirq + i,
> > +				    &mtk_msi_bottom_irq_chip, msi_set,
> > +				    handle_edge_irq, NULL, NULL);
> > +
> > +	return 0;
> > +}
> > +
> > +static void mtk_msi_bottom_domain_free(struct irq_domain *domain,
> > +				       unsigned int virq, unsigned int nr_irqs)
> > +{
> > +	struct mtk_pcie_port *port = domain->host_data;
> > +	struct irq_data *data = irq_domain_get_irq_data(domain, virq);
> > +
> > +	mutex_lock(&port->lock);
> > +
> > +	bitmap_clear(port->msi_irq_in_use, data->hwirq, nr_irqs);
> > +
> > +	mutex_unlock(&port->lock);
> > +
> > +	irq_domain_free_irqs_common(domain, virq, nr_irqs);
> > +}
> > +
> > +static const struct irq_domain_ops mtk_msi_bottom_domain_ops = {
> > +	.alloc = mtk_msi_bottom_domain_alloc,
> > +	.free = mtk_msi_bottom_domain_free,
> > +};
> > +
> >  static void mtk_intx_mask(struct irq_data *data)
> >  {
> >  	struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data);
> > @@ -360,6 +571,7 @@ static int mtk_pcie_init_irq_domains(struct mtk_pcie_port *port)
> >  {
> >  	struct device *dev = port->dev;
> >  	struct device_node *intc_node, *node = dev->of_node;
> > +	int ret;
> >  
> >  	raw_spin_lock_init(&port->irq_lock);
> >  
> > @@ -377,7 +589,34 @@ static int mtk_pcie_init_irq_domains(struct mtk_pcie_port *port)
> >  		return -ENODEV;
> >  	}
> >  
> > +	/* Setup MSI */
> > +	mutex_init(&port->lock);
> > +
> > +	port->msi_bottom_domain = irq_domain_add_linear(node, PCIE_MSI_IRQS_NUM,
> > +				  &mtk_msi_bottom_domain_ops, port);
> > +	if (!port->msi_bottom_domain) {
> > +		dev_info(dev, "failed to create MSI bottom domain\n");
> > +		ret = -ENODEV;
> > +		goto err_msi_bottom_domain;
> > +	}
> > +
> > +	port->msi_domain = pci_msi_create_irq_domain(dev->fwnode,
> > +						     &mtk_msi_domain_info,
> > +						     port->msi_bottom_domain);
> > +	if (!port->msi_domain) {
> > +		dev_info(dev, "failed to create MSI domain\n");
> > +		ret = -ENODEV;
> > +		goto err_msi_domain;
> > +	}
> > +
> >  	return 0;
> > +
> > +err_msi_domain:
> > +	irq_domain_remove(port->msi_bottom_domain);
> > +err_msi_bottom_domain:
> > +	irq_domain_remove(port->intx_domain);
> > +
> > +	return ret;
> >  }
> >  
> >  static void mtk_pcie_irq_teardown(struct mtk_pcie_port *port)
> > @@ -387,9 +626,39 @@ static void mtk_pcie_irq_teardown(struct mtk_pcie_port *port)
> >  	if (port->intx_domain)
> >  		irq_domain_remove(port->intx_domain);
> >  
> > +	if (port->msi_domain)
> > +		irq_domain_remove(port->msi_domain);
> > +
> > +	if (port->msi_bottom_domain)
> > +		irq_domain_remove(port->msi_bottom_domain);
> > +
> >  	irq_dispose_mapping(port->irq);
> >  }
> >  
> > +static void mtk_pcie_msi_handler(struct mtk_pcie_port *port, int set_idx)
> > +{
> > +	struct mtk_msi_set *msi_set = &port->msi_sets[set_idx];
> > +	unsigned long msi_enable, msi_status;
> > +	unsigned int virq;
> > +	irq_hw_number_t bit, hwirq;
> > +
> > +	msi_enable = readl_relaxed(msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
> > +
> > +	do {
> > +		msi_status = readl_relaxed(msi_set->base +
> > +					   PCIE_MSI_SET_STATUS_OFFSET);
> > +		msi_status &= msi_enable;
> > +		if (!msi_status)
> > +			break;
> > +
> > +		for_each_set_bit(bit, &msi_status, PCIE_MSI_IRQS_PER_SET) {
> > +			hwirq = bit + set_idx * PCIE_MSI_IRQS_PER_SET;
> > +			virq = irq_find_mapping(port->msi_bottom_domain, hwirq);
> > +			generic_handle_irq(virq);
> > +		}
> > +	} while (true);
> > +}
> > +
> >  static void mtk_pcie_irq_handler(struct irq_desc *desc)
> >  {
> >  	struct mtk_pcie_port *port = irq_desc_get_handler_data(desc);
> > @@ -408,6 +677,14 @@ static void mtk_pcie_irq_handler(struct irq_desc *desc)
> >  		generic_handle_irq(virq);
> >  	}
> >  
> > +	irq_bit = PCIE_MSI_SHIFT;
> > +	for_each_set_bit_from(irq_bit, &status, PCIE_MSI_SET_NUM +
> > +			      PCIE_MSI_SHIFT) {
> > +		mtk_pcie_msi_handler(port, irq_bit - PCIE_MSI_SHIFT);
> > +
> > +		writel_relaxed(BIT(irq_bit), port->base + PCIE_INT_STATUS_REG);
> 
> Isn't this write the same thing you have for EOI in the INTx case?
> While I could understand your description in that case (this is a
> resampling operation), I don't get what this does here. Either this is
> also an EOI, but your initial description doesn't make sense, or it is
> an Ack, and it should be moved to the right place.
> 
> Which one is it?

I think it should be an EOI, which is used to clear the interrupt status
of a single set in the PCIe intc field. Maybe I should move it to the end
of the mtk_pcie_msi_handler() function.

                  +-----+
                  | GIC |
                  +-----+
                     ^
                     |
                 port->irq
                     |
             +-+-+-+-+-+-+-+-+
             |0|1|2|3|4|5|6|7| (PCIe intc)
             +-+-+-+-+-+-+-+-+
              ^ ^           ^
              | |    ...    |
      +-------+ +------+    +-----------+
      |                |                |
+-+-+---+--+--+  +-+-+---+--+--+  +-+-+---+--+--+
|0|1|...|30|31|  |0|1|...|30|31|  |0|1|...|30|31| (MSI sets)
+-+-+---+--+--+  +-+-+---+--+--+  +-+-+---+--+--+
 ^ ^      ^  ^    ^ ^      ^  ^    ^ ^      ^  ^
 | |      |  |    | |      |  |    | |      |  |  (MSI vectors)
 | |      |  |    | |      |  |    | |      |  |

  (MSI SET0)       (MSI SET1)  ...   (MSI SET7)

I would like to ask another question. In this interrupt architecture, we
cannot implement an affinity for PCIe interrupts, so we return a
negative value in the mtk_pcie_set_affinity callback as follows: 

+static int mtk_pcie_set_affinity(struct irq_data *data,
+                                const struct cpumask *mask, bool force)
+{
+       return -EINVAL;
+}

But there will always be error logs when hotplug a CPU:

~ # echo 0 > /sys/devices/system/cpu/cpu1/online
[   93.633059] IRQ255: set affinity failed(-22).
[   93.633624] IRQ256: set affinity failed(-22).
[   93.634222] CPU1: shutdown
[   93.634586] psci: CPU1 killed (polled 0 ms)

Or when the system suspends:

~ # echo mem > /sys/power/state
[   93.635145] cpuhp: cpu_off cluster=0, cpu=1
[  169.835653] PM: suspend entry (deep)
[  169.836717] Filesystems sync: 0.000 seconds
[  169.837924] Freezing user space processes ... (elapsed 0.001 seconds)
done.
[  169.839922] OOM killer disabled.
[  169.840336] Freezing remaining freezable tasks ... (elapsed 0.001
seconds) done.
[  169.844715] Disabling non-boot CPUs ...
[  169.846443] IRQ255: set affinity failed(-22).
[  169.847002] IRQ256: set affinity failed(-22).
[  169.847586] CPU2: shutdown
[  169.847943] psci: CPU2 killed (polled 0 ms)
[  169.848489] cpuhp: cpu_off cluster=0, cpu=2
[  169.850285] IRQ255: set affinity failed(-22).
[  169.851369] IRQ256: set affinity failed(-22).
...

This can sometimes confuse users. Is there a way to prevent this error
log?

> 
> Thanks,
> 
> 	M.
> 

Thanks.
Marc Zyngier March 10, 2021, 9:41 a.m. UTC | #5
On Wed, 10 Mar 2021 06:48:49 +0000,
Jianjun Wang <jianjun.wang@mediatek.com> wrote:
> > > +static struct irq_chip mtk_msi_irq_chip = {
> > > +	.name = "MSI",
> > > +	.irq_enable = mtk_pcie_irq_unmask,
> > > +	.irq_disable = mtk_pcie_irq_mask,
> > 
> > Same comment as for the previous patch: enable/disable serve no
> > purpose here.
> 
> Replied in the previous patch, the enable/disable callback is used when
> the system suspend/resume.

As I said, your suspend/resume should be self contained, and not rely
on the irq subsystem to restore a viable state.

[...]

> > > @@ -408,6 +677,14 @@ static void mtk_pcie_irq_handler(struct irq_desc *desc)
> > >  		generic_handle_irq(virq);
> > >  	}
> > >  
> > > +	irq_bit = PCIE_MSI_SHIFT;
> > > +	for_each_set_bit_from(irq_bit, &status, PCIE_MSI_SET_NUM +
> > > +			      PCIE_MSI_SHIFT) {
> > > +		mtk_pcie_msi_handler(port, irq_bit - PCIE_MSI_SHIFT);
> > > +
> > > +		writel_relaxed(BIT(irq_bit), port->base + PCIE_INT_STATUS_REG);
> > 
> > Isn't this write the same thing you have for EOI in the INTx case?
> > While I could understand your description in that case (this is a
> > resampling operation), I don't get what this does here. Either this is
> > also an EOI, but your initial description doesn't make sense, or it is
> > an Ack, and it should be moved to the right place.
> > 
> > Which one is it?
> 
> I think it should be an EOI which used to clear the interrupt status of
> a single set in the PCIe intc field, maybe I should move it to the end
> of the mtk_pcie_msi_handler() function.

I doubt this is an EOI. If, as I suspect, it instructs the HW to clear
the bit so that new pending bits can be recorded, it must take place
*before* the interrupt is handled, or you may lose MSIs in the
interval between the handling of the interrupt and the clearing of the
pending bit. To satisfy this requirement, this should be an ACK, which
is consistent with the way most MSI controllers such as this one work.

> 
>                   +-----+
>                   | GIC |
>                   +-----+
>                      ^
>                      |
>                  port->irq
>                      |
>              +-+-+-+-+-+-+-+-+
>              |0|1|2|3|4|5|6|7| (PCIe intc)
>              +-+-+-+-+-+-+-+-+
>               ^ ^           ^
>               | |    ...    |
>       +-------+ +------+    +-----------+
>       |                |                |
> +-+-+---+--+--+  +-+-+---+--+--+  +-+-+---+--+--+
> |0|1|...|30|31|  |0|1|...|30|31|  |0|1|...|30|31| (MSI sets)
> +-+-+---+--+--+  +-+-+---+--+--+  +-+-+---+--+--+
>  ^ ^      ^  ^    ^ ^      ^  ^    ^ ^      ^  ^
>  | |      |  |    | |      |  |    | |      |  |  (MSI vectors)
>  | |      |  |    | |      |  |    | |      |  |
> 
>   (MSI SET0)       (MSI SET1)  ...   (MSI SET7)
> 
> I would like to ask another question. In this interrupt architecture, we
> cannot implement an affinity for PCIe interrupts, so we return a
> negative value in the mtk_pcie_set_affinity callback as follows: 
> 
> +static int mtk_pcie_set_affinity(struct irq_data *data,
> +                                const struct cpumask *mask, bool force)
> +{
> +       return -EINVAL;
> +}
> 
> But there will always be error logs when hotplug a CPU:
> 
> ~ # echo 0 > /sys/devices/system/cpu/cpu1/online
> [   93.633059] IRQ255: set affinity failed(-22).
> [   93.633624] IRQ256: set affinity failed(-22).
> [   93.634222] CPU1: shutdown
> [   93.634586] psci: CPU1 killed (polled 0 ms)
> 
> Or when the system suspends:
> 
> ~ # echo mem > /sys/power/state
> [   93.635145] cpuhp: cpu_off cluster=0, cpu=1
> [  169.835653] PM: suspend entry (deep)
> [  169.836717] Filesystems sync: 0.000 seconds
> [  169.837924] Freezing user space processes ... (elapsed 0.001 seconds)
> done.
> [  169.839922] OOM killer disabled.
> [  169.840336] Freezing remaining freezable tasks ... (elapsed 0.001
> seconds) done.
> [  169.844715] Disabling non-boot CPUs ...
> [  169.846443] IRQ255: set affinity failed(-22).
> [  169.847002] IRQ256: set affinity failed(-22).
> [  169.847586] CPU2: shutdown
> [  169.847943] psci: CPU2 killed (polled 0 ms)
> [  169.848489] cpuhp: cpu_off cluster=0, cpu=2
> [  169.850285] IRQ255: set affinity failed(-22).
> [  169.851369] IRQ256: set affinity failed(-22).
> ...
> 
> Sometimes this can cause misunderstandings to users, do we have a chance
> to prevent this error log?

No. This HW doesn't allow MSIs to be individually retargeted, and the
kernel isn't happy about that. That's one of the many reasons why
hiding MSIs behind a mux (or two in your case) is a *very bad idea*.

Thanks,

	M.
Pali Rohár March 11, 2021, 12:05 a.m. UTC | #6
On Wednesday 24 February 2021 14:11:30 Jianjun Wang wrote:
> +static int mtk_msi_bottom_domain_alloc(struct irq_domain *domain,
> +				       unsigned int virq, unsigned int nr_irqs,
> +				       void *arg)
> +{
> +	struct mtk_pcie_port *port = domain->host_data;
> +	struct mtk_msi_set *msi_set;
> +	int i, hwirq, set_idx;
> +
> +	mutex_lock(&port->lock);
> +
> +	hwirq = bitmap_find_free_region(port->msi_irq_in_use, PCIE_MSI_IRQS_NUM,
> +					order_base_2(nr_irqs));
> +
> +	mutex_unlock(&port->lock);
> +
> +	if (hwirq < 0)
> +		return -ENOSPC;
> +
> +	set_idx = hwirq / PCIE_MSI_IRQS_PER_SET;
> +	msi_set = &port->msi_sets[set_idx];
> +
> +	for (i = 0; i < nr_irqs; i++)
> +		irq_domain_set_info(domain, virq + i, hwirq + i,
> +				    &mtk_msi_bottom_irq_chip, msi_set,
> +				    handle_edge_irq, NULL, NULL);
> +
> +	return 0;
> +}
> +
> +static void mtk_msi_bottom_domain_free(struct irq_domain *domain,
> +				       unsigned int virq, unsigned int nr_irqs)
> +{
> +	struct mtk_pcie_port *port = domain->host_data;
> +	struct irq_data *data = irq_domain_get_irq_data(domain, virq);
> +
> +	mutex_lock(&port->lock);
> +
> +	bitmap_clear(port->msi_irq_in_use, data->hwirq, nr_irqs);

Marc, shouldn't there be bitmap_release_region() with order_base_2() here?

bitmap_release_region(port->msi_irq_in_use, data->hwirq, order_base_2(nr_irqs));

Because mtk_msi_bottom_domain_alloc() is allocating a region of order
order_base_2(nr_irqs), i.e. 1 << order_base_2(nr_irqs) interrupts, not
only nr_irqs.

> +
> +	mutex_unlock(&port->lock);
> +
> +	irq_domain_free_irqs_common(domain, virq, nr_irqs);
> +}
Marc Zyngier March 11, 2021, 8:19 a.m. UTC | #7
On 2021-03-11 00:05, Pali Rohár wrote:
> On Wednesday 24 February 2021 14:11:30 Jianjun Wang wrote:
>> +static int mtk_msi_bottom_domain_alloc(struct irq_domain *domain,
>> +				       unsigned int virq, unsigned int nr_irqs,
>> +				       void *arg)
>> +{
>> +	struct mtk_pcie_port *port = domain->host_data;
>> +	struct mtk_msi_set *msi_set;
>> +	int i, hwirq, set_idx;
>> +
>> +	mutex_lock(&port->lock);
>> +
>> +	hwirq = bitmap_find_free_region(port->msi_irq_in_use, 
>> PCIE_MSI_IRQS_NUM,
>> +					order_base_2(nr_irqs));
>> +
>> +	mutex_unlock(&port->lock);
>> +
>> +	if (hwirq < 0)
>> +		return -ENOSPC;
>> +
>> +	set_idx = hwirq / PCIE_MSI_IRQS_PER_SET;
>> +	msi_set = &port->msi_sets[set_idx];
>> +
>> +	for (i = 0; i < nr_irqs; i++)
>> +		irq_domain_set_info(domain, virq + i, hwirq + i,
>> +				    &mtk_msi_bottom_irq_chip, msi_set,
>> +				    handle_edge_irq, NULL, NULL);
>> +
>> +	return 0;
>> +}
>> +
>> +static void mtk_msi_bottom_domain_free(struct irq_domain *domain,
>> +				       unsigned int virq, unsigned int nr_irqs)
>> +{
>> +	struct mtk_pcie_port *port = domain->host_data;
>> +	struct irq_data *data = irq_domain_get_irq_data(domain, virq);
>> +
>> +	mutex_lock(&port->lock);
>> +
>> +	bitmap_clear(port->msi_irq_in_use, data->hwirq, nr_irqs);
> 
> Marc, should not be there bitmap_release_region() with order_base_2()?
> 
> bitmap_release_region(port->msi_irq_in_use, data->hwirq, 
> order_base_2(nr_irqs));
> 
> Because mtk_msi_bottom_domain_alloc() is allocating
> order_base_2(nr_irqs) interrupts, not only nr_irqs.

Indeed, good catch.

Thanks,

         M.
Jianjun Wang (王建军) March 11, 2021, 9:47 a.m. UTC | #8
On Wed, 2021-03-10 at 09:41 +0000, Marc Zyngier wrote:
> On Wed, 10 Mar 2021 06:48:49 +0000,
> Jianjun Wang <jianjun.wang@mediatek.com> wrote:
> > > > +static struct irq_chip mtk_msi_irq_chip = {
> > > > +	.name = "MSI",
> > > > +	.irq_enable = mtk_pcie_irq_unmask,
> > > > +	.irq_disable = mtk_pcie_irq_mask,
> > > 
> > > Same comment as for the previous patch: enable/disable serve no
> > > purpose here.
> > 
> > Replied in the previous patch, the enable/disable callback is used when
> > the system suspend/resume.
> 
> As I said, your suspend/resume should be self contained, and not rely
> on the irq subsystem to restore a viable state.

OK, I will try to find another way to save and restore the enabled state
of interrupts when the system suspend/resume.

> 
> [...]
> 
> > > > @@ -408,6 +677,14 @@ static void mtk_pcie_irq_handler(struct irq_desc *desc)
> > > >  		generic_handle_irq(virq);
> > > >  	}
> > > >  
> > > > +	irq_bit = PCIE_MSI_SHIFT;
> > > > +	for_each_set_bit_from(irq_bit, &status, PCIE_MSI_SET_NUM +
> > > > +			      PCIE_MSI_SHIFT) {
> > > > +		mtk_pcie_msi_handler(port, irq_bit - PCIE_MSI_SHIFT);
> > > > +
> > > > +		writel_relaxed(BIT(irq_bit), port->base + PCIE_INT_STATUS_REG);
> > > 
> > > Isn't this write the same thing you have for EOI in the INTx case?
> > > While I could understand your description in that case (this is a
> > > resampling operation), I don't get what this does here. Either this is
> > > also an EOI, but your initial description doesn't make sense, or it is
> > > an Ack, and it should be moved to the right place.
> > > 
> > > Which one is it?
> > 
> > I think it should be an EOI which used to clear the interrupt status of
> > a single set in the PCIe intc field, maybe I should move it to the end
> > of the mtk_pcie_msi_handler() function.
> 
> I doubt this is an EOI. If, as I suspect, it instructs the HW to clear
> the bit so that new pending bits can be recorded, it must take place
> *before* the interrupt is handled, or you may lose MSIs in the
> interval between the handling of the interrupt and the clearing of the
> pending bit. To satisfy this requirement, this should be an ACK, which
> is consistent with the way most MSI controllers such as this one work.

These bits are similar to the interrupt status of INTx, and the
interrupt status will remain set until all the status bits of the
corresponding set are cleared. There is a while loop in
mtk_pcie_msi_handler() which continuously polls and ACKs the status of
the MSI set, so I think no MSI will be lost in this case.
 
> 
> > 
> >                   +-----+
> >                   | GIC |
> >                   +-----+
> >                      ^
> >                      |
> >                  port->irq
> >                      |
> >              +-+-+-+-+-+-+-+-+
> >              |0|1|2|3|4|5|6|7| (PCIe intc)
> >              +-+-+-+-+-+-+-+-+
> >               ^ ^           ^
> >               | |    ...    |
> >       +-------+ +------+    +-----------+
> >       |                |                |
> > +-+-+---+--+--+  +-+-+---+--+--+  +-+-+---+--+--+
> > |0|1|...|30|31|  |0|1|...|30|31|  |0|1|...|30|31| (MSI sets)
> > +-+-+---+--+--+  +-+-+---+--+--+  +-+-+---+--+--+
> >  ^ ^      ^  ^    ^ ^      ^  ^    ^ ^      ^  ^
> >  | |      |  |    | |      |  |    | |      |  |  (MSI vectors)
> >  | |      |  |    | |      |  |    | |      |  |
> > 
> >   (MSI SET0)       (MSI SET1)  ...   (MSI SET7)
> > 
> > I would like to ask another question. In this interrupt architecture, we
> > cannot implement an affinity for PCIe interrupts, so we return a
> > negative value in the mtk_pcie_set_affinity callback as follows: 
> > 
> > +static int mtk_pcie_set_affinity(struct irq_data *data,
> > +                                const struct cpumask *mask, bool force)
> > +{
> > +       return -EINVAL;
> > +}
> > 
> > But there will always be error logs when hotplug a CPU:
> > 
> > ~ # echo 0 > /sys/devices/system/cpu/cpu1/online
> > [   93.633059] IRQ255: set affinity failed(-22).
> > [   93.633624] IRQ256: set affinity failed(-22).
> > [   93.634222] CPU1: shutdown
> > [   93.634586] psci: CPU1 killed (polled 0 ms)
> > 
> > Or when the system suspends:
> > 
> > ~ # echo mem > /sys/power/state
> > [   93.635145] cpuhp: cpu_off cluster=0, cpu=1
> > [  169.835653] PM: suspend entry (deep)
> > [  169.836717] Filesystems sync: 0.000 seconds
> > [  169.837924] Freezing user space processes ... (elapsed 0.001 seconds)
> > done.
> > [  169.839922] OOM killer disabled.
> > [  169.840336] Freezing remaining freezable tasks ... (elapsed 0.001
> > seconds) done.
> > [  169.844715] Disabling non-boot CPUs ...
> > [  169.846443] IRQ255: set affinity failed(-22).
> > [  169.847002] IRQ256: set affinity failed(-22).
> > [  169.847586] CPU2: shutdown
> > [  169.847943] psci: CPU2 killed (polled 0 ms)
> > [  169.848489] cpuhp: cpu_off cluster=0, cpu=2
> > [  169.850285] IRQ255: set affinity failed(-22).
> > [  169.851369] IRQ256: set affinity failed(-22).
> > ...
> > 
> > Sometimes this can cause misunderstandings to users, do we have a chance
> > to prevent this error log?
> 
> No. This HW doesn't allow MSIs to be individually retargeted, and the
> kernel isn't happy about that. That's one of the many reasons why
> hiding MSIs behind a mux (or two in your case) is a *very bad idea*.
> 
> Thanks,
> 
> 	M.
>
Jianjun Wang (王建军) March 11, 2021, 9:50 a.m. UTC | #9
On Thu, 2021-03-11 at 08:19 +0000, Marc Zyngier wrote:
> On 2021-03-11 00:05, Pali Rohár wrote:
> > On Wednesday 24 February 2021 14:11:30 Jianjun Wang wrote:
> >> +static int mtk_msi_bottom_domain_alloc(struct irq_domain *domain,
> >> +				       unsigned int virq, unsigned int nr_irqs,
> >> +				       void *arg)
> >> +{
> >> +	struct mtk_pcie_port *port = domain->host_data;
> >> +	struct mtk_msi_set *msi_set;
> >> +	int i, hwirq, set_idx;
> >> +
> >> +	mutex_lock(&port->lock);
> >> +
> >> +	hwirq = bitmap_find_free_region(port->msi_irq_in_use, 
> >> PCIE_MSI_IRQS_NUM,
> >> +					order_base_2(nr_irqs));
> >> +
> >> +	mutex_unlock(&port->lock);
> >> +
> >> +	if (hwirq < 0)
> >> +		return -ENOSPC;
> >> +
> >> +	set_idx = hwirq / PCIE_MSI_IRQS_PER_SET;
> >> +	msi_set = &port->msi_sets[set_idx];
> >> +
> >> +	for (i = 0; i < nr_irqs; i++)
> >> +		irq_domain_set_info(domain, virq + i, hwirq + i,
> >> +				    &mtk_msi_bottom_irq_chip, msi_set,
> >> +				    handle_edge_irq, NULL, NULL);
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +static void mtk_msi_bottom_domain_free(struct irq_domain *domain,
> >> +				       unsigned int virq, unsigned int nr_irqs)
> >> +{
> >> +	struct mtk_pcie_port *port = domain->host_data;
> >> +	struct irq_data *data = irq_domain_get_irq_data(domain, virq);
> >> +
> >> +	mutex_lock(&port->lock);
> >> +
> >> +	bitmap_clear(port->msi_irq_in_use, data->hwirq, nr_irqs);
> > 
> > Marc, should not be there bitmap_release_region() with order_base_2()?
> > 
> > bitmap_release_region(port->msi_irq_in_use, data->hwirq, 
> > order_base_2(nr_irqs));
> > 
> > Because mtk_msi_bottom_domain_alloc() is allocating
> > order_base_2(nr_irqs) interrupts, not only nr_irqs.
> 
> Indeed, good catch.

I will fix it in the next version, thanks for your review.

> 
> Thanks,
> 
>          M.
Marc Zyngier March 12, 2021, 9:14 a.m. UTC | #10
On Thu, 11 Mar 2021 09:47:45 +0000,
Jianjun Wang <jianjun.wang@mediatek.com> wrote:
> 
> On Wed, 2021-03-10 at 09:41 +0000, Marc Zyngier wrote:
> > On Wed, 10 Mar 2021 06:48:49 +0000,
> > Jianjun Wang <jianjun.wang@mediatek.com> wrote:
> > > > > @@ -408,6 +677,14 @@ static void mtk_pcie_irq_handler(struct irq_desc *desc)
> > > > >  		generic_handle_irq(virq);
> > > > >  	}
> > > > >  
> > > > > +	irq_bit = PCIE_MSI_SHIFT;
> > > > > +	for_each_set_bit_from(irq_bit, &status, PCIE_MSI_SET_NUM +
> > > > > +			      PCIE_MSI_SHIFT) {
> > > > > +		mtk_pcie_msi_handler(port, irq_bit - PCIE_MSI_SHIFT);
> > > > > +
> > > > > +		writel_relaxed(BIT(irq_bit), port->base + PCIE_INT_STATUS_REG);
> > > > 
> > > > Isn't this write the same thing you have for EOI in the INTx case?
> > > > While I could understand your description in that case (this is a
> > > > resampling operation), I don't get what this does here. Either this is
> > > > also an EOI, but your initial description doesn't make sense, or it is
> > > > an Ack, and it should be moved to the right place.
> > > > 
> > > > Which one is it?
> > > 
> > > I think it should be an EOI which used to clear the interrupt status of
> > > a single set in the PCIe intc field, maybe I should move it to the end
> > > of the mtk_pcie_msi_handler() function.
> > 
> > I doubt this is an EOI. If, as I suspect, it instructs the HW to clear
> > the bit so that new pending bits can be recorded, it must take place
> > *before* the interrupt is handled, or you may lose MSIs in the
> > interval between the handling of the interrupt and the clearing of the
> > pending bit. To satisfy this requirement, this should be an ACK, which
> > is consistent with the way most MSI controllers such as this one work.
> 
> These bits are similar with the interrupt status of INTx, and the
> interrupt status will remain until all the status of the corresponding
> set are cleared. There is a while loop in mtk_pcie_msi_handler() which
> is used to continuously polling and ACK the status of the MSI set, I
> think the MSI may not be lose in this case.

Ah, is that the write to PCIE_MSI_SET_STATUS_OFFSET that you are
referring to? In that case, yes, I agree.

However, this write to PCIE_INT_STATUS_REG is more a property of the
mux interrupt and not one of the MSI interrupt. Given that you do not
represent that level as another level of chained controller, you might
as well leave it where it is...

Thanks,

	M.
diff mbox series

Patch

diff --git a/drivers/pci/controller/pcie-mediatek-gen3.c b/drivers/pci/controller/pcie-mediatek-gen3.c
index 8b3b5f838b69..3cbec22ece0c 100644
--- a/drivers/pci/controller/pcie-mediatek-gen3.c
+++ b/drivers/pci/controller/pcie-mediatek-gen3.c
@@ -14,6 +14,7 @@ 
 #include <linux/irqdomain.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/msi.h>
 #include <linux/pci.h>
 #include <linux/phy/phy.h>
 #include <linux/platform_device.h>
@@ -48,12 +49,29 @@ 
 #define PCIE_LINK_STATUS_REG		0x154
 #define PCIE_PORT_LINKUP		BIT(8)
 
+#define PCIE_MSI_SET_NUM		8
+#define PCIE_MSI_IRQS_PER_SET		32
+#define PCIE_MSI_IRQS_NUM \
+	(PCIE_MSI_IRQS_PER_SET * PCIE_MSI_SET_NUM)
+
 #define PCIE_INT_ENABLE_REG		0x180
+#define PCIE_MSI_ENABLE			GENMASK(PCIE_MSI_SET_NUM + 8 - 1, 8)
+#define PCIE_MSI_SHIFT			8
 #define PCIE_INTX_SHIFT			24
 #define PCIE_INTX_ENABLE \
 	GENMASK(PCIE_INTX_SHIFT + PCI_NUM_INTX - 1, PCIE_INTX_SHIFT)
 
 #define PCIE_INT_STATUS_REG		0x184
+#define PCIE_MSI_SET_ENABLE_REG		0x190
+#define PCIE_MSI_SET_ENABLE		GENMASK(PCIE_MSI_SET_NUM - 1, 0)
+
+#define PCIE_MSI_SET_BASE_REG		0xc00
+#define PCIE_MSI_SET_OFFSET		0x10
+#define PCIE_MSI_SET_STATUS_OFFSET	0x04
+#define PCIE_MSI_SET_ENABLE_OFFSET	0x08
+
+#define PCIE_MSI_SET_ADDR_HI_BASE	0xc80
+#define PCIE_MSI_SET_ADDR_HI_OFFSET	0x04
 
 #define PCIE_TRANS_TABLE_BASE_REG	0x800
 #define PCIE_ATR_SRC_ADDR_MSB_OFFSET	0x4
@@ -73,6 +91,16 @@ 
 #define PCIE_ATR_TLP_TYPE_MEM		PCIE_ATR_TLP_TYPE(0)
 #define PCIE_ATR_TLP_TYPE_IO		PCIE_ATR_TLP_TYPE(2)
 
+/**
+ * struct mtk_msi_set - MSI information for each set
+ * @base: IO mapped register base
+ * @msg_addr: MSI message address
+ */
+struct mtk_msi_set {
+	void __iomem *base;
+	phys_addr_t msg_addr;
+};
+
 /**
  * struct mtk_pcie_port - PCIe port information
  * @dev: pointer to PCIe device
@@ -86,6 +114,11 @@ 
  * @irq: PCIe controller interrupt number
  * @irq_lock: lock protecting IRQ register access
  * @intx_domain: legacy INTx IRQ domain
+ * @msi_domain: MSI IRQ domain
+ * @msi_bottom_domain: MSI IRQ bottom domain
+ * @msi_sets: MSI sets information
+ * @lock: lock protecting IRQ bit map
+ * @msi_irq_in_use: bit map for assigned MSI IRQ
  */
 struct mtk_pcie_port {
 	struct device *dev;
@@ -100,6 +133,11 @@  struct mtk_pcie_port {
 	int irq;
 	raw_spinlock_t irq_lock;
 	struct irq_domain *intx_domain;
+	struct irq_domain *msi_domain;
+	struct irq_domain *msi_bottom_domain;
+	struct mtk_msi_set msi_sets[PCIE_MSI_SET_NUM];
+	struct mutex lock;
+	DECLARE_BITMAP(msi_irq_in_use, PCIE_MSI_IRQS_NUM);
 };
 
 /**
@@ -197,6 +235,35 @@  static int mtk_pcie_set_trans_table(struct mtk_pcie_port *port,
 	return 0;
 }
 
+static void mtk_pcie_enable_msi(struct mtk_pcie_port *port)
+{
+	int i;
+	u32 val;
+
+	val = readl_relaxed(port->base + PCIE_MSI_SET_ENABLE_REG);
+	val |= PCIE_MSI_SET_ENABLE;
+	writel_relaxed(val, port->base + PCIE_MSI_SET_ENABLE_REG);
+
+	val = readl_relaxed(port->base + PCIE_INT_ENABLE_REG);
+	val |= PCIE_MSI_ENABLE;
+	writel_relaxed(val, port->base + PCIE_INT_ENABLE_REG);
+
+	for (i = 0; i < PCIE_MSI_SET_NUM; i++) {
+		struct mtk_msi_set *msi_set = &port->msi_sets[i];
+
+		msi_set->base = port->base + PCIE_MSI_SET_BASE_REG +
+				i * PCIE_MSI_SET_OFFSET;
+		msi_set->msg_addr = port->reg_base + PCIE_MSI_SET_BASE_REG +
+				    i * PCIE_MSI_SET_OFFSET;
+
+		/* Configure the MSI capture address */
+		writel_relaxed(lower_32_bits(msi_set->msg_addr), msi_set->base);
+		writel_relaxed(upper_32_bits(msi_set->msg_addr),
+			       port->base + PCIE_MSI_SET_ADDR_HI_BASE +
+			       i * PCIE_MSI_SET_ADDR_HI_OFFSET);
+	}
+}
+
 static int mtk_pcie_startup_port(struct mtk_pcie_port *port)
 {
 	struct resource_entry *entry;
@@ -247,6 +314,8 @@  static int mtk_pcie_startup_port(struct mtk_pcie_port *port)
 		return err;
 	}
 
+	mtk_pcie_enable_msi(port);
+
 	/* Set PCIe translation windows */
 	resource_list_for_each_entry(entry, &host->windows) {
 		struct resource *res = entry->res;
@@ -290,6 +359,148 @@  static int mtk_pcie_set_affinity(struct irq_data *data,
 	return -EINVAL;
 }
 
+static void mtk_pcie_irq_mask(struct irq_data *data)
+{
+	pci_msi_mask_irq(data);
+	irq_chip_mask_parent(data);
+}
+
+static void mtk_pcie_irq_unmask(struct irq_data *data)
+{
+	pci_msi_unmask_irq(data);
+	irq_chip_unmask_parent(data);
+}
+
+static struct irq_chip mtk_msi_irq_chip = {
+	.name = "MSI",
+	.irq_enable = mtk_pcie_irq_unmask,
+	.irq_disable = mtk_pcie_irq_mask,
+	.irq_ack = irq_chip_ack_parent,
+	.irq_mask = mtk_pcie_irq_mask,
+	.irq_unmask = mtk_pcie_irq_unmask,
+};
+
+static struct msi_domain_info mtk_msi_domain_info = {
+	.flags	= (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_PCI_MSIX |
+		   MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI),
+	.chip	= &mtk_msi_irq_chip,
+};
+
+static void mtk_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+	struct mtk_msi_set *msi_set = irq_data_get_irq_chip_data(data);
+	struct mtk_pcie_port *port = data->domain->host_data;
+	unsigned long hwirq;
+
+	hwirq =	data->hwirq % PCIE_MSI_IRQS_PER_SET;
+
+	msg->address_hi = upper_32_bits(msi_set->msg_addr);
+	msg->address_lo = lower_32_bits(msi_set->msg_addr);
+	msg->data = hwirq;
+	dev_dbg(port->dev, "msi#%#lx address_hi %#x address_lo %#x data %d\n",
+		hwirq, msg->address_hi, msg->address_lo, msg->data);
+}
+
+static void mtk_msi_bottom_irq_ack(struct irq_data *data)
+{
+	struct mtk_msi_set *msi_set = irq_data_get_irq_chip_data(data);
+	unsigned long hwirq;
+
+	hwirq =	data->hwirq % PCIE_MSI_IRQS_PER_SET;
+
+	writel_relaxed(BIT(hwirq), msi_set->base + PCIE_MSI_SET_STATUS_OFFSET);
+}
+
+static void mtk_msi_bottom_irq_mask(struct irq_data *data)
+{
+	struct mtk_msi_set *msi_set = irq_data_get_irq_chip_data(data);
+	struct mtk_pcie_port *port = data->domain->host_data;
+	unsigned long hwirq, flags;
+	u32 val;
+
+	hwirq =	data->hwirq % PCIE_MSI_IRQS_PER_SET;
+
+	raw_spin_lock_irqsave(&port->irq_lock, flags);
+	val = readl_relaxed(msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
+	val &= ~BIT(hwirq);
+	writel_relaxed(val, msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
+	raw_spin_unlock_irqrestore(&port->irq_lock, flags);
+}
+
+static void mtk_msi_bottom_irq_unmask(struct irq_data *data)
+{
+	struct mtk_msi_set *msi_set = irq_data_get_irq_chip_data(data);
+	struct mtk_pcie_port *port = data->domain->host_data;
+	unsigned long hwirq, flags;
+	u32 val;
+
+	hwirq =	data->hwirq % PCIE_MSI_IRQS_PER_SET;
+
+	raw_spin_lock_irqsave(&port->irq_lock, flags);
+	val = readl_relaxed(msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
+	val |= BIT(hwirq);
+	writel_relaxed(val, msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
+	raw_spin_unlock_irqrestore(&port->irq_lock, flags);
+}
+
+static struct irq_chip mtk_msi_bottom_irq_chip = {
+	.irq_ack		= mtk_msi_bottom_irq_ack,
+	.irq_mask		= mtk_msi_bottom_irq_mask,
+	.irq_unmask		= mtk_msi_bottom_irq_unmask,
+	.irq_compose_msi_msg	= mtk_compose_msi_msg,
+	.irq_set_affinity	= mtk_pcie_set_affinity,
+	.name			= "MSI",
+};
+
+static int mtk_msi_bottom_domain_alloc(struct irq_domain *domain,
+				       unsigned int virq, unsigned int nr_irqs,
+				       void *arg)
+{
+	struct mtk_pcie_port *port = domain->host_data;
+	struct mtk_msi_set *msi_set;
+	int i, hwirq, set_idx;
+
+	mutex_lock(&port->lock);
+
+	hwirq = bitmap_find_free_region(port->msi_irq_in_use, PCIE_MSI_IRQS_NUM,
+					order_base_2(nr_irqs));
+
+	mutex_unlock(&port->lock);
+
+	if (hwirq < 0)
+		return -ENOSPC;
+
+	set_idx = hwirq / PCIE_MSI_IRQS_PER_SET;
+	msi_set = &port->msi_sets[set_idx];
+
+	for (i = 0; i < nr_irqs; i++)
+		irq_domain_set_info(domain, virq + i, hwirq + i,
+				    &mtk_msi_bottom_irq_chip, msi_set,
+				    handle_edge_irq, NULL, NULL);
+
+	return 0;
+}
+
+static void mtk_msi_bottom_domain_free(struct irq_domain *domain,
+				       unsigned int virq, unsigned int nr_irqs)
+{
+	struct mtk_pcie_port *port = domain->host_data;
+	struct irq_data *data = irq_domain_get_irq_data(domain, virq);
+
+	mutex_lock(&port->lock);
+	/* alloc reserved 1 << order_base_2(nr_irqs) bits; release that same region */
+	bitmap_release_region(port->msi_irq_in_use, data->hwirq, order_base_2(nr_irqs));
+
+	mutex_unlock(&port->lock);
+
+	irq_domain_free_irqs_common(domain, virq, nr_irqs);
+}
+
+static const struct irq_domain_ops mtk_msi_bottom_domain_ops = {
+	.alloc = mtk_msi_bottom_domain_alloc,
+	.free = mtk_msi_bottom_domain_free,
+};
+
 static void mtk_intx_mask(struct irq_data *data)
 {
 	struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data);
@@ -360,6 +571,7 @@  static int mtk_pcie_init_irq_domains(struct mtk_pcie_port *port)
 {
 	struct device *dev = port->dev;
 	struct device_node *intc_node, *node = dev->of_node;
+	int ret;
 
 	raw_spin_lock_init(&port->irq_lock);
 
@@ -377,7 +589,34 @@  static int mtk_pcie_init_irq_domains(struct mtk_pcie_port *port)
 		return -ENODEV;
 	}
 
+	/* Setup MSI */
+	mutex_init(&port->lock);
+
+	port->msi_bottom_domain = irq_domain_add_linear(node, PCIE_MSI_IRQS_NUM,
+				  &mtk_msi_bottom_domain_ops, port);
+	if (!port->msi_bottom_domain) {
+		dev_info(dev, "failed to create MSI bottom domain\n");
+		ret = -ENODEV;
+		goto err_msi_bottom_domain;
+	}
+
+	port->msi_domain = pci_msi_create_irq_domain(dev->fwnode,
+						     &mtk_msi_domain_info,
+						     port->msi_bottom_domain);
+	if (!port->msi_domain) {
+		dev_info(dev, "failed to create MSI domain\n");
+		ret = -ENODEV;
+		goto err_msi_domain;
+	}
+
 	return 0;
+
+err_msi_domain:
+	irq_domain_remove(port->msi_bottom_domain);
+err_msi_bottom_domain:
+	irq_domain_remove(port->intx_domain);
+
+	return ret;
 }
 
 static void mtk_pcie_irq_teardown(struct mtk_pcie_port *port)
@@ -387,9 +626,39 @@  static void mtk_pcie_irq_teardown(struct mtk_pcie_port *port)
 	if (port->intx_domain)
 		irq_domain_remove(port->intx_domain);
 
+	if (port->msi_domain)
+		irq_domain_remove(port->msi_domain);
+
+	if (port->msi_bottom_domain)
+		irq_domain_remove(port->msi_bottom_domain);
+
 	irq_dispose_mapping(port->irq);
 }
 
+static void mtk_pcie_msi_handler(struct mtk_pcie_port *port, int set_idx)
+{
+	struct mtk_msi_set *msi_set = &port->msi_sets[set_idx];
+	unsigned long msi_enable, msi_status;
+	unsigned int virq;
+	irq_hw_number_t bit, hwirq;
+
+	msi_enable = readl_relaxed(msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
+
+	do {
+		msi_status = readl_relaxed(msi_set->base +
+					   PCIE_MSI_SET_STATUS_OFFSET);
+		msi_status &= msi_enable;
+		if (!msi_status)
+			break;
+
+		for_each_set_bit(bit, &msi_status, PCIE_MSI_IRQS_PER_SET) {
+			hwirq = bit + set_idx * PCIE_MSI_IRQS_PER_SET;
+			virq = irq_find_mapping(port->msi_bottom_domain, hwirq);
+			generic_handle_irq(virq);
+		}
+	} while (true);
+}
+
 static void mtk_pcie_irq_handler(struct irq_desc *desc)
 {
 	struct mtk_pcie_port *port = irq_desc_get_handler_data(desc);
@@ -408,6 +677,14 @@  static void mtk_pcie_irq_handler(struct irq_desc *desc)
 		generic_handle_irq(virq);
 	}
 
+	irq_bit = PCIE_MSI_SHIFT;
+	for_each_set_bit_from(irq_bit, &status, PCIE_MSI_SET_NUM +
+			      PCIE_MSI_SHIFT) {
+		mtk_pcie_msi_handler(port, irq_bit - PCIE_MSI_SHIFT);
+
+		writel_relaxed(BIT(irq_bit), port->base + PCIE_INT_STATUS_REG);
+	}
+
 	chained_irq_exit(irqchip, desc);
 }