diff mbox

[v10] PCI: tango: Add MSI controller support

Message ID 7b7278f4-7639-62b3-8a35-e6f7f9afa998@sigmadesigns.com (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Headers show

Commit Message

Marc Gonzalez Aug. 22, 2017, 2:56 p.m. UTC
The MSI controller in Tango supports 256 message-signaled interrupts
and a single doorbell address.

Signed-off-by: Marc Gonzalez <marc_gonzalez@sigmadesigns.com>
---
Changes from v9 to v10
- Start from Bjorn's cleanup branch
- Clean up the MSI init and unused statements
- Based on top of v4.13-rc6

Hello Bjorn,

This patch is almost identical to the patch reviewed by Marc Zyngier
on June 14 (10 weeks ago). I'm not sure he could review this patch
again in time for 4.14 (given his work load these past few weeks).

The host bridge part landed in 4.13, but the driver is useless
without MSI support (legacy interrupts are not supported).
Can you take it for 4.14?

Regards.
---
 drivers/pci/host/pcie-tango.c | 191 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 189 insertions(+), 2 deletions(-)

Comments

Marc Zyngier Aug. 22, 2017, 4:29 p.m. UTC | #1
On 22/08/17 15:56, Marc Gonzalez wrote:
> The MSI controller in Tango supports 256 message-signaled interrupts
> and a single doorbell address.
> 
> Signed-off-by: Marc Gonzalez <marc_gonzalez@sigmadesigns.com>
> ---
> Changes from v9 to v10
> - Start from Bjorn's cleanup branch
> - Clean up the MSI init and unused statements
> - Based on top of v4.13-rc6
> 
> Hello Bjorn,
> 
> This patch is almost identical to the patch reviewed by Marc Zyngier
> on June 14 (10 weeks ago). I'm not sure he could review this patch
> again in time for 4.14 (given his work load these past few weeks).

Thanks for worrying about my workload.

> 
> The host bridge part landed in 4.13, but the driver is useless
> without MSI support (legacy interrupts are not supported).
> Can you take it for 4.14?
> 
> Regards.
> ---
>  drivers/pci/host/pcie-tango.c | 191 +++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 189 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/pci/host/pcie-tango.c b/drivers/pci/host/pcie-tango.c
> index 6bbb81f06a53..d672271ad719 100644
> --- a/drivers/pci/host/pcie-tango.c
> +++ b/drivers/pci/host/pcie-tango.c
> @@ -1,12 +1,170 @@
> +#include <linux/irqchip/chained_irq.h>
> +#include <linux/irqdomain.h>
>  #include <linux/pci-ecam.h>
>  #include <linux/delay.h>
> +#include <linux/msi.h>
>  #include <linux/of.h>
>  
> +#define MSI_MAX			256
> +
>  #define SMP8759_MUX		0x48
>  #define SMP8759_TEST_OUT	0x74
> +#define SMP8759_STATUS		0x80
> +#define SMP8759_ENABLE		0xa0
> +#define SMP8759_DOORBELL	0xa002e07c

Why is this hardcoded and not coming from the device-tree, just like any
other address property?

>  
>  struct tango_pcie {
> -	void __iomem *base;
> +	DECLARE_BITMAP(used_msi, MSI_MAX);
> +	spinlock_t		used_msi_lock;
> +	void __iomem		*base;
> +	struct irq_domain	*dom;
> +};
> +
> +static void tango_msi_isr(struct irq_desc *desc)
> +{
> +	struct irq_chip *chip = irq_desc_get_chip(desc);
> +	struct tango_pcie *pcie = irq_desc_get_handler_data(desc);
> +	unsigned long status, base, virq, idx, pos = 0;
> +
> +	chained_irq_enter(chip, desc);
> +	spin_lock(&pcie->used_msi_lock);
> +
> +	while ((pos = find_next_bit(pcie->used_msi, MSI_MAX, pos)) < MSI_MAX) {
> +		base = round_down(pos, 32);
> +		status = readl_relaxed(pcie->base + SMP8759_STATUS + base / 8);
> +		for_each_set_bit(idx, &status, 32) {
> +			virq = irq_find_mapping(pcie->dom, base + idx);
> +			generic_handle_irq(virq);
> +		}
> +		pos = base + 32;
> +	}
> +
> +	spin_unlock(&pcie->used_msi_lock);
> +	chained_irq_exit(chip, desc);
> +}
> +
> +static void tango_ack(struct irq_data *d)
> +{
> +	struct tango_pcie *pcie = d->chip_data;
> +	u32 offset = (d->hwirq / 32) * 4;
> +	u32 bit = BIT(d->hwirq % 32);
> +
> +	writel_relaxed(bit, pcie->base + SMP8759_STATUS + offset);
> +}
> +
> +static void update_msi_enable(struct irq_data *d, bool unmask)
> +{
> +	unsigned long flags;
> +	struct tango_pcie *pcie = d->chip_data;
> +	u32 offset = (d->hwirq / 32) * 4;
> +	u32 bit = BIT(d->hwirq % 32);
> +	u32 val;
> +
> +	spin_lock_irqsave(&pcie->used_msi_lock, flags);
> +	val = readl_relaxed(pcie->base + SMP8759_ENABLE + offset);
> +	val = unmask ? val | bit : val & ~bit;
> +	writel_relaxed(val, pcie->base + SMP8759_ENABLE + offset);
> +	spin_unlock_irqrestore(&pcie->used_msi_lock, flags);
> +}
> +
> +static void tango_mask(struct irq_data *d)
> +{
> +	update_msi_enable(d, false);
> +}
> +
> +static void tango_unmask(struct irq_data *d)
> +{
> +	update_msi_enable(d, true);
> +}
> +
> +static int tango_set_affinity(struct irq_data *d, const struct cpumask *mask,
> +			      bool force)
> +{
> +	return -EINVAL;
> +}
> +
> +static void tango_compose_msi_msg(struct irq_data *d, struct msi_msg *msg)
> +{
> +	msg->address_lo = lower_32_bits(SMP8759_DOORBELL);
> +	msg->address_hi = upper_32_bits(SMP8759_DOORBELL);
> +	msg->data = d->hwirq;
> +}
> +
> +static struct irq_chip tango_chip = {
> +	.irq_ack		= tango_ack,
> +	.irq_mask		= tango_mask,
> +	.irq_unmask		= tango_unmask,
> +	.irq_set_affinity	= tango_set_affinity,
> +	.irq_compose_msi_msg	= tango_compose_msi_msg,
> +};
> +
> +static void msi_ack(struct irq_data *d)
> +{
> +	irq_chip_ack_parent(d);
> +}
> +
> +static void msi_mask(struct irq_data *d)
> +{
> +	pci_msi_mask_irq(d);
> +	irq_chip_mask_parent(d);
> +}
> +
> +static void msi_unmask(struct irq_data *d)
> +{
> +	pci_msi_unmask_irq(d);
> +	irq_chip_unmask_parent(d);
> +}
> +
> +static struct irq_chip msi_chip = {
> +	.name = "MSI",
> +	.irq_ack = msi_ack,
> +	.irq_mask = msi_mask,
> +	.irq_unmask = msi_unmask,
> +};
> +
> +static struct msi_domain_info msi_dom_info = {
> +	.flags	= MSI_FLAG_PCI_MSIX
> +		| MSI_FLAG_USE_DEF_DOM_OPS
> +		| MSI_FLAG_USE_DEF_CHIP_OPS,
> +	.chip	= &msi_chip,
> +};
> +
> +static int tango_irq_domain_alloc(struct irq_domain *dom, unsigned int virq,
> +				  unsigned int nr_irqs, void *args)
> +{
> +	struct tango_pcie *pcie = dom->host_data;
> +	unsigned long flags;
> +	int pos;
> +
> +	spin_lock_irqsave(&pcie->used_msi_lock, flags);
> +	pos = find_first_zero_bit(pcie->used_msi, MSI_MAX);
> +	if (pos >= MSI_MAX) {
> +		spin_unlock_irqrestore(&pcie->used_msi_lock, flags);
> +		return -ENOSPC;
> +	}
> +	__set_bit(pos, pcie->used_msi);
> +	spin_unlock_irqrestore(&pcie->used_msi_lock, flags);
> +	irq_domain_set_info(dom, virq, pos, &tango_chip,
> +			pcie, handle_edge_irq, NULL, NULL);
> +
> +	return 0;
> +}
> +
> +static void tango_irq_domain_free(struct irq_domain *dom, unsigned int virq,
> +				  unsigned int nr_irqs)
> +{
> +	unsigned long flags;
> +	struct irq_data *d = irq_domain_get_irq_data(dom, virq);
> +	struct tango_pcie *pcie = d->chip_data;
> +
> +	spin_lock_irqsave(&pcie->used_msi_lock, flags);
> +	__clear_bit(d->hwirq, pcie->used_msi);
> +	spin_unlock_irqrestore(&pcie->used_msi_lock, flags);
> +}
> +
> +static const struct irq_domain_ops dom_ops = {
> +	.alloc	= tango_irq_domain_alloc,
> +	.free	= tango_irq_domain_free,
>  };
>  
>  static int smp8759_config_read(struct pci_bus *bus, unsigned int devfn,
> @@ -76,7 +234,9 @@ static int tango_pcie_probe(struct platform_device *pdev)
>  	struct device *dev = &pdev->dev;
>  	struct tango_pcie *pcie;
>  	struct resource *res;
> -	int ret;
> +	struct irq_domain *msi_dom, *irq_dom;
> +	struct fwnode_handle *fwnode = of_node_to_fwnode(dev->of_node);
> +	int ret, reg, virq;
>  
>  	dev_warn(dev, "simultaneous PCI config and MMIO accesses may cause data corruption\n");
>  	add_taint(TAINT_CRAP, LOCKDEP_STILL_OK);
> @@ -95,6 +255,33 @@ static int tango_pcie_probe(struct platform_device *pdev)
>  	if (!tango_pcie_link_up(pcie))
>  		return -ENODEV;
>  
> +	for (reg = 0; reg < MSI_MAX / 8; reg += 4)
> +		writel_relaxed(0, pcie->base + SMP8759_ENABLE + reg);
> +
> +	virq = platform_get_irq(pdev, 1);
> +	if (virq <= 0) {
> +		dev_err(dev, "Failed to map IRQ\n");
> +		return -ENXIO;
> +	}
> +
> +	irq_dom = irq_domain_create_linear(fwnode, MSI_MAX, &dom_ops, pcie);
> +	if (!irq_dom) {
> +		dev_err(dev, "Failed to create IRQ domain\n");
> +		return -ENOMEM;
> +	}
> +
> +	msi_dom = pci_msi_create_irq_domain(fwnode, &msi_dom_info, irq_dom);
> +	if (!msi_dom) {
> +		dev_err(dev, "Failed to create MSI domain\n");
> +		irq_domain_remove(irq_dom);
> +		return -ENOMEM;
> +	}
> +
> +	pcie->dom = irq_dom;
> +	spin_lock_init(&pcie->used_msi_lock);
> +
> +	irq_set_chained_handler_and_data(virq, tango_msi_isr, pcie);
> +
>  	return pci_host_common_probe(pdev, &smp8759_ecam_ops);
>  }
>  
> 

The remark above notwithstanding:

Acked-by: Marc Zyngier <marc.zyngier@arm.com>

	M.
Marc Gonzalez Aug. 22, 2017, 6:02 p.m. UTC | #2
On 22/08/2017 18:29, Marc Zyngier wrote:

> On 22/08/17 15:56, Marc Gonzalez wrote:
>
>>  #define SMP8759_MUX		0x48
>>  #define SMP8759_TEST_OUT	0x74
>> +#define SMP8759_STATUS		0x80
>> +#define SMP8759_ENABLE		0xa0
>> +#define SMP8759_DOORBELL	0xa002e07c
> 
> Why is this hardcoded and not coming from the device-tree, just like any
> other address property?

Since this bus address is software-configurable, I didn't think
it belonged in the DT. Also, I didn't see anything similar in
other binding docs, especially

Documentation/devicetree/bindings/interrupt-controller/msi.txt

Regards.
Marc Zyngier Aug. 22, 2017, 8:03 p.m. UTC | #3
On Tue, Aug 22 2017 at  8:02:18 pm BST, Marc Gonzalez <marc_gonzalez@sigmadesigns.com> wrote:
> On 22/08/2017 18:29, Marc Zyngier wrote:
>
>> On 22/08/17 15:56, Marc Gonzalez wrote:
>>
>>>  #define SMP8759_MUX		0x48
>>>  #define SMP8759_TEST_OUT	0x74
>>> +#define SMP8759_STATUS		0x80
>>> +#define SMP8759_ENABLE		0xa0
>>> +#define SMP8759_DOORBELL	0xa002e07c
>> 
>> Why is this hardcoded and not coming from the device-tree, just like any
>> other address property?
>
> Since this bus address is software-configurable, I didn't think
> it belonged in the DT. Also, I didn't see anything similar in
> other binding docs, especially
>
> Documentation/devicetree/bindings/interrupt-controller/msi.txt

If that's software configurable, how on Earth did you pick the address?
How do you ensure that it doesn't conflict with DMA? How is it
configured into the RC?

	M.
Marc Gonzalez Aug. 23, 2017, 12:59 p.m. UTC | #4
On 22/08/2017 16:56, Marc Gonzalez wrote:

> @@ -76,7 +234,9 @@ static int tango_pcie_probe(struct platform_device *pdev)
>  	struct device *dev = &pdev->dev;
>  	struct tango_pcie *pcie;
>  	struct resource *res;
> -	int ret;
> +	struct irq_domain *msi_dom, *irq_dom;
> +	struct fwnode_handle *fwnode = of_node_to_fwnode(dev->of_node);
> +	int ret, reg, virq;
>  
>  	dev_warn(dev, "simultaneous PCI config and MMIO accesses may cause data corruption\n");
>  	add_taint(TAINT_CRAP, LOCKDEP_STILL_OK);

  CC      drivers/pci/host/pcie-tango.o
drivers/pci/host/pcie-tango.c: In function 'tango_pcie_probe':
drivers/pci/host/pcie-tango.c:257:6: warning: unused variable 'ret' [-Wunused-variable]
  int ret, reg, virq;
      ^~~

Hmmm, dunno how I managed to miss that...
Is the kbuild test robot enjoying a well-deserved vacation?

Bjorn, if/when you take the patch, can you first apply this fixup:

@@ -254,7 +254,7 @@ static int tango_pcie_probe(struct platform_device *pdev)
        struct resource *res;
        struct irq_domain *msi_dom, *irq_dom;
        struct fwnode_handle *fwnode = of_node_to_fwnode(dev->of_node);
-       int ret, reg, virq;
+       int reg, virq;
 
        dev_warn(dev, "simultaneous PCI config and MMIO accesses may cause data corruption\n");
        add_taint(TAINT_CRAP, LOCKDEP_STILL_OK);


Regards.
Bjorn Helgaas Aug. 24, 2017, 5:01 p.m. UTC | #5
On Wed, Aug 23, 2017 at 02:59:42PM +0200, Marc Gonzalez wrote:
> On 22/08/2017 16:56, Marc Gonzalez wrote:
> 
> > @@ -76,7 +234,9 @@ static int tango_pcie_probe(struct platform_device *pdev)
> >  	struct device *dev = &pdev->dev;
> >  	struct tango_pcie *pcie;
> >  	struct resource *res;
> > -	int ret;
> > +	struct irq_domain *msi_dom, *irq_dom;
> > +	struct fwnode_handle *fwnode = of_node_to_fwnode(dev->of_node);
> > +	int ret, reg, virq;
> >  
> >  	dev_warn(dev, "simultaneous PCI config and MMIO accesses may cause data corruption\n");
> >  	add_taint(TAINT_CRAP, LOCKDEP_STILL_OK);
> 
>   CC      drivers/pci/host/pcie-tango.o
> drivers/pci/host/pcie-tango.c: In function 'tango_pcie_probe':
> drivers/pci/host/pcie-tango.c:257:6: warning: unused variable 'ret' [-Wunused-variable]
>   int ret, reg, virq;
>       ^~~
> 
> Hmmm, dunno how I managed to miss that...
> Is the kbuild test robot enjoying a well-deserved vacation?
> 
> Bjorn, if/when you take the patch, can you first apply this fixup:

Sure, no problem.

> @@ -254,7 +254,7 @@ static int tango_pcie_probe(struct platform_device *pdev)
>         struct resource *res;
>         struct irq_domain *msi_dom, *irq_dom;
>         struct fwnode_handle *fwnode = of_node_to_fwnode(dev->of_node);
> -       int ret, reg, virq;
> +       int reg, virq;
>  
>         dev_warn(dev, "simultaneous PCI config and MMIO accesses may cause data corruption\n");
>         add_taint(TAINT_CRAP, LOCKDEP_STILL_OK);
> 
> 
> Regards.
>
Bjorn Helgaas Aug. 24, 2017, 5:04 p.m. UTC | #6
On Tue, Aug 22, 2017 at 09:03:41PM +0100, Marc Zyngier wrote:
> On Tue, Aug 22 2017 at  8:02:18 pm BST, Marc Gonzalez <marc_gonzalez@sigmadesigns.com> wrote:
> > On 22/08/2017 18:29, Marc Zyngier wrote:
> >
> >> On 22/08/17 15:56, Marc Gonzalez wrote:
> >>
> >>>  #define SMP8759_MUX		0x48
> >>>  #define SMP8759_TEST_OUT	0x74
> >>> +#define SMP8759_STATUS		0x80
> >>> +#define SMP8759_ENABLE		0xa0
> >>> +#define SMP8759_DOORBELL	0xa002e07c
> >> 
> >> Why is this hardcoded and not coming from the device-tree, just like any
> >> other address property?
> >
> > Since this bus address is software-configurable, I didn't think
> > it belonged in the DT. Also, I didn't see anything similar in
> > other binding docs, especially
> >
> > Documentation/devicetree/bindings/interrupt-controller/msi.txt
> 
> If that's software configurable, how on Earth did you pick the address?
> How do you ensure that it doesn't conflict with DMA? How is it
> configured into the RC?

But we *do* need to resolve this.  This does seem like an address that
shouldn't be hard-coded into the driver.  Since this driver is
programming the address into an MSI message, but not into the receiver
of that message, there's a coordination issue between this driver and
whatever other software does that receiver configuration.

Bjorn
Marc Gonzalez Aug. 24, 2017, 5:51 p.m. UTC | #7
On 24/08/2017 19:04, Bjorn Helgaas wrote:
> On Tue, Aug 22, 2017 at 09:03:41PM +0100, Marc Zyngier wrote:
>> Marc Gonzalez wrote:
>>> On 22/08/2017 18:29, Marc Zyngier wrote:
>>>> On 22/08/17 15:56, Marc Gonzalez wrote:
>>>>
>>>>>  #define SMP8759_MUX		0x48
>>>>>  #define SMP8759_TEST_OUT	0x74
>>>>> +#define SMP8759_STATUS		0x80
>>>>> +#define SMP8759_ENABLE		0xa0
>>>>> +#define SMP8759_DOORBELL	0xa002e07c
>>>>
>>>> Why is this hardcoded and not coming from the device-tree, just like any
>>>> other address property?
>>>
>>> Since this bus address is software-configurable, I didn't think
>>> it belonged in the DT. Also, I didn't see anything similar in
>>> other binding docs, especially
>>>
>>> Documentation/devicetree/bindings/interrupt-controller/msi.txt
>>
>> If that's software configurable, how on Earth did you pick the address?
>> How do you ensure that it doesn't conflict with DMA? How is it
>> configured into the RC?
> 
> But we *do* need to resolve this.  This does seem like an address that
> shouldn't be hard-coded into the driver.  Since this driver is
> programming the address into an MSI message, but not into the receiver
> of that message, there's a coordination issue between this driver and
> whatever other software does that receiver configuration.

OK. I'll move the doorbell address to the DT for v11.

What property should be used for this address?

sigma,doorbell ?

Or maybe I can put it in reg, since I have a 1:1 mapping
between bus and cpu addresses?

git grep -i doorbell arch/arm/boot/dts/ arch/arm64/boot/dts/
returns nothing.

Regards.
Ard Biesheuvel Aug. 24, 2017, 6:35 p.m. UTC | #8
On 24 August 2017 at 18:51, Marc Gonzalez
<marc_gonzalez@sigmadesigns.com> wrote:
> On 24/08/2017 19:04, Bjorn Helgaas wrote:
>> On Tue, Aug 22, 2017 at 09:03:41PM +0100, Marc Zyngier wrote:
>>> Marc Gonzalez wrote:
>>>> On 22/08/2017 18:29, Marc Zyngier wrote:
>>>>> On 22/08/17 15:56, Marc Gonzalez wrote:
>>>>>
>>>>>>  #define SMP8759_MUX              0x48
>>>>>>  #define SMP8759_TEST_OUT 0x74
>>>>>> +#define SMP8759_STATUS           0x80
>>>>>> +#define SMP8759_ENABLE           0xa0
>>>>>> +#define SMP8759_DOORBELL 0xa002e07c
>>>>>
>>>>> Why is this hardcoded and not coming from the device-tree, just like any
>>>>> other address property?
>>>>
>>>> Since this bus address is software-configurable, I didn't think
>>>> it belonged in the DT. Also, I didn't see anything similar in
>>>> other binding docs, especially
>>>>
>>>> Documentation/devicetree/bindings/interrupt-controller/msi.txt
>>>
>>> If that's software configurable, how on Earth did you pick the address?
>>> How do you ensure that it doesn't conflict with DMA? How is it
>>> configured into the RC?
>>
>> But we *do* need to resolve this.  This does seem like an address that
>> shouldn't be hard-coded into the driver.  Since this driver is
>> programming the address into an MSI message, but not into the receiver
>> of that message, there's a coordination issue between this driver and
>> whatever other software does that receiver configuration.
>
> OK. I'll move the doorbell address to the DT for v11.
>
> What property should be used for this address?
>
> sigma,doorbell ?
>
> Or maybe I can put it in reg, since I have a 1:1 mapping
> between bus and cpu addresses?
>
> git grep -i doorbell arch/arm/boot/dts/ arch/arm64/boot/dts/
> returns nothing.
>

You haven't answered the question yet: you stated that the doorbell
address is software configurable, yet your code does not seem to
configure it. It only returns the doorbell address so that it gets
communicated to the downstream devices.

So how does the RC know which address is special, so it can trigger on
inbound writes hitting that address and assert the SPI ?
Mason Aug. 24, 2017, 8:53 p.m. UTC | #9
On 24/08/2017 20:35, Ard Biesheuvel wrote:
> On 24 August 2017 at 18:51, Marc Gonzalez wrote:
>> On 24/08/2017 19:04, Bjorn Helgaas wrote:
>>> On Tue, Aug 22, 2017 Marc Zyngier wrote:
>>>> Marc Gonzalez wrote:
>>>>> On 22/08/2017 18:29, Marc Zyngier wrote:
>>>>>> On 22/08/17 15:56, Marc Gonzalez wrote:
>>>>>>
>>>>>>>  #define SMP8759_MUX              0x48
>>>>>>>  #define SMP8759_TEST_OUT 0x74
>>>>>>> +#define SMP8759_STATUS           0x80
>>>>>>> +#define SMP8759_ENABLE           0xa0
>>>>>>> +#define SMP8759_DOORBELL 0xa002e07c
>>>>>>
>>>>>> Why is this hardcoded and not coming from the device-tree, just like any
>>>>>> other address property?
>>>>>
>>>>> Since this bus address is software-configurable, I didn't think
>>>>> it belonged in the DT. Also, I didn't see anything similar in
>>>>> other binding docs, especially
>>>>>
>>>>> Documentation/devicetree/bindings/interrupt-controller/msi.txt
>>>>
>>>> If that's software configurable, how on Earth did you pick the address?
>>>> How do you ensure that it doesn't conflict with DMA? How is it
>>>> configured into the RC?
>>>
>>> But we *do* need to resolve this.  This does seem like an address that
>>> shouldn't be hard-coded into the driver.  Since this driver is
>>> programming the address into an MSI message, but not into the receiver
>>> of that message, there's a coordination issue between this driver and
>>> whatever other software does that receiver configuration.
>>
>> OK. I'll move the doorbell address to the DT for v11.
>>
>> What property should be used for this address?
>>
>> sigma,doorbell ?
>>
>> Or maybe I can put it in reg, since I have a 1:1 mapping
>> between bus and cpu addresses?
>>
>> git grep -i doorbell arch/arm/boot/dts/ arch/arm64/boot/dts/
>> returns nothing.
> 
> You haven't answered the question yet: you stated that the doorbell
> address is software configurable, yet your code does not seem to
> configure it. It only returns the doorbell address so that it gets
> communicated to the downstream devices.
> 
> So how does the RC know which address is special, so it can trigger on
> inbound writes hitting that address and assert the SPI ?

The CPU address of the MSI doorbell address is 0x2e07c
i.e. within the reg space of the PCIe controller block.

As I discussed back in March, the RC implements an odd
bus-to-system mapping.

RC BAR0 defines a window in PCI address space (max 1GB).
Accesses outside this window are silently ignored.
The window is divided into 8 "regions" and there are 8
registers defining the offset into CPU space.

In pseudo code, assuming pci_address is within the
window defined by BAR0:

cpu_address map_bus_to_system(pci_address)
{
    temp = pci_address - BAR0.base
    region = temp / region_size
    offset = temp % region_size
    cpu_address = region_reg[region] + offset
    return cpu_address
}

The current setup is:

DRAM at 0x80000000-0xa0000000
BAR0.base = 0x80000000
REGION[0] = 0x80000000
REGION[1] = 0x88000000
REGION[2] = 0x90000000
REGION[3] = 0x98000000
REGION[4] = 0x0

(This map means 1:1 identity for DRAM addresses.)

Thus when a device writes to 0xa002e07c (region 4)
the write is forwarded to 0x2e07c.

Regards.
Marc Zyngier Aug. 25, 2017, 7:54 a.m. UTC | #10
On Thu, Aug 24 2017 at 10:53:16 pm BST, Mason <slash.tmp@free.fr> wrote:
> On 24/08/2017 20:35, Ard Biesheuvel wrote:
>> On 24 August 2017 at 18:51, Marc Gonzalez wrote:
>>> On 24/08/2017 19:04, Bjorn Helgaas wrote:
>>>> On Tue, Aug 22, 2017 Marc Zyngier wrote:
>>>>> Marc Gonzalez wrote:
>>>>>> On 22/08/2017 18:29, Marc Zyngier wrote:
>>>>>>> On 22/08/17 15:56, Marc Gonzalez wrote:
>>>>>>>
>>>>>>>>  #define SMP8759_MUX              0x48
>>>>>>>>  #define SMP8759_TEST_OUT 0x74
>>>>>>>> +#define SMP8759_STATUS           0x80
>>>>>>>> +#define SMP8759_ENABLE           0xa0
>>>>>>>> +#define SMP8759_DOORBELL 0xa002e07c
>>>>>>>
>>>>>>> Why is this hardcoded and not coming from the device-tree, just like any
>>>>>>> other address property?
>>>>>>
>>>>>> Since this bus address is software-configurable, I didn't think
>>>>>> it belonged in the DT. Also, I didn't see anything similar in
>>>>>> other binding docs, especially
>>>>>>
>>>>>> Documentation/devicetree/bindings/interrupt-controller/msi.txt
>>>>>
>>>>> If that's software configurable, how on Earth did you pick the address?
>>>>> How do you ensure that it doesn't conflict with DMA? How is it
>>>>> configured into the RC?
>>>>
>>>> But we *do* need to resolve this.  This does seem like an address that
>>>> shouldn't be hard-coded into the driver.  Since this driver is
>>>> programming the address into an MSI message, but not into the receiver
>>>> of that message, there's a coordination issue between this driver and
>>>> whatever other software does that receiver configuration.
>>>
>>> OK. I'll move the doorbell address to the DT for v11.
>>>
>>> What property should be used for this address?
>>>
>>> sigma,doorbell ?
>>>
>>> Or maybe I can put it in reg, since I have a 1:1 mapping
>>> between bus and cpu addresses?
>>>
>>> git grep -i doorbell arch/arm/boot/dts/ arch/arm64/boot/dts/
>>> returns nothing.
>> 
>> You haven't answered the question yet: you stated that the doorbell
>> address is software configurable, yet your code does not seem to
>> configure it. It only returns the doorbell address so that it gets
>> communicated to the downstream devices.
>> 
>> So how does the RC know which address is special, so it can trigger on
>> inbound writes hitting that address and assert the SPI ?
>
> The CPU address of the MSI doorbell address is 0x2e07c
> i.e. within the reg space of the PCIe controller block.

Which you describe in DT already, right? So why aren't you using an
offset in this region as your MSI doorbell (potentially applying an
offset, see below)?

>
> As I discussed back in March, the RC implements an odd
> bus-to-system mapping.
>
> RC BAR0 defines a window in PCI address space (max 1GB).
> Accesses outside this window are silently ignored.
> The window is divided into 8 "regions" and there are 8
> registers defining the offset into CPU space.
>
> In pseudo code, assuming pci_address is within the
> window defined by BAR0:
>
> cpu_address map_bus_to_system(pci_address)
> {
>     temp = pci_address - BAR0.base
>     region = temp / region_size
>     offset = temp % region_size
>     cpu_address = region_reg[region] + offset
>     return cpu_address
> }
>
> The current setup is:
>
> DRAM at 0x80000000-0xa0000000
> BAR0.base = 0x80000000
> REGION[0] = 0x80000000
> REGION[1] = 0x88000000
> REGION[2] = 0x90000000
> REGION[3] = 0x98000000
> REGION[4] = 0x0
>
> (This map means 1:1 identity for DRAM addresses.)
>
> Thus when a device writes to 0xa002e07c (region 4)
> the write is forwarded to 0x2e07c.

But how do you find out about the 0xa0000000 offset? You must make sure
that the provided address is outside of RAM, should you end up on a
system with more than 1GB of RAM.

	M.
Mason Aug. 25, 2017, 8:56 a.m. UTC | #11
On 25/08/2017 09:54, Marc Zyngier wrote:
> On Thu, Aug 24 2017 at 10:53:16 pm BST, Mason <slash.tmp@free.fr> wrote:
>> On 24/08/2017 20:35, Ard Biesheuvel wrote:
>>> On 24 August 2017 at 18:51, Marc Gonzalez wrote:
>>>> On 24/08/2017 19:04, Bjorn Helgaas wrote:
>>>>> On Tue, Aug 22, 2017 Marc Zyngier wrote:
>>>>>> Marc Gonzalez wrote:
>>>>>>> On 22/08/2017 18:29, Marc Zyngier wrote:
>>>>>>>> On 22/08/17 15:56, Marc Gonzalez wrote:
>>>>>>>>
>>>>>>>>>  #define SMP8759_MUX              0x48
>>>>>>>>>  #define SMP8759_TEST_OUT 0x74
>>>>>>>>> +#define SMP8759_STATUS           0x80
>>>>>>>>> +#define SMP8759_ENABLE           0xa0
>>>>>>>>> +#define SMP8759_DOORBELL 0xa002e07c
>>>>>>>>
>>>>>>>> Why is this hardcoded and not coming from the device-tree, just like any
>>>>>>>> other address property?
>>>>>>>
>>>>>>> Since this bus address is software-configurable, I didn't think
>>>>>>> it belonged in the DT. Also, I didn't see anything similar in
>>>>>>> other binding docs, especially
>>>>>>>
>>>>>>> Documentation/devicetree/bindings/interrupt-controller/msi.txt
>>>>>>
>>>>>> If that's software configurable, how on Earth did you pick the address?
>>>>>> How do you ensure that it doesn't conflict with DMA? How is it
>>>>>> configured into the RC?
>>>>>
>>>>> But we *do* need to resolve this.  This does seem like an address that
>>>>> shouldn't be hard-coded into the driver.  Since this driver is
>>>>> programming the address into an MSI message, but not into the receiver
>>>>> of that message, there's a coordination issue between this driver and
>>>>> whatever other software does that receiver configuration.
>>>>
>>>> OK. I'll move the doorbell address to the DT for v11.
>>>>
>>>> What property should be used for this address?
>>>>
>>>> sigma,doorbell ?
>>>>
>>>> Or maybe I can put it in reg, since I have a 1:1 mapping
>>>> between bus and cpu addresses?
>>>>
>>>> git grep -i doorbell arch/arm/boot/dts/ arch/arm64/boot/dts/
>>>> returns nothing.
>>>
>>> You haven't answered the question yet: you stated that the doorbell
>>> address is software configurable, yet your code does not seem to
>>> configure it. It only returns the doorbell address so that it gets
>>> communicated to the downstream devices.
>>>
>>> So how does the RC know which address is special, so it can trigger on
>>> inbound writes hitting that address and assert the SPI ?
>>
>> The CPU address of the MSI doorbell address is 0x2e07c
>> i.e. within the reg space of the PCIe controller block.
> 
> Which you describe in DT already, right? So why aren't you using an
> offset in this region as your MSI doorbell (potentially applying an
> offset, see below)?

Yes, the controller is described in DT:

	pcie@2e000 {
		compatible = "sigma,smp8759-pcie";
		reg = <0x50000000 0x400000>, <0x2e000 0x100>;

IIUC, you're saying I don't need the doorbell address
explicitly in the DT, because I can compute:

0x2e000 (from the DT) + 0x7c (offset within the block)

OK, that sounds right.

Then there is the matter of the region offset,
i.e. 0xa0000000 in my current code.

It might also be worth keeping in mind that there is
a second revision of the PCIe controller that handles
the doorbell differently. In rev2, I just pick an
arbitrary address within the window, program that
bus address into the controller, and the controller
knows to forward that single address to the right
place. See "[RFC PATCH v0.2] PCI: Add support for
tango PCIe host bridge" for an example of this.

Typically, I pick the first address of the window,
BAR0.base, since I assume no device will ever need
to read/write the first word of RAM.


>> As I discussed back in March, the RC implements an odd
>> bus-to-system mapping.
>>
>> RC BAR0 defines a window in PCI address space (max 1GB).
>> Accesses outside this window are silently ignored.
>> The window is divided into 8 "regions" and there are 8
>> registers defining the offset into CPU space.
>>
>> In pseudo code, assuming pci_address is within the
>> window defined by BAR0:
>>
>> cpu_address map_bus_to_system(pci_address)
>> {
>>     temp = pci_address - BAR0.base
>>     region = temp / region_size
>>     offset = temp % region_size
>>     cpu_address = region_reg[region] + offset
>>     return cpu_address
>> }
>>
>> The current setup is:
>>
>> DRAM at 0x80000000-0xa0000000
>> BAR0.base = 0x80000000
>> REGION[0] = 0x80000000
>> REGION[1] = 0x88000000
>> REGION[2] = 0x90000000
>> REGION[3] = 0x98000000
>> REGION[4] = 0x0
>>
>> (This map means 1:1 identity for DRAM addresses.)
>>
>> Thus when a device writes to 0xa002e07c (region 4)
>> the write is forwarded to 0x2e07c.
> 
> But how do you find out about the 0xa0000000 offset? You must make sure
> that the provided address is outside of RAM, should you end up on a
> system with more than 1GB of RAM.

You're right, I've swept this issue under the rug so far.

The boards typically come with either
- two 512MB DIMMs
- two 1GB DIMMs
(there may be other setups I'm not aware of, e.g. with
a single memory module).

In the DT for my dev board, I describe *all* of the RAM.

	memory@80000000 {
		device_type = "memory";
		reg = <0x80000000 0x80000000>; /* 2 GB */
	};

But actually, Linux is only given a fraction of this memory
to manage, as some of it is for other processors
and DSPs, and a large part is for video decoder buffers.

In the end, Linux manages, typically
128MB, 256MB, 512MB, 2x128MB, or 2x256MB
(there may be other setups I'm not aware of)
And the actual config is passed to Linux through
a mem= command-line directive.

It is not clear to me, as I discussed with Ard, how the
Linux driver is supposed to make this all work.

Maybe I can have some platform code that walks the
different RAM areas available to Linux, and sets up
the appropriate physical-to-dma mappings?

Regards.
Mason Aug. 25, 2017, 3:01 p.m. UTC | #12
On 25/08/2017 09:54, Marc Zyngier wrote:
> On Thu, Aug 24 2017 at 10:53:16 pm BST, Mason <slash.tmp@free.fr> wrote:
>> On 24/08/2017 20:35, Ard Biesheuvel wrote:
>>> On 24 August 2017 at 18:51, Marc Gonzalez wrote:
>>>> On 24/08/2017 19:04, Bjorn Helgaas wrote:
>>>>> On Tue, Aug 22, 2017 Marc Zyngier wrote:
>>>>>> Marc Gonzalez wrote:
>>>>>>> On 22/08/2017 18:29, Marc Zyngier wrote:
>>>>>>>> On 22/08/17 15:56, Marc Gonzalez wrote:
>>>>>>>>
>>>>>>>>>  #define SMP8759_MUX              0x48
>>>>>>>>>  #define SMP8759_TEST_OUT 0x74
>>>>>>>>> +#define SMP8759_STATUS           0x80
>>>>>>>>> +#define SMP8759_ENABLE           0xa0
>>>>>>>>> +#define SMP8759_DOORBELL 0xa002e07c
>>>>>>>>
>>>>>>>> Why is this hardcoded and not coming from the device-tree, just like any
>>>>>>>> other address property?
>>>>>>>
>>>>>>> Since this bus address is software-configurable, I didn't think
>>>>>>> it belonged in the DT. Also, I didn't see anything similar in
>>>>>>> other binding docs, especially
>>>>>>>
>>>>>>> Documentation/devicetree/bindings/interrupt-controller/msi.txt
>>>>>>
>>>>>> If that's software configurable, how on Earth did you pick the address?
>>>>>> How do you ensure that it doesn't conflict with DMA? How is it
>>>>>> configured into the RC?
>>>>>
>>>>> But we *do* need to resolve this.  This does seem like an address that
>>>>> shouldn't be hard-coded into the driver.  Since this driver is
>>>>> programming the address into an MSI message, but not into the receiver
>>>>> of that message, there's a coordination issue between this driver and
>>>>> whatever other software does that receiver configuration.
>>>>
>>>> OK. I'll move the doorbell address to the DT for v11.
>>>>
>>>> What property should be used for this address?
>>>>
>>>> sigma,doorbell ?
>>>>
>>>> Or maybe I can put it in reg, since I have a 1:1 mapping
>>>> between bus and cpu addresses?
>>>>
>>>> git grep -i doorbell arch/arm/boot/dts/ arch/arm64/boot/dts/
>>>> returns nothing.
>>>
>>> You haven't answered the question yet: you stated that the doorbell
>>> address is software configurable, yet your code does not seem to
>>> configure it. It only returns the doorbell address so that it gets
>>> communicated to the downstream devices.
>>>
>>> So how does the RC know which address is special, so it can trigger on
>>> inbound writes hitting that address and assert the SPI ?
>>
>> The CPU address of the MSI doorbell address is 0x2e07c
>> i.e. within the reg space of the PCIe controller block.
> 
> Which you describe in DT already, right? So why aren't you using an
> offset in this region as your MSI doorbell (potentially applying an
> offset, see below)?
> 
>
>> As I discussed back in March, the RC implements an odd
>> bus-to-system mapping.
>>
>> RC BAR0 defines a window in PCI address space (max 1GB).
>> Accesses outside this window are silently ignored.
>> The window is divided into 8 "regions" and there are 8
>> registers defining the offset into CPU space.
>>
>> In pseudo code, assuming pci_address is within the
>> window defined by BAR0:
>>
>> cpu_address map_bus_to_system(pci_address)
>> {
>>     temp = pci_address - BAR0.base
>>     region = temp / region_size
>>     offset = temp % region_size
>>     cpu_address = region_reg[region] + offset
>>     return cpu_address
>> }
>>
>> The current setup is:
>>
>> DRAM at 0x80000000-0xa0000000
>> BAR0.base = 0x80000000
>> REGION[0] = 0x80000000
>> REGION[1] = 0x88000000
>> REGION[2] = 0x90000000
>> REGION[3] = 0x98000000
>> REGION[4] = 0x0
>>
>> (This map means 1:1 identity for DRAM addresses.)
>>
>> Thus when a device writes to 0xa002e07c (region 4)
>> the write is forwarded to 0x2e07c.
> 
> But how do you find out about the 0xa0000000 offset? You must make sure
> that the provided address is outside of RAM, should you end-up on a
> system with more than 1GB of RAM.

Robin wrote a prophetic post back in March:
http://lists.infradead.org/pipermail/linux-arm-kernel/2017-March/492965.html

> The appropriate DT property would be "dma-ranges", i.e.
> 
> pci@... {
> 	...
> 	dma-ranges = <(PCI bus address) (CPU phys address) (size)>;
> }

The dma-ranges property seems to be exactly what I'm looking for:

Restrict DMA to the first X MB of RAM (use a bounce buffer
for other physical addresses).

I added the following property to my PCIe node

  dma-ranges = <0x0 0x80000000 0x80000000 0x20000000>;

with the intent to create a 1:1 mapping for [0x80000000, 0xa0000000[

But it does not work. Arg!

My PCIe controller driver seems to be correctly calling of_dma_get_range:

[    0.520469] [<c03d85e8>] (of_dma_get_range) from [<c03d5ad8>] (of_dma_configure+0x48/0x234)
[    0.520483] [<c03d5ad8>] (of_dma_configure) from [<c02fa154>] (pci_device_add+0xac/0x350)
[    0.520493] [<c02fa154>] (pci_device_add) from [<c02fa488>] (pci_scan_single_device+0x90/0xb0)
[    0.520501] [<c02fa488>] (pci_scan_single_device) from [<c02fa500>] (pci_scan_slot+0x58/0x100)
[    0.520510] [<c02fa500>] (pci_scan_slot) from [<c02fb418>] (pci_scan_child_bus+0x20/0xf8)
[    0.520519] [<c02fb418>] (pci_scan_child_bus) from [<c02fb6e8>] (pci_scan_root_bus_msi+0xcc/0xd8)
[    0.520527] [<c02fb6e8>] (pci_scan_root_bus_msi) from [<c02fb70c>] (pci_scan_root_bus+0x18/0x20)
[    0.520537] [<c02fb70c>] (pci_scan_root_bus) from [<c0310544>] (pci_host_common_probe+0xc8/0x314)
[    0.520546] [<c0310544>] (pci_host_common_probe) from [<c0310ce8>] (tango_pcie_probe+0x148/0x350)
[    0.520557] [<c0310ce8>] (tango_pcie_probe) from [<c034d398>] (platform_drv_probe+0x34/0x6c)

of_dma_get_range() is called on the pcie node (which is expected)
but after parsing n_addr_cells and n_size_cells in the while loop,
the code jumps to the parent node ("soc")... while my property is
attached to the pcie node...

[    0.507754] of_dma_get_range: node=dfbf74cc np=dfbf74cc name=/soc/pcie@2e000 name2=/soc/pcie@2e000
...
[    0.509162] __of_find_property: node=soc find=#address-cells prop=compatible
[    0.509168] __of_find_property: node=soc find=#address-cells prop=interrupt-parent
[    0.509173] __of_find_property: node=soc find=#address-cells prop=#address-cells
[    0.509178] __of_find_property: node=soc find=#size-cells prop=compatible
[    0.509182] __of_find_property: node=soc find=#size-cells prop=interrupt-parent
[    0.509186] __of_find_property: node=soc find=#size-cells prop=#address-cells
[    0.509190] __of_find_property: node=soc find=#size-cells prop=#size-cells
[    0.509195] __of_find_property: node=soc find=dma-ranges prop=compatible
[    0.509199] __of_find_property: node=soc find=dma-ranges prop=interrupt-parent
[    0.509203] __of_find_property: node=soc find=dma-ranges prop=#address-cells
[    0.509207] __of_find_property: node=soc find=dma-ranges prop=#size-cells
[    0.509211] __of_find_property: node=soc find=dma-ranges prop=ranges
[    0.509215] __of_find_property: node=soc find=dma-ranges prop=name
[    0.509219] dma-ranges=  (null)

http://elixir.free-electrons.com/linux/latest/source/drivers/of/address.c#L838

What am I missing?

Regards.
Robin Murphy Aug. 25, 2017, 3:25 p.m. UTC | #13
On 25/08/17 16:01, Mason wrote:
> On 25/08/2017 09:54, Marc Zyngier wrote:
>> On Thu, Aug 24 2017 at 10:53:16 pm BST, Mason <slash.tmp@free.fr> wrote:
>>> On 24/08/2017 20:35, Ard Biesheuvel wrote:
>>>> On 24 August 2017 at 18:51, Marc Gonzalez wrote:
>>>>> On 24/08/2017 19:04, Bjorn Helgaas wrote:
>>>>>> On Tue, Aug 22, 2017 Marc Zyngier wrote:
>>>>>>> Marc Gonzalez wrote:
>>>>>>>> On 22/08/2017 18:29, Marc Zyngier wrote:
>>>>>>>>> On 22/08/17 15:56, Marc Gonzalez wrote:
>>>>>>>>>
>>>>>>>>>>  #define SMP8759_MUX              0x48
>>>>>>>>>>  #define SMP8759_TEST_OUT 0x74
>>>>>>>>>> +#define SMP8759_STATUS           0x80
>>>>>>>>>> +#define SMP8759_ENABLE           0xa0
>>>>>>>>>> +#define SMP8759_DOORBELL 0xa002e07c
>>>>>>>>>
>>>>>>>>> Why is this hardcoded and not coming from the device-tree, just like any
>>>>>>>>> other address property?
>>>>>>>>
>>>>>>>> Since this bus address is software-configurable, I didn't think
>>>>>>>> it belonged in the DT. Also, I didn't see anything similar in
>>>>>>>> other binding docs, especially
>>>>>>>>
>>>>>>>> Documentation/devicetree/bindings/interrupt-controller/msi.txt
>>>>>>>
>>>>>>> If that's software configurable, how on Earth did you pick the address?
>>>>>>> How do you ensure that it doesn't conflict with DMA? How is it
>>>>>>> configured into the RC?
>>>>>>
>>>>>> But we *do* need to resolve this.  This does seem like an address that
>>>>>> shouldn't be hard-coded into the driver.  Since this driver is
>>>>>> programming the address into an MSI message, but not into the receiver
>>>>>> of that message, there's a coordination issue between this driver and
>>>>>> whatever other software does that receiver configuration.
>>>>>
>>>>> OK. I'll move the doorbell address to the DT for v11.
>>>>>
>>>>> What property should be used for this address?
>>>>>
>>>>> sigma,doorbell ?
>>>>>
>>>>> Or maybe I can put it in reg, since I have a 1:1 mapping
>>>>> between bus and cpu addresses?
>>>>>
>>>>> git grep -i doorbell arch/arm/boot/dts/ arch/arm64/boot/dts/
>>>>> returns nothing.
>>>>
>>>> You haven't answered the question yet: you stated that the doorbell
>>>> address is software configurable, yet your code does not seem to
>>>> configure it. It only returns the doorbell address so that it gets
>>>> communicated to the downstream devices.
>>>>
>>>> So how does the RC know which address is special, so it can trigger on
>>>> inbound writes hitting that address and assert the SPI ?
>>>
>>> The CPU address of the MSI doorbell address is 0x2e07c
>>> i.e. within the reg space of the PCIe controller block.
>>
>> Which you describe in DT already, right? So why aren't you using an
>> offset in this region as your MSI doorbell (potentially applying an
>> offset, see below)?
>>
>>
>>> As I discussed back in March, the RC implements an odd
>>> bus-to-system mapping.
>>>
>>> RC BAR0 defines a window in PCI address space (max 1GB).
>>> Accesses outside this window are silently ignored.
>>> The window is divided into 8 "regions" and there are 8
>>> registers defining the offset into CPU space.
>>>
>>> In pseudo code, assuming pci_address is within the
>>> window defined by BAR0:
>>>
>>> cpu_address map_bus_to_system(pci_address)
>>> {
>>>     temp = pci_address - BAR0.base
>>>     region = temp / region_size
>>>     offset = temp % region_size
>>>     cpu_address = region_reg[region] + offset
>>>     return cpu_address
>>> }
>>>
>>> The current setup is:
>>>
>>> DRAM at 0x80000000-0xa0000000
>>> BAR0.base = 0x80000000
>>> REGION[0] = 0x80000000
>>> REGION[1] = 0x88000000
>>> REGION[2] = 0x90000000
>>> REGION[3] = 0x98000000
>>> REGION[4] = 0x0
>>>
>>> (This map means 1:1 identity for DRAM addresses.)
>>>
>>> Thus when a device writes to 0xa002e07c (region 4)
>>> the write is forwarded to 0x2e07c.
>>
>> But how do you find out about the 0xa0000000 offset? You must make sure
>> that the provided address is outside of RAM, should you end-up on a
>> system with more than 1GB of RAM.
> 
> Robin wrote a prophetic post back in March:
> http://lists.infradead.org/pipermail/linux-arm-kernel/2017-March/492965.html
> 
>> The appropriate DT property would be "dma-ranges", i.e.
>>
>> pci@... {
>> 	...
>> 	dma-ranges = <(PCI bus address) (CPU phys address) (size)>;
>> }
> 
> The dma-ranges property seems to be exactly what I'm looking for:
> 
> Restrict DMA to the first X MB of RAM (use a bounce buffer
> for other physical addresses).
> 
> I added the following property to my PCIe node
> 
>   dma-ranges = <0x0 0x80000000 0x80000000 0x20000000>;
> 
> with the intent to create a 1:1 mapping for [0x80000000, 0xa0000000[
> 
> But it does not work. Arg!
> 
> My PCIe controller driver seems to be correctly calling of_dma_get_range:
> 
> [    0.520469] [<c03d85e8>] (of_dma_get_range) from [<c03d5ad8>] (of_dma_configure+0x48/0x234)
> [    0.520483] [<c03d5ad8>] (of_dma_configure) from [<c02fa154>] (pci_device_add+0xac/0x350)
> [    0.520493] [<c02fa154>] (pci_device_add) from [<c02fa488>] (pci_scan_single_device+0x90/0xb0)
> [    0.520501] [<c02fa488>] (pci_scan_single_device) from [<c02fa500>] (pci_scan_slot+0x58/0x100)
> [    0.520510] [<c02fa500>] (pci_scan_slot) from [<c02fb418>] (pci_scan_child_bus+0x20/0xf8)
> [    0.520519] [<c02fb418>] (pci_scan_child_bus) from [<c02fb6e8>] (pci_scan_root_bus_msi+0xcc/0xd8)
> [    0.520527] [<c02fb6e8>] (pci_scan_root_bus_msi) from [<c02fb70c>] (pci_scan_root_bus+0x18/0x20)
> [    0.520537] [<c02fb70c>] (pci_scan_root_bus) from [<c0310544>] (pci_host_common_probe+0xc8/0x314)
> [    0.520546] [<c0310544>] (pci_host_common_probe) from [<c0310ce8>] (tango_pcie_probe+0x148/0x350)
> [    0.520557] [<c0310ce8>] (tango_pcie_probe) from [<c034d398>] (platform_drv_probe+0x34/0x6c)
> 
> of_dma_get_range() is called on the pcie node (which is expected)
> but after parsing n_addr_cells and n_size_cells in the while loop,
> the code jumps to the parent node ("soc")... while my property is
> attached to the pcie node...

This is not your driver calling of_dma_get_range(), this is the PCI core
doing so in the act of DMA master configuration for a discovered
*endpoint*. The fact that the "pass the host controller's OF node
because we don't have one for the endpoint" bodge only works properly
for dma-coherent and not dma-ranges is a known, but irrelevant, problem.

If your host controller driver needs to discover its windows from DT to
configure *itself*, it needs to parse dma-ranges itself; see pcie-iproc,
pcie-rcar, pcie-xgene, etc. for examples.

Robin.

> 
> [    0.507754] of_dma_get_range: node=dfbf74cc np=dfbf74cc name=/soc/pcie@2e000 name2=/soc/pcie@2e000
> ...
> [    0.509162] __of_find_property: node=soc find=#address-cells prop=compatible
> [    0.509168] __of_find_property: node=soc find=#address-cells prop=interrupt-parent
> [    0.509173] __of_find_property: node=soc find=#address-cells prop=#address-cells
> [    0.509178] __of_find_property: node=soc find=#size-cells prop=compatible
> [    0.509182] __of_find_property: node=soc find=#size-cells prop=interrupt-parent
> [    0.509186] __of_find_property: node=soc find=#size-cells prop=#address-cells
> [    0.509190] __of_find_property: node=soc find=#size-cells prop=#size-cells
> [    0.509195] __of_find_property: node=soc find=dma-ranges prop=compatible
> [    0.509199] __of_find_property: node=soc find=dma-ranges prop=interrupt-parent
> [    0.509203] __of_find_property: node=soc find=dma-ranges prop=#address-cells
> [    0.509207] __of_find_property: node=soc find=dma-ranges prop=#size-cells
> [    0.509211] __of_find_property: node=soc find=dma-ranges prop=ranges
> [    0.509215] __of_find_property: node=soc find=dma-ranges prop=name
> [    0.509219] dma-ranges=  (null)
> 
> http://elixir.free-electrons.com/linux/latest/source/drivers/of/address.c#L838
> 
> What am I missing?
> 
> Regards.
>
Mason Aug. 25, 2017, 3:35 p.m. UTC | #14
On 25/08/2017 17:25, Robin Murphy wrote:

> On 25/08/17 16:01, Mason wrote:
>
>> Robin wrote a prophetic post back in March:
>> http://lists.infradead.org/pipermail/linux-arm-kernel/2017-March/492965.html
>>
>>> The appropriate DT property would be "dma-ranges", i.e.
>>>
>>> pci@... {
>>> 	...
>>> 	dma-ranges = <(PCI bus address) (CPU phys address) (size)>;
>>> }
>>
>> The dma-ranges property seems to be exactly what I'm looking for:
>>
>> Restrict DMA to the first X MB of RAM (use a bounce buffer
>> for other physical addresses).
>>
>> I added the following property to my PCIe node
>>
>>   dma-ranges = <0x0 0x80000000 0x80000000 0x20000000>;
>>
>> with the intent to create a 1:1 mapping for [0x80000000, 0xa0000000[
>>
>> But it does not work. Arg!
>>
>> My PCIe controller driver seems to be correctly calling of_dma_get_range:
>>
>> [    0.520469] [<c03d85e8>] (of_dma_get_range) from [<c03d5ad8>] (of_dma_configure+0x48/0x234)
>> [    0.520483] [<c03d5ad8>] (of_dma_configure) from [<c02fa154>] (pci_device_add+0xac/0x350)
>> [    0.520493] [<c02fa154>] (pci_device_add) from [<c02fa488>] (pci_scan_single_device+0x90/0xb0)
>> [    0.520501] [<c02fa488>] (pci_scan_single_device) from [<c02fa500>] (pci_scan_slot+0x58/0x100)
>> [    0.520510] [<c02fa500>] (pci_scan_slot) from [<c02fb418>] (pci_scan_child_bus+0x20/0xf8)
>> [    0.520519] [<c02fb418>] (pci_scan_child_bus) from [<c02fb6e8>] (pci_scan_root_bus_msi+0xcc/0xd8)
>> [    0.520527] [<c02fb6e8>] (pci_scan_root_bus_msi) from [<c02fb70c>] (pci_scan_root_bus+0x18/0x20)
>> [    0.520537] [<c02fb70c>] (pci_scan_root_bus) from [<c0310544>] (pci_host_common_probe+0xc8/0x314)
>> [    0.520546] [<c0310544>] (pci_host_common_probe) from [<c0310ce8>] (tango_pcie_probe+0x148/0x350)
>> [    0.520557] [<c0310ce8>] (tango_pcie_probe) from [<c034d398>] (platform_drv_probe+0x34/0x6c)
>>
>> of_dma_get_range() is called on the pcie node (which is expected)
>> but after parsing n_addr_cells and n_size_cells in the while loop,
>> the code jumps to the parent node ("soc")... while my property is
>> attached to the pcie node...
> 
> This is not your driver calling of_dma_get_range(), this is the PCI core
> doing so in the act of DMA master configuration for a discovered
> *endpoint*. The fact that the "pass the host controller's OF node
> because we don't have one for the endpoint" bodge only works properly
> for dma-coherent and not dma-ranges is a known, but irrelevant, problem.
> 
> If your host controller driver needs to discover its windows from DT to
> configure *itself*, it needs to parse dma-ranges itself; see pcie-iproc,
> pcie-rcar, pcie-xgene, etc. for examples.

Yes, I'm aware that I need to do my own parsing of dma-ranges.
I can use that information to configure BAR0.base and the
region registers.

But Linux needs to record my settings at some point, right?
Otherwise, how does the DMA framework know that devices can
only reach cpu addresses [0x80000000, 0xa0000000[ and when
to use bounce buffers?

What's preventing the XHCI driver from allocating memory
outside of my "safe" range, and having the DMA framework
blindly map that?

Regards.
Robin Murphy Aug. 25, 2017, 3:45 p.m. UTC | #15
On 25/08/17 16:35, Mason wrote:
> On 25/08/2017 17:25, Robin Murphy wrote:
> 
>> On 25/08/17 16:01, Mason wrote:
>>
>>> Robin wrote a prophetic post back in March:
>>> http://lists.infradead.org/pipermail/linux-arm-kernel/2017-March/492965.html
>>>
>>>> The appropriate DT property would be "dma-ranges", i.e.
>>>>
>>>> pci@... {
>>>> 	...
>>>> 	dma-ranges = <(PCI bus address) (CPU phys address) (size)>;
>>>> }
>>>
>>> The dma-ranges property seems to be exactly what I'm looking for:
>>>
>>> Restrict DMA to the first X MB of RAM (use a bounce buffer
>>> for other physical addresses).
>>>
>>> I added the following property to my PCIe node
>>>
>>>   dma-ranges = <0x0 0x80000000 0x80000000 0x20000000>;
>>>
>>> with the intent to create a 1:1 mapping for [0x80000000, 0xa0000000[
>>>
>>> But it does not work. Arg!
>>>
>>> My PCIe controller driver seems to be correctly calling of_dma_get_range:
>>>
>>> [    0.520469] [<c03d85e8>] (of_dma_get_range) from [<c03d5ad8>] (of_dma_configure+0x48/0x234)
>>> [    0.520483] [<c03d5ad8>] (of_dma_configure) from [<c02fa154>] (pci_device_add+0xac/0x350)
>>> [    0.520493] [<c02fa154>] (pci_device_add) from [<c02fa488>] (pci_scan_single_device+0x90/0xb0)
>>> [    0.520501] [<c02fa488>] (pci_scan_single_device) from [<c02fa500>] (pci_scan_slot+0x58/0x100)
>>> [    0.520510] [<c02fa500>] (pci_scan_slot) from [<c02fb418>] (pci_scan_child_bus+0x20/0xf8)
>>> [    0.520519] [<c02fb418>] (pci_scan_child_bus) from [<c02fb6e8>] (pci_scan_root_bus_msi+0xcc/0xd8)
>>> [    0.520527] [<c02fb6e8>] (pci_scan_root_bus_msi) from [<c02fb70c>] (pci_scan_root_bus+0x18/0x20)
>>> [    0.520537] [<c02fb70c>] (pci_scan_root_bus) from [<c0310544>] (pci_host_common_probe+0xc8/0x314)
>>> [    0.520546] [<c0310544>] (pci_host_common_probe) from [<c0310ce8>] (tango_pcie_probe+0x148/0x350)
>>> [    0.520557] [<c0310ce8>] (tango_pcie_probe) from [<c034d398>] (platform_drv_probe+0x34/0x6c)
>>>
>>> of_dma_get_range() is called on the pcie node (which is expected)
>>> but after parsing n_addr_cells and n_size_cells in the while loop,
>>> the code jumps to the parent node ("soc")... while my property is
>>> attached to the pcie node...
>>
>> This is not your driver calling of_dma_get_range(), this is the PCI core
>> doing so in the act of DMA master configuration for a discovered
>> *endpoint*. The fact that the "pass the host controller's OF node
>> because we don't have one for the endpoint" bodge only works properly
>> for dma-coherent and not dma-ranges is a known, but irrelevant, problem.
>>
>> If your host controller driver needs to discover its windows from DT to
>> configure *itself*, it needs to parse dma-ranges itself; see pcie-iproc,
>> pcie-rcar, pcie-xgene, etc. for examples.
> 
> Yes, I'm aware that I need to do my own parsing of dma-ranges.
> I can use that information to configure BAR0.base and the
> region registers.
> 
> But Linux needs to record my settings at some point, right?
> Otherwise, how does the DMA framework know that devices can
> only reach cpu addresses [0x80000000, 0xa0000000[ and when
> to use bounce buffers?
> 
> What's preventing the XHCI driver from allocating memory
> outside of my "safe" range, and having the DMA framework
> blindly map that?

At the moment, nothing. Systems that have physical memory that is not
visible in PCI mem space are having a bad time and will not go to space
today.

But that bears no relation to your MSI controller getting its doorbell
address set appropriately.

Robin.
Mason Aug. 25, 2017, 4:44 p.m. UTC | #16
On 25/08/2017 17:45, Robin Murphy wrote:

> On 25/08/17 16:35, Mason wrote:
>
>> On 25/08/2017 17:25, Robin Murphy wrote:
>>
>>> On 25/08/17 16:01, Mason wrote:
>>>
>>>> Robin wrote a prophetic post back in March:
>>>> http://lists.infradead.org/pipermail/linux-arm-kernel/2017-March/492965.html
>>>>
>>>>> The appropriate DT property would be "dma-ranges", i.e.
>>>>>
>>>>> pci@... {
>>>>> 	...
>>>>> 	dma-ranges = <(PCI bus address) (CPU phys address) (size)>;
>>>>> }
>>>>
>>>> The dma-ranges property seems to be exactly what I'm looking for:
>>>>
>>>> Restrict DMA to the first X MB of RAM (use a bounce buffer
>>>> for other physical addresses).
>>>>
>>>> I added the following property to my PCIe node
>>>>
>>>>   dma-ranges = <0x0 0x80000000 0x80000000 0x20000000>;
>>>>
>>>> with the intent to create a 1:1 mapping for [0x80000000, 0xa0000000[
>>>>
>>>> But it does not work. Arg!
>>>>
>>>> My PCIe controller driver seems to be correctly calling of_dma_get_range:
>>>>
>>>> [    0.520469] [<c03d85e8>] (of_dma_get_range) from [<c03d5ad8>] (of_dma_configure+0x48/0x234)
>>>> [    0.520483] [<c03d5ad8>] (of_dma_configure) from [<c02fa154>] (pci_device_add+0xac/0x350)
>>>> [    0.520493] [<c02fa154>] (pci_device_add) from [<c02fa488>] (pci_scan_single_device+0x90/0xb0)
>>>> [    0.520501] [<c02fa488>] (pci_scan_single_device) from [<c02fa500>] (pci_scan_slot+0x58/0x100)
>>>> [    0.520510] [<c02fa500>] (pci_scan_slot) from [<c02fb418>] (pci_scan_child_bus+0x20/0xf8)
>>>> [    0.520519] [<c02fb418>] (pci_scan_child_bus) from [<c02fb6e8>] (pci_scan_root_bus_msi+0xcc/0xd8)
>>>> [    0.520527] [<c02fb6e8>] (pci_scan_root_bus_msi) from [<c02fb70c>] (pci_scan_root_bus+0x18/0x20)
>>>> [    0.520537] [<c02fb70c>] (pci_scan_root_bus) from [<c0310544>] (pci_host_common_probe+0xc8/0x314)
>>>> [    0.520546] [<c0310544>] (pci_host_common_probe) from [<c0310ce8>] (tango_pcie_probe+0x148/0x350)
>>>> [    0.520557] [<c0310ce8>] (tango_pcie_probe) from [<c034d398>] (platform_drv_probe+0x34/0x6c)
>>>>
>>>> of_dma_get_range() is called on the pcie node (which is expected)
>>>> but after parsing n_addr_cells and n_size_cells in the while loop,
>>>> the code jumps to the parent node ("soc")... while my property is
>>>> attached to the pcie node...
>>>
>>> This is not your driver calling of_dma_get_range(), this is the PCI core
>>> doing so in the act of DMA master configuration for a discovered
>>> *endpoint*. The fact that the "pass the host controller's OF node
>>> because we don't have one for the endpoint" bodge only works properly
>>> for dma-coherent and not dma-ranges is a known, but irrelevant, problem.
>>>
>>> If your host controller driver needs to discover its windows from DT to
>>> configure *itself*, it needs to parse dma-ranges itself; see pcie-iproc,
>>> pcie-rcar, pcie-xgene, etc. for examples.
>>
>> Yes, I'm aware that I need to do my own parsing of dma-ranges.
>> I can use that information to configure BAR0.base and the
>> region registers.
>>
>> But Linux needs to record my settings at some point, right?
>> Otherwise, how does the DMA framework know that devices can
>> only reach cpu addresses [0x80000000, 0xa0000000[ and when
>> to use bounce buffers?
>>
>> What's preventing the XHCI driver from allocating memory
>> outside of my "safe" range, and having the DMA framework
>> blindly map that?
> 
> At the moment, nothing. Systems that have physical memory that is not
> visible in PCI mem space are having a bad time and will not go to space
> today.
> 
> But that bears no relation to your MSI controller getting its doorbell
> address set appropriately.

OK, so this is what I propose for v11 in order to not
hard code the MSI doorbell address (e.g. 0xa002e07c)

I add the following property to the pcie node:

	dma-ranges = <0x0 0x80000000 0x80000000 0x20000000>;

I.e. pci_addr = 0x80000000, cpu_addr = 0x80000000, len=0x20000000

Then in the PCIe driver, I parse dma-ranges.

Consequently

	MSI_doorbell_addr = cpu_addr + len + res.start + 0x7c

Bjorn, Marc, Robin, is that an acceptable solution?


Tangent:

Robin, for my own education, how does one configure the DMA
framework to use bounce buffers for certain addresses?

Regards.
Marc Zyngier Aug. 26, 2017, 1:08 p.m. UTC | #17
On Fri, Aug 25 2017 at  6:44:27 pm BST, Mason <slash.tmp@free.fr> wrote:
> On 25/08/2017 17:45, Robin Murphy wrote:
>
>> On 25/08/17 16:35, Mason wrote:
>>
>>> On 25/08/2017 17:25, Robin Murphy wrote:
>>>
>>>> On 25/08/17 16:01, Mason wrote:
>>>>
>>>>> Robin wrote a prophetic post back in March:
>>>>> http://lists.infradead.org/pipermail/linux-arm-kernel/2017-March/492965.html
>>>>>
>>>>>> The appropriate DT property would be "dma-ranges", i.e.
>>>>>>
>>>>>> pci@... {
>>>>>> 	...
>>>>>> 	dma-ranges = <(PCI bus address) (CPU phys address) (size)>;
>>>>>> }
>>>>>
>>>>> The dma-ranges property seems to be exactly what I'm looking for:
>>>>>
>>>>> Restrict DMA to the first X MB of RAM (use a bounce buffer
>>>>> for other physical addresses).
>>>>>
>>>>> I added the following property to my PCIe node
>>>>>
>>>>>   dma-ranges = <0x0 0x80000000 0x80000000 0x20000000>;
>>>>>
>>>>> with the intent to create a 1:1 mapping for [0x80000000, 0xa0000000[
>>>>>
>>>>> But it does not work. Arg!
>>>>>
>>>>> My PCIe controller driver seems to be correctly calling of_dma_get_range:
>>>>>
>>>>> [ 0.520469] [<c03d85e8>] (of_dma_get_range) from [<c03d5ad8>]
>>>>> (of_dma_configure+0x48/0x234)
>>>>> [ 0.520483] [<c03d5ad8>] (of_dma_configure) from [<c02fa154>]
>>>>> (pci_device_add+0xac/0x350)
>>>>> [ 0.520493] [<c02fa154>] (pci_device_add) from [<c02fa488>]
>>>>> (pci_scan_single_device+0x90/0xb0)
>>>>> [ 0.520501] [<c02fa488>] (pci_scan_single_device) from
>>>>> [<c02fa500>] (pci_scan_slot+0x58/0x100)
>>>>> [ 0.520510] [<c02fa500>] (pci_scan_slot) from [<c02fb418>]
>>>>> (pci_scan_child_bus+0x20/0xf8)
>>>>> [ 0.520519] [<c02fb418>] (pci_scan_child_bus) from [<c02fb6e8>]
>>>>> (pci_scan_root_bus_msi+0xcc/0xd8)
>>>>> [ 0.520527] [<c02fb6e8>] (pci_scan_root_bus_msi) from
>>>>> [<c02fb70c>] (pci_scan_root_bus+0x18/0x20)
>>>>> [ 0.520537] [<c02fb70c>] (pci_scan_root_bus) from [<c0310544>]
>>>>> (pci_host_common_probe+0xc8/0x314)
>>>>> [ 0.520546] [<c0310544>] (pci_host_common_probe) from
>>>>> [<c0310ce8>] (tango_pcie_probe+0x148/0x350)
>>>>> [ 0.520557] [<c0310ce8>] (tango_pcie_probe) from [<c034d398>]
>>>>> (platform_drv_probe+0x34/0x6c)
>>>>>
>>>>> of_dma_get_range() is called on the pcie node (which is expected)
>>>>> but after parsing n_addr_cells and n_size_cells in the while loop,
>>>>> the code jumps to the parent node ("soc")... while my property is
>>>>> attached to the pcie node...
>>>>
>>>> This is not your driver calling of_dma_get_range(), this is the PCI core
>>>> doing so in the act of DMA master configuration for a discovered
>>>> *endpoint*. The fact that the "pass the host controller's OF node
>>>> because we don't have one for the endpoint" bodge only works properly
>>>> for dma-coherent and not dma-ranges is a known, but irrelevant, problem.
>>>>
>>>> If your host controller driver needs to discover its windows from DT to
>>>> configure *itself*, it needs to parse dma-ranges itself; see pcie-iproc,
>>>> pcie-rcar, pcie-xgene, etc. for examples.
>>>
>>> Yes, I'm aware that I need to do my own parsing of dma-ranges.
>>> I can use that information to configure BAR0.base and the
>>> region registers.
>>>
>>> But Linux needs to record my settings at some point, right?
>>> Otherwise, how does the DMA framework know that devices can
>>> only reach cpu addresses [0x80000000, 0xa0000000[ and when
>>> to use bounce buffers?
>>>
>>> What's preventing the XHCI driver from allocating memory
>>> outside of my "safe" range, and having the DMA framework
>>> blindly map that?
>> 
>> At the moment, nothing. Systems that have physical memory that is not
>> visible in PCI mem space are having a bad time and will not go to space
>> today.
>> 
>> But that bears no relation to your MSI controller getting its doorbell
>> address set appropriately.
>
> OK, so this is what I propose for v11 in order to not
> hard code the MSI doorbell address (e.g. 0xa002e07c)
>
> I add the following property to the pcie node:
>
> 	dma-ranges = <0x0 0x80000000 0x80000000 0x20000000>;
>
> I.e. pci_addr = 0x80000000, cpu_addr = 0x80000000, len=0x20000000
>
> Then in the PCIe driver, I parse dma-ranges.
>
> Consequently
>
> 	MSI_doorbell_addr = cpu_addr + len + res.start + 0x7c
>
> Bjorn, Marc, Robin, is that an acceptable solution?

It seems to work, but I still have my doubts about this BAR0.base and
the associated regions. Are these regions so hardcoded in HW that the RC
cannot DMA outside of this 1GB region? Or can it be reconfigured by some
SW agent to cover more RAM, should someone decide that 1GB is on the
"too little" side?

If the former is true, the HW is remarkably busted and/or inflexible. If
the latter is true, then the dma-ranges property feels very fragile, as
it must be kept in sync with the amount of memory that the system has.

	M.
Mason Aug. 26, 2017, 6:12 p.m. UTC | #18
On 26/08/2017 15:08, Marc Zyngier wrote:

> On Aug 25 2017 at 18:44, Mason wrote:
>
>> On 25/08/2017 17:45, Robin Murphy wrote:
>>
>>> On 25/08/17 16:35, Mason wrote:
>>>
>>>> On 25/08/2017 17:25, Robin Murphy wrote:
>>>>
>>>>> On 25/08/17 16:01, Mason wrote:
>>>>>
>>>>>> Robin wrote a prophetic post back in March:
>>>>>> http://lists.infradead.org/pipermail/linux-arm-kernel/2017-March/492965.html
>>>>>>
>>>>>>> The appropriate DT property would be "dma-ranges", i.e.
>>>>>>>
>>>>>>> pci@... {
>>>>>>> 	...
>>>>>>> 	dma-ranges = <(PCI bus address) (CPU phys address) (size)>;
>>>>>>> }
>>>>>>
>>>>>> The dma-ranges property seems to be exactly what I'm looking for:
>>>>>>
>>>>>> Restrict DMA to the first X MB of RAM (use a bounce buffer
>>>>>> for other physical addresses).
>>>>>>
>>>>>> I added the following property to my PCIe node
>>>>>>
>>>>>>   dma-ranges = <0x0 0x80000000 0x80000000 0x20000000>;
>>>>>>
>>>>>> with the intent to create a 1:1 mapping for [0x80000000, 0xa0000000[
>>>>>>
>>>>>> But it does not work. Arg!
>>>>>>
>>>>>> My PCIe controller driver seems to be correctly calling of_dma_get_range:
>>>>>>
>>>>>> [ 0.520469] [<c03d85e8>] (of_dma_get_range) from [<c03d5ad8>] (of_dma_configure+0x48/0x234)
>>>>>> [ 0.520483] [<c03d5ad8>] (of_dma_configure) from [<c02fa154>] (pci_device_add+0xac/0x350)
>>>>>> [ 0.520493] [<c02fa154>] (pci_device_add) from [<c02fa488>] (pci_scan_single_device+0x90/0xb0)
>>>>>> [ 0.520501] [<c02fa488>] (pci_scan_single_device) from [<c02fa500>] (pci_scan_slot+0x58/0x100)
>>>>>> [ 0.520510] [<c02fa500>] (pci_scan_slot) from [<c02fb418>] (pci_scan_child_bus+0x20/0xf8)
>>>>>> [ 0.520519] [<c02fb418>] (pci_scan_child_bus) from [<c02fb6e8>] (pci_scan_root_bus_msi+0xcc/0xd8)
>>>>>> [ 0.520527] [<c02fb6e8>] (pci_scan_root_bus_msi) from [<c02fb70c>] (pci_scan_root_bus+0x18/0x20)
>>>>>> [ 0.520537] [<c02fb70c>] (pci_scan_root_bus) from [<c0310544>] (pci_host_common_probe+0xc8/0x314)
>>>>>> [ 0.520546] [<c0310544>] (pci_host_common_probe) from [<c0310ce8>] (tango_pcie_probe+0x148/0x350)
>>>>>> [ 0.520557] [<c0310ce8>] (tango_pcie_probe) from [<c034d398>] (platform_drv_probe+0x34/0x6c)
>>>>>>
>>>>>> of_dma_get_range() is called on the pcie node (which is expected)
>>>>>> but after parsing n_addr_cells and n_size_cells in the while loop,
>>>>>> the code jumps to the parent node ("soc")... while my property is
>>>>>> attached to the pcie node...
>>>>>
>>>>> This is not your driver calling of_dma_get_range(), this is the PCI core
>>>>> doing so in the act of DMA master configuration for a discovered
>>>>> *endpoint*. The fact that the "pass the host controller's OF node
>>>>> because we don't have one for the endpoint" bodge only works properly
>>>>> for dma-coherent and not dma-ranges is a known, but irrelevant, problem.
>>>>>
>>>>> If your host controller driver needs to discover its windows from DT to
>>>>> configure *itself*, it needs to parse dma-ranges itself; see pcie-iproc,
>>>>> pcie-rcar, pcie-xgene, etc. for examples.
>>>>
>>>> Yes, I'm aware that I need to do my own parsing of dma-ranges.
>>>> I can use that information to configure BAR0.base and the
>>>> region registers.
>>>>
>>>> But Linux needs to record my settings at some point, right?
>>>> Otherwise, how does the DMA framework know that devices can
>>>> only reach cpu addresses [0x80000000, 0xa0000000[ and when
>>>> to use bounce buffers?
>>>>
>>>> What's preventing the XHCI driver from allocating memory
>>>> outside of my "safe" range, and having the DMA framework
>>>> blindly map that?
>>>
>>> At the moment, nothing. Systems that have physical memory that is not
>>> visible in PCI mem space are having a bad time and will not go to space
>>> today.
>>>
>>> But that bears no relation to your MSI controller getting its doorbell
>>> address set appropriately.
>>
>> OK, so this is what I propose for v11 in order to not
>> hard code the MSI doorbell address (e.g. 0xa002e07c)
>>
>> I add the following property to the pcie node:
>>
>> 	dma-ranges = <0x0 0x80000000 0x80000000 0x20000000>;
>>
>> I.e. pci_addr = 0x80000000, cpu_addr = 0x80000000, len=0x20000000
>>
>> Then in the PCIe driver, I parse dma-ranges.
>>
>> Consequently
>>
>> 	MSI_doorbell_addr = cpu_addr + len + res.start + 0x7c
>>
>> Bjorn, Marc, Robin, is that an acceptable solution?
> 
> It seems to work, but I still have my doubts about this BAR0.base and
> the associated regions. Are these regions so hardcoded in HW that the RC
> cannot DMA outside of this 1GB region? Or can it be reconfigured by some
> SW agent to cover more RAM, should someone decide that 1GB is on the
> "too little" side?
> 
> If the former is true, the HW is remarkably busted and/or inflexible.

This HW block has already been deemed insane because
of the muxing of mem and config space... So you're
late to the party :-)

I wouldn't call the regions "hard-coded" since they are
software-configurable to point anywhere in the CPU bus.
(Although I'm not sure if that's any use, since the DMA
framework seems to expect a 1:1 mapping.)

But the other side (PCI bus) is quite inflexible:
accesses to addresses outside the window defined by BAR0
are silently ignored, no working around that :-(

> If the latter is true, then the dma-ranges property feels very fragile, as
> it must be kept in sync with the amount of memory that the system has.

I'm confused.

As I pointed out, the dma-ranges in the pcie node is
ignored by the DMA framework. And Robin confirmed that
"Systems that have physical memory that is not visible
in PCI mem space are having a bad time and will not go
to space today."

So there are several setups where something is bound
to break:

1) Linux manages more than 1 GB (contiguous)
=> because one region needs to point to the doorbell
area, so 128 MB are wasted.

2) Linux manages non-contiguous memory
=> e.g. 128MB@0x80000000 + 128MB@0xc0000000

That's why I've asked about bounce buffers.

The system I test on boots with mem=512MB

Regards.
diff mbox

Patch

diff --git a/drivers/pci/host/pcie-tango.c b/drivers/pci/host/pcie-tango.c
index 6bbb81f06a53..d672271ad719 100644
--- a/drivers/pci/host/pcie-tango.c
+++ b/drivers/pci/host/pcie-tango.c
@@ -1,12 +1,170 @@ 
+#include <linux/irqchip/chained_irq.h>
+#include <linux/irqdomain.h>
 #include <linux/pci-ecam.h>
 #include <linux/delay.h>
+#include <linux/msi.h>
 #include <linux/of.h>
 
+#define MSI_MAX			256
+
 #define SMP8759_MUX		0x48
 #define SMP8759_TEST_OUT	0x74
+#define SMP8759_STATUS		0x80
+#define SMP8759_ENABLE		0xa0
+#define SMP8759_DOORBELL	0xa002e07c
 
 struct tango_pcie {
-	void __iomem *base;
+	DECLARE_BITMAP(used_msi, MSI_MAX);
+	spinlock_t		used_msi_lock;
+	void __iomem		*base;
+	struct irq_domain	*dom;
+};
+
+static void tango_msi_isr(struct irq_desc *desc)
+{
+	struct irq_chip *chip = irq_desc_get_chip(desc);
+	struct tango_pcie *pcie = irq_desc_get_handler_data(desc);
+	unsigned long status, base, virq, idx, pos = 0;
+
+	chained_irq_enter(chip, desc);
+	spin_lock(&pcie->used_msi_lock);
+
+	while ((pos = find_next_bit(pcie->used_msi, MSI_MAX, pos)) < MSI_MAX) {
+		base = round_down(pos, 32);
+		status = readl_relaxed(pcie->base + SMP8759_STATUS + base / 8);
+		for_each_set_bit(idx, &status, 32) {
+			virq = irq_find_mapping(pcie->dom, base + idx);
+			generic_handle_irq(virq);
+		}
+		pos = base + 32;
+	}
+
+	spin_unlock(&pcie->used_msi_lock);
+	chained_irq_exit(chip, desc);
+}
+
+static void tango_ack(struct irq_data *d)
+{
+	struct tango_pcie *pcie = d->chip_data;
+	u32 offset = (d->hwirq / 32) * 4;
+	u32 bit = BIT(d->hwirq % 32);
+
+	writel_relaxed(bit, pcie->base + SMP8759_STATUS + offset);
+}
+
+static void update_msi_enable(struct irq_data *d, bool unmask)
+{
+	unsigned long flags;
+	struct tango_pcie *pcie = d->chip_data;
+	u32 offset = (d->hwirq / 32) * 4;
+	u32 bit = BIT(d->hwirq % 32);
+	u32 val;
+
+	spin_lock_irqsave(&pcie->used_msi_lock, flags);
+	val = readl_relaxed(pcie->base + SMP8759_ENABLE + offset);
+	val = unmask ? val | bit : val & ~bit;
+	writel_relaxed(val, pcie->base + SMP8759_ENABLE + offset);
+	spin_unlock_irqrestore(&pcie->used_msi_lock, flags);
+}
+
+static void tango_mask(struct irq_data *d)
+{
+	update_msi_enable(d, false);
+}
+
+static void tango_unmask(struct irq_data *d)
+{
+	update_msi_enable(d, true);
+}
+
+static int tango_set_affinity(struct irq_data *d, const struct cpumask *mask,
+			      bool force)
+{
+	return -EINVAL;
+}
+
+static void tango_compose_msi_msg(struct irq_data *d, struct msi_msg *msg)
+{
+	msg->address_lo = lower_32_bits(SMP8759_DOORBELL);
+	msg->address_hi = upper_32_bits(SMP8759_DOORBELL);
+	msg->data = d->hwirq;
+}
+
+static struct irq_chip tango_chip = {
+	.irq_ack		= tango_ack,
+	.irq_mask		= tango_mask,
+	.irq_unmask		= tango_unmask,
+	.irq_set_affinity	= tango_set_affinity,
+	.irq_compose_msi_msg	= tango_compose_msi_msg,
+};
+
+static void msi_ack(struct irq_data *d)
+{
+	irq_chip_ack_parent(d);
+}
+
+static void msi_mask(struct irq_data *d)
+{
+	pci_msi_mask_irq(d);
+	irq_chip_mask_parent(d);
+}
+
+static void msi_unmask(struct irq_data *d)
+{
+	pci_msi_unmask_irq(d);
+	irq_chip_unmask_parent(d);
+}
+
+static struct irq_chip msi_chip = {
+	.name = "MSI",
+	.irq_ack = msi_ack,
+	.irq_mask = msi_mask,
+	.irq_unmask = msi_unmask,
+};
+
+static struct msi_domain_info msi_dom_info = {
+	.flags	= MSI_FLAG_PCI_MSIX
+		| MSI_FLAG_USE_DEF_DOM_OPS
+		| MSI_FLAG_USE_DEF_CHIP_OPS,
+	.chip	= &msi_chip,
+};
+
+static int tango_irq_domain_alloc(struct irq_domain *dom, unsigned int virq,
+				  unsigned int nr_irqs, void *args)
+{
+	struct tango_pcie *pcie = dom->host_data;
+	unsigned long flags;
+	int pos;
+
+	spin_lock_irqsave(&pcie->used_msi_lock, flags);
+	pos = find_first_zero_bit(pcie->used_msi, MSI_MAX);
+	if (pos >= MSI_MAX) {
+		spin_unlock_irqrestore(&pcie->used_msi_lock, flags);
+		return -ENOSPC;
+	}
+	__set_bit(pos, pcie->used_msi);
+	spin_unlock_irqrestore(&pcie->used_msi_lock, flags);
+	irq_domain_set_info(dom, virq, pos, &tango_chip,
+			pcie, handle_edge_irq, NULL, NULL);
+
+	return 0;
+}
+
+static void tango_irq_domain_free(struct irq_domain *dom, unsigned int virq,
+				  unsigned int nr_irqs)
+{
+	unsigned long flags;
+	struct irq_data *d = irq_domain_get_irq_data(dom, virq);
+	struct tango_pcie *pcie = d->chip_data;
+
+	spin_lock_irqsave(&pcie->used_msi_lock, flags);
+	__clear_bit(d->hwirq, pcie->used_msi);
+	spin_unlock_irqrestore(&pcie->used_msi_lock, flags);
+}
+
+static const struct irq_domain_ops dom_ops = {
+	.alloc	= tango_irq_domain_alloc,
+	.free	= tango_irq_domain_free,
 };
 
 static int smp8759_config_read(struct pci_bus *bus, unsigned int devfn,
@@ -76,7 +234,9 @@  static int tango_pcie_probe(struct platform_device *pdev)
 	struct device *dev = &pdev->dev;
 	struct tango_pcie *pcie;
 	struct resource *res;
-	int ret;
+	struct irq_domain *msi_dom, *irq_dom;
+	struct fwnode_handle *fwnode = of_node_to_fwnode(dev->of_node);
+	int ret, reg, virq;
 
 	dev_warn(dev, "simultaneous PCI config and MMIO accesses may cause data corruption\n");
 	add_taint(TAINT_CRAP, LOCKDEP_STILL_OK);
@@ -95,6 +255,33 @@  static int tango_pcie_probe(struct platform_device *pdev)
 	if (!tango_pcie_link_up(pcie))
 		return -ENODEV;
 
+	for (reg = 0; reg < MSI_MAX / 8; reg += 4)
+		writel_relaxed(0, pcie->base + SMP8759_ENABLE + reg);
+
+	virq = platform_get_irq(pdev, 1);
+	if (virq <= 0) {
+		dev_err(dev, "Failed to map IRQ\n");
+		return -ENXIO;
+	}
+
+	irq_dom = irq_domain_create_linear(fwnode, MSI_MAX, &dom_ops, pcie);
+	if (!irq_dom) {
+		dev_err(dev, "Failed to create IRQ domain\n");
+		return -ENOMEM;
+	}
+
+	msi_dom = pci_msi_create_irq_domain(fwnode, &msi_dom_info, irq_dom);
+	if (!msi_dom) {
+		dev_err(dev, "Failed to create MSI domain\n");
+		irq_domain_remove(irq_dom);
+		return -ENOMEM;
+	}
+
+	pcie->dom = irq_dom;
+	spin_lock_init(&pcie->used_msi_lock);
+
+	irq_set_chained_handler_and_data(virq, tango_msi_isr, pcie);
+
 	return pci_host_common_probe(pdev, &smp8759_ecam_ops);
 }