diff mbox

[RFC,v3,3/3] PCI/ACPI: hisi: Add ACPI support for HiSilicon SoCs Host Controllers

Message ID 1455039260-6040-4-git-send-email-gabriele.paoloni@huawei.com (mailing list archive)
State New, archived
Headers show

Commit Message

Gabriele Paoloni Feb. 9, 2016, 5:34 p.m. UTC
From: gabriele paoloni <gabriele.paoloni@huawei.com>

This patch adds specific quirks for PCI config space accessors,
it uses _HID to decide whether to hook pci_ops or not.

Signed-off-by: Dongdong Liu <liudongdong3@huawei.com>
Signed-off-by: Gabriele Paoloni <gabriele.paoloni@huawei.com>
---
 MAINTAINERS                       |   1 +
 drivers/pci/host/Kconfig          |   8 ++
 drivers/pci/host/Makefile         |   1 +
 drivers/pci/host/pcie-hisi-acpi.c | 188 ++++++++++++++++++++++++++++++++++++++
 drivers/pci/host/pcie-hisi.c      |   2 -
 drivers/pci/host/pcie-hisi.h      |   2 +
 6 files changed, 200 insertions(+), 2 deletions(-)
 create mode 100644 drivers/pci/host/pcie-hisi-acpi.c

Comments

Mark Rutland Feb. 9, 2016, 6:24 p.m. UTC | #1
On Tue, Feb 09, 2016 at 05:34:20PM +0000, Gabriele Paoloni wrote:
> From: gabriele paoloni <gabriele.paoloni@huawei.com>
> 
> This patch adds specific quirks for PCI config space accessors,
> it uses _HID to decide whether to hook pci_ops or not.

If I understand correctly, this would mean that it's not actually ECAM
compliant, then?

> Signed-off-by: Dongdong Liu <liudongdong3@huawei.com>
> Signed-off-by: Gabriele Paoloni <gabriele.paoloni@huawei.com>
> ---
>  MAINTAINERS                       |   1 +
>  drivers/pci/host/Kconfig          |   8 ++
>  drivers/pci/host/Makefile         |   1 +
>  drivers/pci/host/pcie-hisi-acpi.c | 188 ++++++++++++++++++++++++++++++++++++++
>  drivers/pci/host/pcie-hisi.c      |   2 -
>  drivers/pci/host/pcie-hisi.h      |   2 +
>  6 files changed, 200 insertions(+), 2 deletions(-)
>  create mode 100644 drivers/pci/host/pcie-hisi-acpi.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index d69f436..f184c3e 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -8412,6 +8412,7 @@ F:	Documentation/devicetree/bindings/pci/hisilicon-pcie.txt
>  F:	drivers/pci/host/pcie-hisi.h
>  F:	drivers/pci/host/pcie-hisi.c
>  F:	drivers/pci/host/pcie-hisi-common.c
> +F:	drivers/pci/host/pcie-hisi-acpi.c
>  
>  PCIE DRIVER FOR QUALCOMM MSM
>  M:     Stanimir Varbanov <svarbanov@mm-sol.com>
> diff --git a/drivers/pci/host/Kconfig b/drivers/pci/host/Kconfig
> index 75a6054..65b1add 100644
> --- a/drivers/pci/host/Kconfig
> +++ b/drivers/pci/host/Kconfig
> @@ -181,6 +181,14 @@ config PCI_HISI
>  	  Say Y here if you want PCIe controller support on HiSilicon
>  	  Hip05 and Hip06 SoCs
>  
> +config PCI_HISI_ACPI
> +	depends on ACPI
> +	bool "HiSilicon Hip05 and Hip06 SoCs ACPI PCIe controllers"
> +	select ACPI_PCI_HOST_GENERIC
> +	help
> +	  Say Y here if you want ACPI PCIe controller support on HiSilicon
> +	  Hip05 and Hip06 SoCs
> +
>  config PCIE_QCOM
>  	bool "Qualcomm PCIe controller"
>  	depends on ARCH_QCOM && OF
> diff --git a/drivers/pci/host/Makefile b/drivers/pci/host/Makefile
> index 8c93c0f..57e4379 100644
> --- a/drivers/pci/host/Makefile
> +++ b/drivers/pci/host/Makefile
> @@ -21,4 +21,5 @@ obj-$(CONFIG_PCIE_IPROC_BCMA) += pcie-iproc-bcma.o
>  obj-$(CONFIG_PCIE_ALTERA) += pcie-altera.o
>  obj-$(CONFIG_PCIE_ALTERA_MSI) += pcie-altera-msi.o
>  obj-$(CONFIG_PCI_HISI) += pcie-hisi.o pcie-hisi-common.o
> +obj-$(CONFIG_PCI_HISI_ACPI) += pcie-hisi-acpi.o pcie-hisi-common.o
>  obj-$(CONFIG_PCIE_QCOM) += pcie-qcom.o
> diff --git a/drivers/pci/host/pcie-hisi-acpi.c b/drivers/pci/host/pcie-hisi-acpi.c
> new file mode 100644
> index 0000000..3605260
> --- /dev/null
> +++ b/drivers/pci/host/pcie-hisi-acpi.c
> @@ -0,0 +1,188 @@
> +/*
> + * PCIe host controller driver for HiSilicon HipXX SoCs
> + *
> + * Copyright (C) 2016 HiSilicon Co., Ltd. http://www.hisilicon.com
> + *
> + * Author: Dongdong Liu <liudongdong3@huawei.com>
> + *         Gabriele Paoloni <gabriele.paoloni@huawei.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +#include <linux/interrupt.h>
> +#include <linux/acpi.h>
> +#include <linux/ecam.h>
> +#include <linux/pci.h>
> +#include <linux/pci-acpi.h>
> +#include "pcie-hisi.h"
> +
> +#define GET_PCIE_LINK_STATUS  0x0
> +
> +/* uuid 6d30f553-836c-408e-b6ad-45bccc957949 */
> +const u8 hisi_pcie_acpi_dsm_uuid[] = {
> +	0x53, 0xf5, 0x30, 0x6d, 0x6c, 0x83, 0x8e, 0x40,
> +	0xb6, 0xad, 0x45, 0xbc, 0xcc, 0x95, 0x79, 0x49
> +};
> +
> +static const struct acpi_device_id hisi_pcie_ids[] = {
> +	{"HISI0080", 0},
> +	{"", 0},
> +};
> +
> +static int hisi_pcie_get_addr(struct acpi_pci_root *root, const char *name,
> +				void __iomem **addr)
> +{
> +	struct acpi_device *device;
> +	u64 base;
> +	u64 size;
> +	u32 buf[4];
> +	int ret;
> +
> +	device =  root->device;
> +	ret = fwnode_property_read_u32_array(&device->fwnode, name,
> +					buf, ARRAY_SIZE(buf));
> +	if (ret) {
> +		dev_err(&device->dev, "can't get %s\n", name);
> +		return ret;
> +	}
> +
> +	base = ((u64)buf[0] << 32) | buf[1];
> +	size =  ((u64)buf[2] << 32) | buf[3];
> +	*addr = devm_ioremap(&device->dev, base, size);
> +	if (!(*addr)) {
> +		dev_err(&device->dev, "error with ioremap\n");
> +		return -ENOMEM;
> +	}

This does not seem like the correct way of describing an addres in ACPI.

> +
> +	return 0;
> +}
> +
> +
> +static int hisi_pcie_link_up_acpi(struct acpi_pci_root *root)
> +{
> +	u32 val;
> +	struct acpi_device *device;
> +	union acpi_object *obj;
> +
> +	device = root->device;
> +	obj = acpi_evaluate_dsm(device->handle,
> +		hisi_pcie_acpi_dsm_uuid, 0,
> +		GET_PCIE_LINK_STATUS, NULL);
> +
> +	if (!obj  ||  obj->type != ACPI_TYPE_INTEGER)  {
> +		dev_err(&device->dev, "can't get link status from _DSM\n");
> +		return 0;
> +	}
> +	val = obj->integer.value;
> +
> +	return ((val & PCIE_LTSSM_STATE_MASK) == PCIE_LTSSM_LINKUP_STATE);
> +
> +}
> +
> +/*
> + * Retrieve rc_dbi base and size from _DSD
> + * Name (_DSD, Package () {
> + *	ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
> + *	Package () {
> + *	Package () {"rc-dbi", Package () { 0x0, 0xb0080000, 0x0, 0x10000 }},
> + *	}
> + *	})
> + */

As above, this does not look right. ACPI has standard mechanisms for
describing addresses. Making something up like this is not a good idea.

Mark.
Gabriele Paoloni Feb. 10, 2016, 9:52 a.m. UTC | #2
Hi Mark

> -----Original Message-----
> From: Mark Rutland [mailto:mark.rutland@arm.com]
> Sent: 09 February 2016 18:24
> To: Gabriele Paoloni
> Cc: Guohanjun (Hanjun Guo); Wangzhou (B); liudongdong (C); Linuxarm;
> qiujiang; bhelgaas@google.com; arnd@arndb.de;
> Lorenzo.Pieralisi@arm.com; tn@semihalf.com; linux-pci@vger.kernel.org;
> linux-kernel@vger.kernel.org; xuwei (O); linux-acpi@vger.kernel.org;
> jcm@redhat.com; zhangjukuo; Liguozhu (Kenneth); linux-arm-
> kernel@lists.infradead.org
> Subject: Re: [RFC PATCH v3 3/3] PCI/ACPI: hisi: Add ACPI support for
> HiSilicon SoCs Host Controllers
> 
> On Tue, Feb 09, 2016 at 05:34:20PM +0000, Gabriele Paoloni wrote:
> > From: gabriele paoloni <gabriele.paoloni@huawei.com>
> >
> > This patch adds specific quirks for PCI config space accessors,
> > it uses _HID to decide whether to hook pci_ops or not.
> 
> If I understand correctly, this would mean that it's not actually ECAM
> compliant, then?

Correct we need out own methods to read/write the RC config space...see
for example:

+	if (bus->number == root->secondary.start)
+		return hisi_pcie_common_cfg_read(reg_base, where, size, val);
+
+	return pci_generic_config_read(bus, devfn, where, size, val);

> 
> > Signed-off-by: Dongdong Liu <liudongdong3@huawei.com>
> > Signed-off-by: Gabriele Paoloni <gabriele.paoloni@huawei.com>
> > ---
> >  MAINTAINERS                       |   1 +
> >  drivers/pci/host/Kconfig          |   8 ++
> >  drivers/pci/host/Makefile         |   1 +
> >  drivers/pci/host/pcie-hisi-acpi.c | 188
> ++++++++++++++++++++++++++++++++++++++
> >  drivers/pci/host/pcie-hisi.c      |   2 -
> >  drivers/pci/host/pcie-hisi.h      |   2 +
> >  6 files changed, 200 insertions(+), 2 deletions(-)
> >  create mode 100644 drivers/pci/host/pcie-hisi-acpi.c
> >
> > diff --git a/MAINTAINERS b/MAINTAINERS
> > index d69f436..f184c3e 100644
> > --- a/MAINTAINERS
> > +++ b/MAINTAINERS
> > @@ -8412,6 +8412,7 @@ F:
> 	Documentation/devicetree/bindings/pci/hisilicon-pcie.txt
> >  F:	drivers/pci/host/pcie-hisi.h
> >  F:	drivers/pci/host/pcie-hisi.c
> >  F:	drivers/pci/host/pcie-hisi-common.c
> > +F:	drivers/pci/host/pcie-hisi-acpi.c
> >
> >  PCIE DRIVER FOR QUALCOMM MSM
> >  M:     Stanimir Varbanov <svarbanov@mm-sol.com>
> > diff --git a/drivers/pci/host/Kconfig b/drivers/pci/host/Kconfig
> > index 75a6054..65b1add 100644
> > --- a/drivers/pci/host/Kconfig
> > +++ b/drivers/pci/host/Kconfig
> > @@ -181,6 +181,14 @@ config PCI_HISI
> >  	  Say Y here if you want PCIe controller support on HiSilicon
> >  	  Hip05 and Hip06 SoCs
> >
> > +config PCI_HISI_ACPI
> > +	depends on ACPI
> > +	bool "HiSilicon Hip05 and Hip06 SoCs ACPI PCIe controllers"
> > +	select ACPI_PCI_HOST_GENERIC
> > +	help
> > +	  Say Y here if you want ACPI PCIe controller support on
> HiSilicon
> > +	  Hip05 and Hip06 SoCs
> > +
> >  config PCIE_QCOM
> >  	bool "Qualcomm PCIe controller"
> >  	depends on ARCH_QCOM && OF
> > diff --git a/drivers/pci/host/Makefile b/drivers/pci/host/Makefile
> > index 8c93c0f..57e4379 100644
> > --- a/drivers/pci/host/Makefile
> > +++ b/drivers/pci/host/Makefile
> > @@ -21,4 +21,5 @@ obj-$(CONFIG_PCIE_IPROC_BCMA) += pcie-iproc-bcma.o
> >  obj-$(CONFIG_PCIE_ALTERA) += pcie-altera.o
> >  obj-$(CONFIG_PCIE_ALTERA_MSI) += pcie-altera-msi.o
> >  obj-$(CONFIG_PCI_HISI) += pcie-hisi.o pcie-hisi-common.o
> > +obj-$(CONFIG_PCI_HISI_ACPI) += pcie-hisi-acpi.o pcie-hisi-common.o
> >  obj-$(CONFIG_PCIE_QCOM) += pcie-qcom.o
> > diff --git a/drivers/pci/host/pcie-hisi-acpi.c
> b/drivers/pci/host/pcie-hisi-acpi.c
> > new file mode 100644
> > index 0000000..3605260
> > --- /dev/null
> > +++ b/drivers/pci/host/pcie-hisi-acpi.c
> > @@ -0,0 +1,188 @@
> > +/*
> > + * PCIe host controller driver for HiSilicon HipXX SoCs
> > + *
> > + * Copyright (C) 2016 HiSilicon Co., Ltd. http://www.hisilicon.com
> > + *
> > + * Author: Dongdong Liu <liudongdong3@huawei.com>
> > + *         Gabriele Paoloni <gabriele.paoloni@huawei.com>
> > + *
> > + * This program is free software; you can redistribute it and/or
> modify
> > + * it under the terms of the GNU General Public License version 2 as
> > + * published by the Free Software Foundation.
> > + */
> > +#include <linux/interrupt.h>
> > +#include <linux/acpi.h>
> > +#include <linux/ecam.h>
> > +#include <linux/pci.h>
> > +#include <linux/pci-acpi.h>
> > +#include "pcie-hisi.h"
> > +
> > +#define GET_PCIE_LINK_STATUS  0x0
> > +
> > +/* uuid 6d30f553-836c-408e-b6ad-45bccc957949 */
> > +const u8 hisi_pcie_acpi_dsm_uuid[] = {
> > +	0x53, 0xf5, 0x30, 0x6d, 0x6c, 0x83, 0x8e, 0x40,
> > +	0xb6, 0xad, 0x45, 0xbc, 0xcc, 0x95, 0x79, 0x49
> > +};
> > +
> > +static const struct acpi_device_id hisi_pcie_ids[] = {
> > +	{"HISI0080", 0},
> > +	{"", 0},
> > +};
> > +
> > +static int hisi_pcie_get_addr(struct acpi_pci_root *root, const char
> *name,
> > +				void __iomem **addr)
> > +{
> > +	struct acpi_device *device;
> > +	u64 base;
> > +	u64 size;
> > +	u32 buf[4];
> > +	int ret;
> > +
> > +	device =  root->device;
> > +	ret = fwnode_property_read_u32_array(&device->fwnode, name,
> > +					buf, ARRAY_SIZE(buf));
> > +	if (ret) {
> > +		dev_err(&device->dev, "can't get %s\n", name);
> > +		return ret;
> > +	}
> > +
> > +	base = ((u64)buf[0] << 32) | buf[1];
> > +	size =  ((u64)buf[2] << 32) | buf[3];
> > +	*addr = devm_ioremap(&device->dev, base, size);
> > +	if (!(*addr)) {
> > +		dev_err(&device->dev, "error with ioremap\n");
> > +		return -ENOMEM;
> > +	}
> 
> This does not seem like the correct way of describing an addres in
> ACPI.

Ok I guess this is related to the comment below...

> 
> > +
> > +	return 0;
> > +}
> > +
> > +
> > +static int hisi_pcie_link_up_acpi(struct acpi_pci_root *root)
> > +{
> > +	u32 val;
> > +	struct acpi_device *device;
> > +	union acpi_object *obj;
> > +
> > +	device = root->device;
> > +	obj = acpi_evaluate_dsm(device->handle,
> > +		hisi_pcie_acpi_dsm_uuid, 0,
> > +		GET_PCIE_LINK_STATUS, NULL);
> > +
> > +	if (!obj  ||  obj->type != ACPI_TYPE_INTEGER)  {
> > +		dev_err(&device->dev, "can't get link status from _DSM\n");
> > +		return 0;
> > +	}
> > +	val = obj->integer.value;
> > +
> > +	return ((val & PCIE_LTSSM_STATE_MASK) ==
> PCIE_LTSSM_LINKUP_STATE);
> > +
> > +}
> > +
> > +/*
> > + * Retrieve rc_dbi base and size from _DSD
> > + * Name (_DSD, Package () {
> > + *	ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
> > + *	Package () {
> > + *	Package () {"rc-dbi", Package () { 0x0, 0xb0080000, 0x0, 0x10000
> }},
> > + *	}
> > + *	})
> > + */
> 
> As above, this does not look right. ACPI has standard mechanisms for
> describing addresses. Making something up like this is not a good idea.
 
I am quite new to ACPI, may I ask you to explain a bit? 
In our case we need to put somewhere in the ACPI table the value of 
the base register address used to read/write the RC config space... 

Do you have any suggestion about the correct place to put it?

Many Thanks

Gab

> 
> Mark.
Mark Rutland Feb. 10, 2016, 11:12 a.m. UTC | #3
On Wed, Feb 10, 2016 at 09:52:36AM +0000, Gabriele Paoloni wrote:
> Hi Mark
> 
> > On Tue, Feb 09, 2016 at 05:34:20PM +0000, Gabriele Paoloni wrote:
> > > From: gabriele paoloni <gabriele.paoloni@huawei.com>
> > > +/*
> > > + * Retrieve rc_dbi base and size from _DSD
> > > + * Name (_DSD, Package () {
> > > + *	ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
> > > + *	Package () {
> > > + *	Package () {"rc-dbi", Package () { 0x0, 0xb0080000, 0x0, 0x10000
> > }},
> > > + *	}
> > > + *	})
> > > + */
> > 
> > As above, this does not look right. ACPI has standard mechanisms for
> > describing addresses. Making something up like this is not a good idea.
>  
> I am quite new to ACPI, may I ask you to explain a bit? 

ACPI has standard mechanisms for describing certain resources, and these
should not be described in _DSD. Memory or IO address regions are such
resources (in _CRS, IIRC), and should not be described in _DSD.

Thanks,
Mark.
Gabriele Paoloni Feb. 10, 2016, 2:45 p.m. UTC | #4
> -----Original Message-----
> From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-
> owner@vger.kernel.org] On Behalf Of Mark Rutland
> Sent: 10 February 2016 11:13
> To: Gabriele Paoloni
> Cc: Guohanjun (Hanjun Guo); Wangzhou (B); liudongdong (C); Linuxarm;
> qiujiang; bhelgaas@google.com; arnd@arndb.de;
> Lorenzo.Pieralisi@arm.com; tn@semihalf.com; linux-pci@vger.kernel.org;
> linux-kernel@vger.kernel.org; xuwei (O); linux-acpi@vger.kernel.org;
> jcm@redhat.com; zhangjukuo; Liguozhu (Kenneth); linux-arm-
> kernel@lists.infradead.org
> Subject: Re: [RFC PATCH v3 3/3] PCI/ACPI: hisi: Add ACPI support for
> HiSilicon SoCs Host Controllers
> 
> On Wed, Feb 10, 2016 at 09:52:36AM +0000, Gabriele Paoloni wrote:
> > Hi Mark
> >
> > > On Tue, Feb 09, 2016 at 05:34:20PM +0000, Gabriele Paoloni wrote:
> > > > From: gabriele paoloni <gabriele.paoloni@huawei.com>
> > > > +/*
> > > > + * Retrieve rc_dbi base and size from _DSD
> > > > + * Name (_DSD, Package () {
> > > > + *	ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
> > > > + *	Package () {
> > > > + *	Package () {"rc-dbi", Package () { 0x0, 0xb0080000, 0x0,
> 0x10000
> > > }},
> > > > + *	}
> > > > + *	})
> > > > + */
> > >
> > > As above, this does not look right. ACPI has standard mechanisms
> for
> > > describing addresses. Making something up like this is not a good
> idea.
> >
> > I am quite new to ACPI, may I ask you to explain a bit?
> 
> ACPI has standard mechanisms for describing certain resources, and
> these
> should not be described in _DSD. Memory or IO address regions are such
> resources (in _CRS, IIRC), and should not be described in _DSD.

Hi Mark,

In my case I think in need to look into the MCFG object as the problem
I have is RC using a different range than the rest of the hierarchy.

I'll investigate this and try to come with a solution in v4

Many Thanks

Gab 

> 
> Thanks,
> Mark.
Gabriele Paoloni Feb. 23, 2016, 2:47 a.m. UTC | #5
Hi Mark

> -----Original Message-----
> From: Gabriele Paoloni
> Sent: 10 February 2016 22:45
> To: Mark Rutland
> Cc: Guohanjun (Hanjun Guo); Wangzhou (B); liudongdong (C); Linuxarm;
> qiujiang; bhelgaas@google.com; arnd@arndb.de; Lorenzo.Pieralisi@arm.com;
> tn@semihalf.com; linux-pci@vger.kernel.org; linux-
> kernel@vger.kernel.org; xuwei (O); linux-acpi@vger.kernel.org;
> jcm@redhat.com; zhangjukuo; Liguozhu (Kenneth); linux-arm-
> kernel@lists.infradead.org
> Subject: RE: [RFC PATCH v3 3/3] PCI/ACPI: hisi: Add ACPI support for
> HiSilicon SoCs Host Controllers
> 
> > -----Original Message-----
> > From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-
> > owner@vger.kernel.org] On Behalf Of Mark Rutland
> > Sent: 10 February 2016 11:13
> > To: Gabriele Paoloni
> > Cc: Guohanjun (Hanjun Guo); Wangzhou (B); liudongdong (C); Linuxarm;
> > qiujiang; bhelgaas@google.com; arnd@arndb.de;
> > Lorenzo.Pieralisi@arm.com; tn@semihalf.com; linux-pci@vger.kernel.org;
> > linux-kernel@vger.kernel.org; xuwei (O); linux-acpi@vger.kernel.org;
> > jcm@redhat.com; zhangjukuo; Liguozhu (Kenneth); linux-arm-
> > kernel@lists.infradead.org
> > Subject: Re: [RFC PATCH v3 3/3] PCI/ACPI: hisi: Add ACPI support for
> > HiSilicon SoCs Host Controllers
> >
> > On Wed, Feb 10, 2016 at 09:52:36AM +0000, Gabriele Paoloni wrote:
> > > Hi Mark
> > >
> > > > On Tue, Feb 09, 2016 at 05:34:20PM +0000, Gabriele Paoloni wrote:
> > > > > From: gabriele paoloni <gabriele.paoloni@huawei.com>
> > > > > +/*
> > > > > + * Retrieve rc_dbi base and size from _DSD
> > > > > + * Name (_DSD, Package () {
> > > > > + *	ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
> > > > > + *	Package () {
> > > > > + *	Package () {"rc-dbi", Package () { 0x0, 0xb0080000, 0x0,
> > 0x10000
> > > > }},
> > > > > + *	}
> > > > > + *	})
> > > > > + */
> > > >
> > > > As above, this does not look right. ACPI has standard mechanisms
> > for
> > > > describing addresses. Making something up like this is not a good
> > idea.
> > >
> > > I am quite new to ACPI, may I ask you to explain a bit?
> >
> > ACPI has standard mechanisms for describing certain resources, and
> > these
> > should not be described in _DSD. Memory or IO address regions are
> such
> > resources (in _CRS, IIRC), and should not be described in _DSD.
> 
> Hi Mark,
> 
> In my case I think in need to look into the MCFG object as the problem
> I have is RC using a different range than the rest of the hierarchy.
> 
> I'll investigate this and try to come with a solution in v4

I have looked into this and in our case we cannot use the
standard MCFG object to pass the RC config space addresses.

The reason is that in our HW we have the config base addresses of the 
root complex ports that are less than 0x100000 byte distant one from
the other as we only map the first 0x10000 bytes.

Now the MCFG acpi framework always fix the MCFG resource size to 0x100000
for each bus; therefore if we pass our RC addresses through MCFG we end
up with a resource conflict.

To give you a practical example we are in a situation where we have:

port0: [0x00000000b0080000 - 0x00000000b0080000 + 0x10000]
port1: [0x00000000b0090000 - 0x00000000b0090000 + 0x10000]
port2: [0x00000000b00A0000 - 0x00000000b00A0000 + 0x10000]
port3: [0x00000000b00B0000 - 0x00000000b00B0000 + 0x10000]
  
So if we pass the base addresses through MCFG the resources
will overlap as MCFG will consider 0x100000 size for each base
address of the root complex (only the RC bus uses that address)

So far I do not see many option other than using _DSD to pass
these RC config base addresses.

Thanks and Regards

Gab 

> 
> Many Thanks
> 
> Gab
> 
> >
> > Thanks,
> > Mark.
Bjorn Helgaas Feb. 24, 2016, 1:14 a.m. UTC | #6
On Tue, Feb 23, 2016 at 02:47:22AM +0000, Gabriele Paoloni wrote:
> > -----Original Message-----
> > From: Gabriele Paoloni
> > Sent: 10 February 2016 22:45
> > To: Mark Rutland
> > Cc: Guohanjun (Hanjun Guo); Wangzhou (B); liudongdong (C); Linuxarm;
> > qiujiang; bhelgaas@google.com; arnd@arndb.de; Lorenzo.Pieralisi@arm.com;
> > tn@semihalf.com; linux-pci@vger.kernel.org; linux-
> > kernel@vger.kernel.org; xuwei (O); linux-acpi@vger.kernel.org;
> > jcm@redhat.com; zhangjukuo; Liguozhu (Kenneth); linux-arm-
> > kernel@lists.infradead.org
> > Subject: RE: [RFC PATCH v3 3/3] PCI/ACPI: hisi: Add ACPI support for
> > HiSilicon SoCs Host Controllers
> > 
> > > -----Original Message-----
> > > From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-
> > > owner@vger.kernel.org] On Behalf Of Mark Rutland
> > > Sent: 10 February 2016 11:13
> > > To: Gabriele Paoloni
> > > Cc: Guohanjun (Hanjun Guo); Wangzhou (B); liudongdong (C); Linuxarm;
> > > qiujiang; bhelgaas@google.com; arnd@arndb.de;
> > > Lorenzo.Pieralisi@arm.com; tn@semihalf.com; linux-pci@vger.kernel.org;
> > > linux-kernel@vger.kernel.org; xuwei (O); linux-acpi@vger.kernel.org;
> > > jcm@redhat.com; zhangjukuo; Liguozhu (Kenneth); linux-arm-
> > > kernel@lists.infradead.org
> > > Subject: Re: [RFC PATCH v3 3/3] PCI/ACPI: hisi: Add ACPI support for
> > > HiSilicon SoCs Host Controllers
> > >
> > > On Wed, Feb 10, 2016 at 09:52:36AM +0000, Gabriele Paoloni wrote:
> > > > Hi Mark
> > > >
> > > > > On Tue, Feb 09, 2016 at 05:34:20PM +0000, Gabriele Paoloni wrote:
> > > > > > From: gabriele paoloni <gabriele.paoloni@huawei.com>
> > > > > > +/*
> > > > > > + * Retrieve rc_dbi base and size from _DSD
> > > > > > + * Name (_DSD, Package () {
> > > > > > + *	ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
> > > > > > + *	Package () {
> > > > > > + *	Package () {"rc-dbi", Package () { 0x0, 0xb0080000, 0x0,
> > > 0x10000
> > > > > }},
> > > > > > + *	}
> > > > > > + *	})
> > > > > > + */
> > > > >
> > > > > As above, this does not look right. ACPI has standard mechanisms
> > > for
> > > > > describing addresses. Making something up like this is not a good
> > > idea.
> > > >
> > > > I am quite new to ACPI, may I ask you to explain a bit?
> > >
> > > ACPI has standard mechanisms for describing certain resources, and
> > > these
> > > should not be described in _DSD. Memory or IO address regions are
> > such
> > > resources (in _CRS, IIRC), and should not be described in _DSD.
> > 
> > Hi Mark,
> > 
> > In my case I think in need to look into the MCFG object as the problem
> > I have is RC using a different range than the rest of the hierarchy.
> > 
> > I'll investigate this and try to come with a solution in v4
> 
> I have looked into this and in our case we cannot use the
> standard MCFG object to pass the RC config space addresses.
> 
> The reason is that in our HW we have the config base addresses of the 
> root complex ports that are less than 0x100000 byte distant one from
> the other as we only map the first 0x10000 bytes.

The ECAM spec requires 4096 bytes per function, 8 functions per
device, 32 devices per bus, which means you need 0x100000 bytes of
address space per bus.  If your device doesn't supply that, it doesn't
really implement ECAM, and you probably can't use the standard ways of
describing ECAM (MCFG, _CBA).

> Now the MCFG acpi framework always fix the MCFG resource size to 0x100000
> for each bus; therefore if we pass our RC addresses through MCFG we end
> up with a resource conflict.
> 
> To give you a practical example we are in a situation where we have:
> 
> port0: [0x00000000b0080000 - 0x00000000b0080000 + 0x10000]
> port1: [0x00000000b0090000 - 0x00000000b0090000 + 0x10000]
> port2: [0x00000000b00A0000 - 0x00000000b00A0000 + 0x10000]
> port3: [0x00000000b00B0000 - 0x00000000b00B0000 + 0x10000]
>   
> So if we pass the base addresses through MCFG the resources
> will overlap as MCFG will consider 0x100000 size for each base
> address of the root complex (only the RC bus uses that address)
> 
> So far I do not see many option other than using _DSD to pass
> these RC config base addresses.

I don't want to take over Mark's discussion, but I really do not think
_DSD is the correct way to fix this.  _CRS is like a generalized PCI
BAR.  A PCI device is only allowed to respond to address space
reported via a BAR.  An ACPI device is only allowed to respond to
address space reported via _CRS.  Those are important rules because
they mean we can manage address space with generic code instead of
device-specific code.

Bjorn
Gabriele Paoloni Feb. 24, 2016, 6:46 a.m. UTC | #7
Hi Bjorn, many thanks for replying

> -----Original Message-----
> From: Bjorn Helgaas [mailto:helgaas@kernel.org]
> Sent: 24 February 2016 09:14
> To: Gabriele Paoloni
> Cc: 'Mark Rutland'; Guohanjun (Hanjun Guo); Wangzhou (B); liudongdong
> (C); Linuxarm; qiujiang; 'bhelgaas@google.com'; 'arnd@arndb.de';
> 'Lorenzo.Pieralisi@arm.com'; 'tn@semihalf.com'; 'linux-
> pci@vger.kernel.org'; 'linux-kernel@vger.kernel.org'; xuwei (O);
> 'linux-acpi@vger.kernel.org'; 'jcm@redhat.com'; zhangjukuo; Liguozhu
> (Kenneth); 'linux-arm-kernel@lists.infradead.org'
> Subject: Re: [RFC PATCH v3 3/3] PCI/ACPI: hisi: Add ACPI support for
> HiSilicon SoCs Host Controllers
> 
> On Tue, Feb 23, 2016 at 02:47:22AM +0000, Gabriele Paoloni wrote:
> > > -----Original Message-----
> > > From: Gabriele Paoloni
> > > Sent: 10 February 2016 22:45
> > > To: Mark Rutland
> > > Cc: Guohanjun (Hanjun Guo); Wangzhou (B); liudongdong (C); Linuxarm;
> > > qiujiang; bhelgaas@google.com; arnd@arndb.de;
> Lorenzo.Pieralisi@arm.com;
> > > tn@semihalf.com; linux-pci@vger.kernel.org; linux-
> > > kernel@vger.kernel.org; xuwei (O); linux-acpi@vger.kernel.org;
> > > jcm@redhat.com; zhangjukuo; Liguozhu (Kenneth); linux-arm-
> > > kernel@lists.infradead.org
> > > Subject: RE: [RFC PATCH v3 3/3] PCI/ACPI: hisi: Add ACPI support
> for
> > > HiSilicon SoCs Host Controllers
> > >
> > > > -----Original Message-----
> > > > From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-
> > > > owner@vger.kernel.org] On Behalf Of Mark Rutland
> > > > Sent: 10 February 2016 11:13
> > > > To: Gabriele Paoloni
> > > > Cc: Guohanjun (Hanjun Guo); Wangzhou (B); liudongdong (C);
> Linuxarm;
> > > > qiujiang; bhelgaas@google.com; arnd@arndb.de;
> > > > Lorenzo.Pieralisi@arm.com; tn@semihalf.com; linux-
> pci@vger.kernel.org;
> > > > linux-kernel@vger.kernel.org; xuwei (O); linux-
> acpi@vger.kernel.org;
> > > > jcm@redhat.com; zhangjukuo; Liguozhu (Kenneth); linux-arm-
> > > > kernel@lists.infradead.org
> > > > Subject: Re: [RFC PATCH v3 3/3] PCI/ACPI: hisi: Add ACPI support
> for
> > > > HiSilicon SoCs Host Controllers
> > > >
> > > > On Wed, Feb 10, 2016 at 09:52:36AM +0000, Gabriele Paoloni wrote:
> > > > > Hi Mark
> > > > >
> > > > > > On Tue, Feb 09, 2016 at 05:34:20PM +0000, Gabriele Paoloni
> wrote:
> > > > > > > From: gabriele paoloni <gabriele.paoloni@huawei.com>
> > > > > > > +/*
> > > > > > > + * Retrieve rc_dbi base and size from _DSD
> > > > > > > + * Name (_DSD, Package () {
> > > > > > > + *	ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
> > > > > > > + *	Package () {
> > > > > > > + *	Package () {"rc-dbi", Package () { 0x0, 0xb0080000,
> 0x0,
> > > > 0x10000
> > > > > > }},
> > > > > > > + *	}
> > > > > > > + *	})
> > > > > > > + */
> > > > > >
> > > > > > As above, this does not look right. ACPI has standard
> mechanisms
> > > > for
> > > > > > describing addresses. Making something up like this is not a
> good
> > > > idea.
> > > > >
> > > > > I am quite new to ACPI, may I ask you to explain a bit?
> > > >
> > > > ACPI has standard mechanisms for describing certain resources,
> and
> > > > these
> > > > should not be described in _DSD. Memory or IO address regions are
> > > such
> > > > resources (in _CRS, IIRC), and should not be described in _DSD.
> > >
> > > Hi Mark,
> > >
> > > In my case I think in need to look into the MCFG object as the
> problem
> > > I have is RC using a different range than the rest of the hierarchy.
> > >
> > > I'll investigate this and try to come with a solution in v4
> >
> > I have looked into this and in our case we cannot use the
> > standard MCFG object to pass the RC config space addresses.
> >
> > The reason is that in our HW we have the config base addresses of the
> > root complex ports that are less than 0x100000 byte distant one from
> > the other as we only map the first 0x10000 bytes.
> 
> The ECAM spec requires 4096 bytes per function, 8 functions per
> device, 32 devices per bus, which means you need 0x100000 bytes of
> address space per bus.  If your device doesn't supply that, it doesn't
> really implement ECAM, and you probably can't use the standard ways of
> describing ECAM (MCFG, _CBA).

Correct, our host bridge is non ECAM only for the RC bus config space;
for any other bus underneath the root bus we support ECAM access, so
we need a quirk for the RC config rd/wr  

> 
> > Now the MCFG acpi framework always fix the MCFG resource size to
> 0x100000
> > for each bus; therefore if we pass our RC addresses through MCFG we
> end
> > up with a resource conflict.
> >
> > To give you a practical example we are in a situation where we have:
> >
> > port0: [0x00000000b0080000 - 0x00000000b0080000 + 0x10000]
> > port1: [0x00000000b0090000 - 0x00000000b0090000 + 0x10000]
> > port2: [0x00000000b00A0000 - 0x00000000b00A0000 + 0x10000]
> > port3: [0x00000000b00B0000 - 0x00000000b00B0000 + 0x10000]
> >
> > So if we pass the base addresses through MCFG the resources
> > will overlap as MCFG will consider 0x100000 size for each base
> > address of the root complex (only the RC bus uses that address)
> >
> > So far I do not see many option other than using _DSD to pass
> > these RC config base addresses.
> 
> I don't want to take over Mark's discussion, but I really do not think
> _DSD is the correct way to fix this.  _CRS is like a generalized PCI
> BAR.  A PCI device is only allowed to respond to address space
> reported via a BAR.  An ACPI device is only allowed to respond to
> address space reported via _CRS.  Those are important rules because
> they mean we can manage address space with generic code instead of
> device-specific code.

From what I see in 
http://lxr.free-electrons.com/source/drivers/acpi/pci_root.c#L723
acpi_pci_probe_root_resources() parses the _CRS method and
it only works for MEM and IO resources, so I don't think it is 
right to pass a config space address by _CRS or to modify the 
current ACPI framework to support this.

On the other side, since this is an exception only for the config
space address of our host controller (as said before all the buses
below the root one support ECAM), I think that it is right to put
this address as a device specific data (in fact the rest of the
config space addresses will be parsed from MCFG).
I thought the purpose of the quirks from Nowicki patchset were meant
to handle this sort of special cases...


Thanks and Regards

Gab

> 
> Bjorn
Bjorn Helgaas Feb. 24, 2016, 3:25 p.m. UTC | #8
On Wed, Feb 24, 2016 at 06:46:09AM +0000, Gabriele Paoloni wrote:
> 
> Hi Bjorn, many thanks for replying
> 
> > -----Original Message-----
> > From: Bjorn Helgaas [mailto:helgaas@kernel.org]
> > Sent: 24 February 2016 09:14
> > To: Gabriele Paoloni
> > Cc: 'Mark Rutland'; Guohanjun (Hanjun Guo); Wangzhou (B); liudongdong
> > (C); Linuxarm; qiujiang; 'bhelgaas@google.com'; 'arnd@arndb.de';
> > 'Lorenzo.Pieralisi@arm.com'; 'tn@semihalf.com'; 'linux-
> > pci@vger.kernel.org'; 'linux-kernel@vger.kernel.org'; xuwei (O);
> > 'linux-acpi@vger.kernel.org'; 'jcm@redhat.com'; zhangjukuo; Liguozhu
> > (Kenneth); 'linux-arm-kernel@lists.infradead.org'
> > Subject: Re: [RFC PATCH v3 3/3] PCI/ACPI: hisi: Add ACPI support for
> > HiSilicon SoCs Host Controllers
> > 
> > On Tue, Feb 23, 2016 at 02:47:22AM +0000, Gabriele Paoloni wrote:
> > > > -----Original Message-----
> > > > From: Gabriele Paoloni
> > > > Sent: 10 February 2016 22:45
> > > > To: Mark Rutland
> > > > Cc: Guohanjun (Hanjun Guo); Wangzhou (B); liudongdong (C); Linuxarm;
> > > > qiujiang; bhelgaas@google.com; arnd@arndb.de;
> > Lorenzo.Pieralisi@arm.com;
> > > > tn@semihalf.com; linux-pci@vger.kernel.org; linux-
> > > > kernel@vger.kernel.org; xuwei (O); linux-acpi@vger.kernel.org;
> > > > jcm@redhat.com; zhangjukuo; Liguozhu (Kenneth); linux-arm-
> > > > kernel@lists.infradead.org
> > > > Subject: RE: [RFC PATCH v3 3/3] PCI/ACPI: hisi: Add ACPI support
> > for
> > > > HiSilicon SoCs Host Controllers
> > > >
> > > > > -----Original Message-----
> > > > > From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-
> > > > > owner@vger.kernel.org] On Behalf Of Mark Rutland
> > > > > Sent: 10 February 2016 11:13
> > > > > To: Gabriele Paoloni
> > > > > Cc: Guohanjun (Hanjun Guo); Wangzhou (B); liudongdong (C);
> > Linuxarm;
> > > > > qiujiang; bhelgaas@google.com; arnd@arndb.de;
> > > > > Lorenzo.Pieralisi@arm.com; tn@semihalf.com; linux-
> > pci@vger.kernel.org;
> > > > > linux-kernel@vger.kernel.org; xuwei (O); linux-
> > acpi@vger.kernel.org;
> > > > > jcm@redhat.com; zhangjukuo; Liguozhu (Kenneth); linux-arm-
> > > > > kernel@lists.infradead.org
> > > > > Subject: Re: [RFC PATCH v3 3/3] PCI/ACPI: hisi: Add ACPI support
> > for
> > > > > HiSilicon SoCs Host Controllers
> > > > >
> > > > > On Wed, Feb 10, 2016 at 09:52:36AM +0000, Gabriele Paoloni wrote:
> > > > > > Hi Mark
> > > > > >
> > > > > > > On Tue, Feb 09, 2016 at 05:34:20PM +0000, Gabriele Paoloni
> > wrote:
> > > > > > > > From: gabriele paoloni <gabriele.paoloni@huawei.com>
> > > > > > > > +/*
> > > > > > > > + * Retrieve rc_dbi base and size from _DSD
> > > > > > > > + * Name (_DSD, Package () {
> > > > > > > > + *	ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
> > > > > > > > + *	Package () {
> > > > > > > > + *	Package () {"rc-dbi", Package () { 0x0, 0xb0080000,
> > 0x0,
> > > > > 0x10000
> > > > > > > }},
> > > > > > > > + *	}
> > > > > > > > + *	})
> > > > > > > > + */
> > > > > > >
> > > > > > > As above, this does not look right. ACPI has standard
> > mechanisms
> > > > > for
> > > > > > > describing addresses. Making something up like this is not a
> > good
> > > > > idea.
> > > > > >
> > > > > > I am quite new to ACPI, may I ask you to explain a bit?
> > > > >
> > > > > ACPI has standard mechanisms for describing certain resources,
> > and
> > > > > these
> > > > > should not be described in _DSD. Memory or IO address regions are
> > > > such
> > > > > resources (in _CRS, IIRC), and should not be described in _DSD.
> > > >
> > > > Hi Mark,
> > > >
> > > > In my case I think in need to look into the MCFG object as the
> > problem
> > > > I have is RC using a different range than the rest of the hierarchy.
> > > >
> > > > I'll investigate this and try to come with a solution in v4
> > >
> > > I have looked into this and in our case we cannot use the
> > > standard MCFG object to pass the RC config space addresses.
> > >
> > > The reason is that in our HW we have the config base addresses of the
> > > root complex ports that are less than 0x100000 byte distant one from
> > > the other as we only map the first 0x10000 bytes.
> > 
> > The ECAM spec requires 4096 bytes per function, 8 functions per
> > device, 32 devices per bus, which means you need 0x100000 bytes of
> > address space per bus.  If your device doesn't supply that, it doesn't
> > really implement ECAM, and you probably can't use the standard ways of
> > describing ECAM (MCFG, _CBA).
> 
> Correct, our host bridge is non ECAM only for the RC bus config space;
> for any other bus underneath the root bus we support ECAM access, so
> we need a quirk for the RC config rd/wr  

MCFG can specify a bus number range, so you might be able to use it
to describe the ECAM space for buses below the root bus, e.g.,
[bus 01-ff].

> > > Now the MCFG acpi framework always fix the MCFG resource size to
> > 0x100000
> > > for each bus; therefore if we pass our RC addresses through MCFG we
> > end
> > > up with a resource conflict.
> > >
> > > To give you a practical example we are in a situation where we have:
> > >
> > > port0: [0x00000000b0080000 - 0x00000000b0080000 + 0x10000]
> > > port1: [0x00000000b0090000 - 0x00000000b0090000 + 0x10000]
> > > port2: [0x00000000b00A0000 - 0x00000000b00A0000 + 0x10000]
> > > port3: [0x00000000b00B0000 - 0x00000000b00B0000 + 0x10000]
> > >
> > > So if we pass the base addresses through MCFG the resources
> > > will overlap as MCFG will consider 0x100000 size for each base
> > > address of the root complex (only the RC bus uses that address)
> > >
> > > So far I do not see many option other than using _DSD to pass
> > > these RC config base addresses.
> > 
> > I don't want to take over Mark's discussion, but I really do not think
> > _DSD is the correct way to fix this.  _CRS is like a generalized PCI
> > BAR.  A PCI device is only allowed to respond to address space
> > reported via a BAR.  An ACPI device is only allowed to respond to
> > address space reported via _CRS.  Those are important rules because
> > they mean we can manage address space with generic code instead of
> > device-specific code.
> 
> From what I see in 
> http://lxr.free-electrons.com/source/drivers/acpi/pci_root.c#L723
> acpi_pci_probe_root_resources() parses the _CRS method and
> it only works for MEM and IO resources, so I don't think it is 
> right to pass a config space address by _CRS or to modify the 
> current ACPI framework to support this.

Config space addresses are made up of [bus, device, function, register
offset], and you're right that _CRS doesn't contain those addresses
directly; _CRS only describes memory, I/O, and bus number ranges.

But part of the function of a host bridge is to convert memory or I/O
accesses on the primary (CPU) side to PCI config accesses on the
secondary (PCI) side, and this CPU-side memory or I/O region should be
reported via _CRS.

I think the relevant spec is the PCI Firmware Spec, r3.0, sec 4.1.2.
Note 2 in that section says the address range of an MMCFG region
must be reserved by declaring a motherboard resource, i.e., included
in the _CRS of a PNP0C02 or similar device.

> On the other side, since this is an exception only for the config
> space address of our host controller (as said before all the buses
> below the root one support ECAM), I think that it is right to put
> this address as a device specific data (in fact the rest of the
> config space addresses will be parsed from MCFG).

A kernel with no support for your host controller (and thus no
knowledge of its _DSD) should still be able to operate the rest of the
system correctly.  That means we must have a generic way to learn what
address space is consumed by the host controller so we don't try to
assign it to other devices.

Bjorn
Gabriele Paoloni Feb. 25, 2016, 3:01 a.m. UTC | #9
Hi Bjorn

> -----Original Message-----
> From: Bjorn Helgaas [mailto:helgaas@kernel.org]
> Sent: 24 February 2016 23:26
> To: Gabriele Paoloni
> Cc: 'Mark Rutland'; Guohanjun (Hanjun Guo); Wangzhou (B); liudongdong
> (C); Linuxarm; qiujiang; 'bhelgaas@google.com'; 'arnd@arndb.de';
> 'Lorenzo.Pieralisi@arm.com'; 'tn@semihalf.com'; 'linux-
> pci@vger.kernel.org'; 'linux-kernel@vger.kernel.org'; xuwei (O);
> 'linux-acpi@vger.kernel.org'; 'jcm@redhat.com'; zhangjukuo; Liguozhu
> (Kenneth); 'linux-arm-kernel@lists.infradead.org'
> Subject: Re: [RFC PATCH v3 3/3] PCI/ACPI: hisi: Add ACPI support for
> HiSilicon SoCs Host Controllers
> 
> On Wed, Feb 24, 2016 at 06:46:09AM +0000, Gabriele Paoloni wrote:
> >
> > Hi Bjorn, many thanks for replying
> >
> > > -----Original Message-----
> > > From: Bjorn Helgaas [mailto:helgaas@kernel.org]
> > > Sent: 24 February 2016 09:14
> > > To: Gabriele Paoloni
> > > Cc: 'Mark Rutland'; Guohanjun (Hanjun Guo); Wangzhou (B);
> liudongdong
> > > (C); Linuxarm; qiujiang; 'bhelgaas@google.com'; 'arnd@arndb.de';
> > > 'Lorenzo.Pieralisi@arm.com'; 'tn@semihalf.com'; 'linux-
> > > pci@vger.kernel.org'; 'linux-kernel@vger.kernel.org'; xuwei (O);
> > > 'linux-acpi@vger.kernel.org'; 'jcm@redhat.com'; zhangjukuo;
> Liguozhu
> > > (Kenneth); 'linux-arm-kernel@lists.infradead.org'
> > > Subject: Re: [RFC PATCH v3 3/3] PCI/ACPI: hisi: Add ACPI support
> for
> > > HiSilicon SoCs Host Controllers
> > >
> > > On Tue, Feb 23, 2016 at 02:47:22AM +0000, Gabriele Paoloni wrote:
> > > > > -----Original Message-----
> > > > > From: Gabriele Paoloni
> > > > > Sent: 10 February 2016 22:45
> > > > > To: Mark Rutland
> > > > > Cc: Guohanjun (Hanjun Guo); Wangzhou (B); liudongdong (C);
> Linuxarm;
> > > > > qiujiang; bhelgaas@google.com; arnd@arndb.de;
> > > Lorenzo.Pieralisi@arm.com;
> > > > > tn@semihalf.com; linux-pci@vger.kernel.org; linux-
> > > > > kernel@vger.kernel.org; xuwei (O); linux-acpi@vger.kernel.org;
> > > > > jcm@redhat.com; zhangjukuo; Liguozhu (Kenneth); linux-arm-
> > > > > kernel@lists.infradead.org
> > > > > Subject: RE: [RFC PATCH v3 3/3] PCI/ACPI: hisi: Add ACPI
> support
> > > for
> > > > > HiSilicon SoCs Host Controllers
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: linux-kernel-owner@vger.kernel.org [mailto:linux-
> kernel-
> > > > > > owner@vger.kernel.org] On Behalf Of Mark Rutland
> > > > > > Sent: 10 February 2016 11:13
> > > > > > To: Gabriele Paoloni
> > > > > > Cc: Guohanjun (Hanjun Guo); Wangzhou (B); liudongdong (C);
> > > Linuxarm;
> > > > > > qiujiang; bhelgaas@google.com; arnd@arndb.de;
> > > > > > Lorenzo.Pieralisi@arm.com; tn@semihalf.com; linux-
> > > pci@vger.kernel.org;
> > > > > > linux-kernel@vger.kernel.org; xuwei (O); linux-
> > > acpi@vger.kernel.org;
> > > > > > jcm@redhat.com; zhangjukuo; Liguozhu (Kenneth); linux-arm-
> > > > > > kernel@lists.infradead.org
> > > > > > Subject: Re: [RFC PATCH v3 3/3] PCI/ACPI: hisi: Add ACPI
> support
> > > for
> > > > > > HiSilicon SoCs Host Controllers
> > > > > >
> > > > > > On Wed, Feb 10, 2016 at 09:52:36AM +0000, Gabriele Paoloni
> wrote:
> > > > > > > Hi Mark
> > > > > > >
> > > > > > > > On Tue, Feb 09, 2016 at 05:34:20PM +0000, Gabriele
> Paoloni
> > > wrote:
> > > > > > > > > From: gabriele paoloni <gabriele.paoloni@huawei.com>
> > > > > > > > > +/*
> > > > > > > > > + * Retrieve rc_dbi base and size from _DSD
> > > > > > > > > + * Name (_DSD, Package () {
> > > > > > > > > + *	ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
> > > > > > > > > + *	Package () {
> > > > > > > > > + *	Package () {"rc-dbi", Package () { 0x0,
> 0xb0080000,
> > > 0x0,
> > > > > > 0x10000
> > > > > > > > }},
> > > > > > > > > + *	}
> > > > > > > > > + *	})
> > > > > > > > > + */
> > > > > > > >
> > > > > > > > As above, this does not look right. ACPI has standard
> > > mechanisms
> > > > > > for
> > > > > > > > describing addresses. Making something up like this is
> not a
> > > good
> > > > > > idea.
> > > > > > >
> > > > > > > I am quite new to ACPI, may I ask you to explain a bit?
> > > > > >
> > > > > > ACPI has standard mechanisms for describing certain resources,
> > > and
> > > > > > these
> > > > > > should not be described in _DSD. Memory or IO address regions
> are
> > > > > such
> > > > > > resources (in _CRS, IIRC), and should not be described in
> _DSD.
> > > > >
> > > > > Hi Mark,
> > > > >
> > > > > In my case I think in need to look into the MCFG object as the
> > > problem
> > > > > I have is RC using a different range than the rest of the
> hierarchy.
> > > > >
> > > > > I'll investigate this and try to come with a solution in v4
> > > >
> > > > I have looked into this and in our case we cannot use the
> > > > standard MCFG object to pass the RC config space addresses.
> > > >
> > > > The reason is that in our HW we have the config base addresses of
> the
> > > > root complex ports that are less than 0x100000 byte distant one
> from
> > > > the other as we only map the first 0x10000 bytes.
> > >
> > > The ECAM spec requires 4096 bytes per function, 8 functions per
> > > device, 32 devices per bus, which means you need 0x100000 bytes of
> > > address space per bus.  If your device doesn't supply that, it
> doesn't
> > > really implement ECAM, and you probably can't use the standard ways
> of
> > > describing ECAM (MCFG, _CBA).
> >
> > Correct, our host bridge is non ECAM only for the RC bus config space;
> > for any other bus underneath the root bus we support ECAM access, so
> > we need a quirk for the RC config rd/wr
> 
> MCFG can specify a bus number range, so you might be able to use it
> to describe the ECAM space for buses below the root bus, e.g.,
> [bus 01-ff].

Yes, for this patchset we use MCFG for buses [01-ff] and _DSD for bus 0

> 
> > > > Now the MCFG acpi framework always fix the MCFG resource size to
> > > 0x100000
> > > > for each bus; therefore if we pass our RC addresses through MCFG
> we
> > > end
> > > > up with a resource conflict.
> > > >
> > > > To give you a practical example we are in a situation where we
> have:
> > > >
> > > > port0: [0x00000000b0080000 - 0x00000000b0080000 + 0x10000]
> > > > port1: [0x00000000b0090000 - 0x00000000b0090000 + 0x10000]
> > > > port2: [0x00000000b00A0000 - 0x00000000b00A0000 + 0x10000]
> > > > port3: [0x00000000b00B0000 - 0x00000000b00B0000 + 0x10000]
> > > >
> > > > So if we pass the base addresses through MCFG the resources
> > > > will overlap as MCFG will consider 0x100000 size for each base
> > > > address of the root complex (only the RC bus uses that address)
> > > >
> > > > So far I do not see many option other than using _DSD to pass
> > > > these RC config base addresses.
> > >
> > > I don't want to take over Mark's discussion, but I really do not
> think
> > > _DSD is the correct way to fix this.  _CRS is like a generalized
> PCI
> > > BAR.  A PCI device is only allowed to respond to address space
> > > reported via a BAR.  An ACPI device is only allowed to respond to
> > > address space reported via _CRS.  Those are important rules because
> > > they mean we can manage address space with generic code instead of
> > > device-specific code.
> >
> > From what I see in
> > http://lxr.free-electrons.com/source/drivers/acpi/pci_root.c#L723
> > acpi_pci_probe_root_resources() parses the _CRS method and
> > it only works for MEM and IO resources, so I don't think it is
> > right to pass a config space address by _CRS or to modify the
> > current ACPI framework to support this.
> 
> Config space addresses are made up of [bus, device, function, register
> offset], and you're right that _CRS doesn't contain those addresses
> directly; _CRS only describes memory, I/O, and bus number ranges.
> 
> But part of the function of a host bridge is to convert memory or I/O
> accesses on the primary (CPU) side to PCI config accesses on the
> secondary (PCI) side, and this CPU-side memory or I/O region should be
> reported via _CRS.

Yes the memory and I/O ranges are specified in the _CRS method of the
root complex PCI device

> 
> I think the relevant spec is the PCI Firmware Spec, r3.0, sec 4.1.2.
> Note 2 in that section says the address range of an MMCFG region
> must be reserved by declaring a motherboard resource, i.e., included
> in the _CRS of a PNP0C02 or similar device.

I had a look a this. So yes the specs says that we should use the 
PNP0C02 device if MCFG is not supported. 
So probably I can use acpi_get_devices("PNP0C02",...) to retrieve it
from the quirk match function, I will look into this... 

> 
> > On the other side, since this is an exception only for the config
> > space address of our host controller (as said before all the buses
> > below the root one support ECAM), I think that it is right to put
> > this address as a device specific data (in fact the rest of the
> > config space addresses will be parsed from MCFG).
> 
> A kernel with no support for your host controller (and thus no
> knowledge of its _DSD) should still be able to operate the rest of the
> system correctly.  That means we must have a generic way to learn what
> address space is consumed by the host controller so we don't try to
> assign it to other devices.

This is something I don't understand much...
Are you talking about a scenario where we have a Kernel image compiled
without our host controller support and running on our platform?

In this case the quirk is not called and the standard acpi pci root
driver will run and probably the config rd/wr for bus 0 will fail...

I don't see in this case how the physical address that we specify in the
_DSD can be assigned to other devices...

maybe I misunderstood something here...

Many Thanks

Gab 

> 
> Bjorn
Lorenzo Pieralisi Feb. 25, 2016, 12:07 p.m. UTC | #10
On Thu, Feb 25, 2016 at 03:01:19AM +0000, Gabriele Paoloni wrote:

[...]

> > I think the relevant spec is the PCI Firmware Spec, r3.0, sec 4.1.2.
> > Note 2 in that section says the address range of an MMCFG region
> > must be reserved by declaring a motherboard resource, i.e., included
> > in the _CRS of a PNP0C02 or similar device.
> 
> I had a look a this. So yes the specs says that we should use the 
> PNP0C02 device if MCFG is not supported. 

AFAIK, PNP0C02 is a resource reservation mechanism, the spec says that
MCFG regions must be reserved using PNP0C02, even though its
current usage on x86 is a bit unfathomable to me, in particular
in relation to MCFG resources retrieved for hotpluggable bridges (ie
through _CBA, which I think consider the MCFG region as reserved
by default, regardless of PNP0c02):

see pci_mmcfg_check_reserved() arch/x86/pci/mmconfig-shared.c

Have a look at drivers/pnp/system.c for PNP0c02

> So probably I can use acpi_get_devices("PNP0C02",...) to retrieve it
> from the quirk match function, I will look into this... 
> 
> > 
> > > On the other side, since this is an exception only for the config
> > > space address of our host controller (as said before all the buses
> > > below the root one support ECAM), I think that it is right to put
> > > this address as a device specific data (in fact the rest of the
> > > config space addresses will be parsed from MCFG).
> > 
> > A kernel with no support for your host controller (and thus no
> > knowledge of its _DSD) should still be able to operate the rest of the
> > system correctly.  That means we must have a generic way to learn what
> > address space is consumed by the host controller so we don't try to
> > assign it to other devices.
> 
> This is something I don't understand much...
> Are you talking about a scenario where we have a Kernel image compiled
> without our host controller support and running on our platform?

I *think* the point here is that your host controller config space should be
reserved through PNP0c02 so that the kernel will reserve it through the
generic PNP0c02 driver even if your host controller driver (and related
_DSD) is not supported in the kernel.

I do not understand how PNP0c02 works, currently, by the way.

If I read x86 code correctly, the unassigned PCI bus resources are
assigned in arch/x86/pci/i386.c (?) fs_initcall(pcibios_assign_resources),
with a comment:

/**
 * called in fs_initcall (one below subsys_initcall),
 * give a chance for motherboard reserve resources
 */

Problem is, motherboard resources are requested through (?):

drivers/pnp/system.c

which is also initialized at fs_initcall, so it might be called after
core x86 code reassign resources, defeating the purpose PNP0c02 was
designed for, namely, request motherboard regions before resources
are assigned, am I wrong ?

As per last Tomasz's patchset, we claim and assign unassigned PCI
resources upon ACPI PCI host bridge probing (which happens at
subsys_initcall time, courtesy of ACPI current code); at that time the
kernel did not even register the PNP0c02 driver (drivers/pnp/system.c)
(it does that at fs_initcall). On the other hand, we insert MCFG
regions into the resource tree upon MCFG parsing, so I do not
see why we need to rely on PNP0c02 to do that for us (granted, the
mechanism is part of the PCI fw specs, which are x86 centric anyway
ie we can't certainly rely on Int15 e820 to detect reserved memory
on ARM :D)

There is lots of legacy x86 here and Bjorn definitely has more
visibility into that than I have, the ARM world must understand
how this works to make sure we have an agreement.

Thanks,
Lorenzo
Bjorn Helgaas Feb. 25, 2016, 7:59 p.m. UTC | #11
On Thu, Feb 25, 2016 at 12:07:50PM +0000, Lorenzo Pieralisi wrote:
> On Thu, Feb 25, 2016 at 03:01:19AM +0000, Gabriele Paoloni wrote:
> 
> [...]
> 
> > > I think the relevant spec is the PCI Firmware Spec, r3.0, sec 4.1.2.
> > > Note 2 in that section says the address range of an MMCFG region
> > > must be reserved by declaring a motherboard resource, i.e., included
> > > in the _CRS of a PNP0C02 or similar device.
> > 
> > I had a look a this. So yes the specs says that we should use the 
> > PNP0C02 device if MCFG is not supported. 
> 
> AFAIK, PNP0C02 is a resource reservation mechanism, the spec says that
> MCFG regions must be reserved using PNP0C02, even though its
> current usage on x86 is a bit unfathomable to me, in particular
> in relation to MCFG resources retrieved for hotpluggable bridges (ie
> through _CBA, which I think consider the MCFG region as reserved
> by default, regardless of PNP0c02):
> 
> see pci_mmcfg_check_reserved() arch/x86/pci/mmconfig-shared.c

I don't know how _CBA-related resources would be reserved.  I haven't
personally worked with any host bridges that supply _CBA, so I don't
know whether or how they handled it.

I think the spec intent was that the Consumer/Producer bit (Extended
Address Space Descriptor, General Flags Bit[0], see ACPI spec sec
6.4.3.5.4) would be used.  Resources such as ECAM areas would be
marked "Consumer", meaning the bridge consumed that space itself, and
windows passed down to the PCI bus would be marked "Producer".

But BIOSes didn't use that bit consistently, so we couldn't rely on
it.  I verified experimentally that Windows didn't pay attention to
that bit either, at least for DWord descriptors:
https://bugzilla.kernel.org/show_bug.cgi?id=15701

It's conceivable that we could still use that bit in Extended Address
Space descriptors, or maybe some hack like pay attention if the bridge
has _CBA, or some such.  Or maybe a BIOS could add a PNP0C02 device
along with the PNP0A03 device that uses _CBA, with the PNP0C02 _CRS
describing the ECAM area referenced by _CBA.  Seeems hacky no matter
how we slice it.

> Have a look at drivers/pnp/system.c for PNP0c02
> 
> > So probably I can use acpi_get_devices("PNP0C02",...) to retrieve it
> > from the quirk match function, I will look into this... 
> > 
> > > 
> > > > On the other side, since this is an exception only for the config
> > > > space address of our host controller (as said before all the buses
> > > > below the root one support ECAM), I think that it is right to put
> > > > this address as a device specific data (in fact the rest of the
> > > > config space addresses will be parsed from MCFG).
> > > 
> > > A kernel with no support for your host controller (and thus no
> > > knowledge of its _DSD) should still be able to operate the rest of the
> > > system correctly.  That means we must have a generic way to learn what
> > > address space is consumed by the host controller so we don't try to
> > > assign it to other devices.
> > 
> > This is something I don't understand much...
> > Are you talking about a scenario where we have a Kernel image compiled
> > without our host controller support and running on our platform?
> 
> I *think* the point here is that your host controller config space should be
> reserved through PNP0c02 so that the kernel will reserve it through the
> generic PNP0c02 driver even if your host controller driver (and related
> _DSD) is not supported in the kernel.

Right.  Assume you have two top-level devices:

  PNP0A03  PCI host bridge
    _CRS     describes windows
    ????     describes ECAM space consumed
  PNPxxxx  another ACPI device, currently disabled
    _PRS     specifies possible resource settings, may specify no restrictions
    _SRS     assign resources and enable device
    _CRS     empty until device enabled

When the OS enables PNPxxxx, it must first assign resources to it
using _PRS and _SRS.  We evaluate _PRS to find out what the addresses
PNPxxxx can support.  This tells us things like how wide the address
decoder is, the size of the region required, and any alignment
restrictions -- basically the same information we get by sizing a PCI
BAR.

Now, how do we assign space for PNPxxxx?  In a few cases, _PRS has
only a few specific possibilities, e.g., an x86 legacy serial port
that can be at 0x3f8 or 0x2f8.  But in general, _PRS need not impose
any restrictions.

So in general the OS can use any space that can be routed to PNPxxxx.
If there's an upstream bridge, it may define windows that restrict the
possibilities.  But in this case, there *is* no upstream bridge, so
the possible choices are the entire physical address space of the
platform, except for other things that are already allocated: RAM, the
_CRS settings for other ACPI devices, things reserved by the E820
table (at least on x86), etc.

If PNP0A03 consumes address space for ECAM, that space must be
reported *somewhere* so the OS knows not to place PNPxxxx there.  This
reporting must be generic (not device-specific like _DSD).  The ACPI
core (not drivers) is responsible for managing this address space
because:

  a) the OS is not guaranteed to have drivers for all devices, and
  
  b) even it *did* have drivers for all devices, the PNPxxxx space may
  be assigned before drivers are initialized.

> I do not understand how PNP0c02 works, currently, by the way.
> 
> If I read x86 code correctly, the unassigned PCI bus resources are
> assigned in arch/x86/pci/i386.c (?) fs_initcall(pcibios_assign_resources),
> with a comment:
> 
> /**
>  * called in fs_initcall (one below subsys_initcall),
>  * give a chance for motherboard reserve resources
>  */
> 
> Problem is, motherboard resources are requested through (?):
> 
> drivers/pnp/system.c
> 
> which is also initialized at fs_initcall, so it might be called after
> core x86 code reassign resources, defeating the purpose PNP0c02 was
> designed for, namely, request motherboard regions before resources
> are assigned, am I wrong ?

I think you're right.  This is a long-standing screwup in Linux.
IMHO, ACPI resources should be parsed and reserved by the ACPI core,
before any PCI resource management (since PCI host bridges are
represented in ACPI).  But historically PCI devices have enumerated
before ACPI got involved.  And the ACPI core doesn't really pay
attention to _CRS for most devices (with the exception of PNP0C02).

IMO the PNP0C02 code in drivers/pnp/system.c should really be done in
the ACPI core for all ACPI devices, similar to the way the PCI core
reserves BAR space for all PCI devices, even if we don't have drivers
for them.  I've tried to fix this in the past, but it is really a
nightmare to unravel everything.

Because the ACPI core doesn't reserve resources for the _CRS of all
ACPI devices, we're already vulnerable to the problem of placing a
device on top of another ACPI device.  We don't see problems because
on x86, at least, most ACPI devices are already configured by the BIOS
to be enabled and non-overlapping.  But x86 has the advantage of
having extensive test coverage courtesy of Windows, and as long as
_CRS has the right stuff in it, we at least have the potential of
fixing problems in Linux.

If the platform doesn't report resource usage correctly on ARM, we may
not find problems (because we don't have the Windows test suite) and
if we have resource assignment problems because _CRS is lacking, we'll
have no way to fix them.

> As per last Tomasz's patchset, we claim and assign unassigned PCI
> resources upon ACPI PCI host bridge probing (which happens at
> subsys_initcall time, courtesy of ACPI current code); at that time the
> kernel did not even register the PNP0c02 driver (drivers/pnp/system.c)
> (it does that at fs_initcall). On the other hand, we insert MCFG
> regions into the resource tree upon MCFG parsing, so I do not
> see why we need to rely on PNP0c02 to do that for us (granted, the
> mechanism is part of the PCI fw specs, which are x86 centric anyway
> ie we can't certainly rely on Int15 e820 to detect reserved memory
> on ARM :D)
> 
> There is lots of legacy x86 here and Bjorn definitely has more
> visibility into that than I have, the ARM world must understand
> how this works to make sure we have an agreement.

As you say, there is lots of unpleasant x86 legacy here.  Possibly ARM
has a chance to clean this up and do it more sanely; I'm not sure
whether it's feasible to reverse the ACPI/PCI init order there or not.

Rafael, any thoughts on this whole thing?

Bjorn
Rafael J. Wysocki Feb. 25, 2016, 9:24 p.m. UTC | #12
On Thursday, February 25, 2016 01:59:12 PM Bjorn Helgaas wrote:
> On Thu, Feb 25, 2016 at 12:07:50PM +0000, Lorenzo Pieralisi wrote:
> > On Thu, Feb 25, 2016 at 03:01:19AM +0000, Gabriele Paoloni wrote:
> > 
> > [...]
> > 
> > > > I think the relevant spec is the PCI Firmware Spec, r3.0, sec 4.1.2.
> > > > Note 2 in that section says the address range of an MMCFG region
> > > > must be reserved by declaring a motherboard resource, i.e., included
> > > > in the _CRS of a PNP0C02 or similar device.
> > > 
> > > I had a look a this. So yes the specs says that we should use the 
> > > PNP0C02 device if MCFG is not supported. 
> > 
> > AFAIK, PNP0C02 is a resource reservation mechanism, the spec says that
> > MCFG regions must be reserved using PNP0C02, even though its
> > current usage on x86 is a bit unfathomable to me, in particular
> > in relation to MCFG resources retrieved for hotpluggable bridges (ie
> > through _CBA, which I think consider the MCFG region as reserved
> > by default, regardless of PNP0c02):
> > 
> > see pci_mmcfg_check_reserved() arch/x86/pci/mmconfig-shared.c
> 
> I don't know how _CBA-related resources would be reserved.  I haven't
> personally worked with any host bridges that supply _CBA, so I don't
> know whether or how they handled it.
> 
> I think the spec intent was that the Consumer/Producer bit (Extended
> Address Space Descriptor, General Flags Bit[0], see ACPI spec sec
> 6.4.3.5.4) would be used.  Resources such as ECAM areas would be
> marked "Consumer", meaning the bridge consumed that space itself, and
> windows passed down to the PCI bus would be marked "Producer".
> 
> But BIOSes didn't use that bit consistently, so we couldn't rely on
> it.  I verified experimentally that Windows didn't pay attention to
> that bit either, at least for DWord descriptors:
> https://bugzilla.kernel.org/show_bug.cgi?id=15701
> 
> It's conceivable that we could still use that bit in Extended Address
> Space descriptors, or maybe some hack like pay attention if the bridge
> has _CBA, or some such.

Last time I asked the firmware vendor people in the ASWG about that,
the answer was that the Consumer/Producer bit was useless as they had
never paid any attention to it in general.

It may be good to update the spec to say something along these lines, actually.

Thanks,
Rafael
Gabriele Paoloni Feb. 27, 2016, 9 a.m. UTC | #13
Hi Bjorn, Lorenzo

Many Thanks for your replies and suggestions

> -----Original Message-----
> From: linux-pci-owner@vger.kernel.org [mailto:linux-pci-
> owner@vger.kernel.org] On Behalf Of Bjorn Helgaas
> Sent: 25 February 2016 19:59
> To: Lorenzo Pieralisi
> Cc: Gabriele Paoloni; 'Mark Rutland'; Guohanjun (Hanjun Guo); Wangzhou
> (B); liudongdong (C); Linuxarm; qiujiang; 'bhelgaas@google.com';
> 'arnd@arndb.de'; 'tn@semihalf.com'; 'linux-pci@vger.kernel.org';
> 'linux-kernel@vger.kernel.org'; xuwei (O); 'linux-
> acpi@vger.kernel.org'; 'jcm@redhat.com'; zhangjukuo; Liguozhu
> (Kenneth); 'linux-arm-kernel@lists.infradead.org'
> Subject: Re: [RFC PATCH v3 3/3] PCI/ACPI: hisi: Add ACPI support for
> HiSilicon SoCs Host Controllers
> 
> On Thu, Feb 25, 2016 at 12:07:50PM +0000, Lorenzo Pieralisi wrote:
> > On Thu, Feb 25, 2016 at 03:01:19AM +0000, Gabriele Paoloni wrote:
> >
> > [...]
> >
> > > > I think the relevant spec is the PCI Firmware Spec, r3.0, sec
> 4.1.2.
> > > > Note 2 in that section says the address range of an MMCFG region
> > > > must be reserved by declaring a motherboard resource, i.e.,
> included
> > > > in the _CRS of a PNP0C02 or similar device.
> > >
> > > I had a look a this. So yes the specs says that we should use the
> > > PNP0C02 device if MCFG is not supported.
> >
> > AFAIK, PNP0C02 is a resource reservation mechanism, the spec says
> that
> > MCFG regions must be reserved using PNP0C02, even though its
> > current usage on x86 is a bit unfathomable to me, in particular
> > in relation to MCFG resources retrieved for hotpluggable bridges (ie
> > through _CBA, which I think consider the MCFG region as reserved
> > by default, regardless of PNP0c02):
> >
> > see pci_mmcfg_check_reserved() arch/x86/pci/mmconfig-shared.c

Yes I checked this and it seems to check if an area of memory from
MCFG is overlapping with any area of memory specified by PNP0C02 
_CRS...

However (maybe I am wrong) it looks to me that this part works
independently of the PNP0c02 driver. It seems that goes directly
to walk the ACPI namespace and look for PNP0C02 HID; as it finds it,
it checks the range of memory specified in the _CRS method and see
if it overlaps with the MCFG resource...am I missing something? 

If my interpretation is correct, couldn't we just modify 
pci_mmconfig_map_resource() in the latest Nowicki patchset and add
a similar check before insert_resource_conflict() is called?

On the other side HiSilicon host bridge quirks could use the address
retrieved by the _CRS method of PNP0C02 for our root complex config
rd/wr...?
 
> 
> I don't know how _CBA-related resources would be reserved.  I haven't
> personally worked with any host bridges that supply _CBA, so I don't
> know whether or how they handled it.
> 
> I think the spec intent was that the Consumer/Producer bit (Extended
> Address Space Descriptor, General Flags Bit[0], see ACPI spec sec
> 6.4.3.5.4) would be used.  Resources such as ECAM areas would be
> marked "Consumer", meaning the bridge consumed that space itself, and
> windows passed down to the PCI bus would be marked "Producer".
> 
> But BIOSes didn't use that bit consistently, so we couldn't rely on
> it.  I verified experimentally that Windows didn't pay attention to
> that bit either, at least for DWord descriptors:
> https://bugzilla.kernel.org/show_bug.cgi?id=15701
> 
> It's conceivable that we could still use that bit in Extended Address
> Space descriptors, or maybe some hack like pay attention if the bridge
> has _CBA, or some such.  Or maybe a BIOS could add a PNP0C02 device
> along with the PNP0A03 device that uses _CBA, with the PNP0C02 _CRS
> describing the ECAM area referenced by _CBA.  Seeems hacky no matter
> how we slice it.

Well about this I don't know much but, having looked at the bugzilla
and considering the current mechanism used by pci_mmcfg_check_reserved()
I have the feeling that this last one is easier to implement and it seems
the one currently used (in mmconfig-shared.c )

Cheers

Gab

> 
> > Have a look at drivers/pnp/system.c for PNP0c02
> >
> > > So probably I can use acpi_get_devices("PNP0C02",...) to retrieve
> it
> > > from the quirk match function, I will look into this...
> > >
> > > >
> > > > > On the other side, since this is an exception only for the
> config
> > > > > space address of our host controller (as said before all the
> buses
> > > > > below the root one support ECAM), I think that it is right to
> put
> > > > > this address as a device specific data (in fact the rest of the
> > > > > config space addresses will be parsed from MCFG).
> > > >
> > > > A kernel with no support for your host controller (and thus no
> > > > knowledge of its _DSD) should still be able to operate the rest
> of the
> > > > system correctly.  That means we must have a generic way to learn
> what
> > > > address space is consumed by the host controller so we don't try
> to
> > > > assign it to other devices.
> > >
> > > This is something I don't understand much...
> > > Are you talking about a scenario where we have a Kernel image
> compiled
> > > without our host controller support and running on our platform?
> >
> > I *think* the point here is that your host controller config space
> should be
> > reserved through PNP0c02 so that the kernel will reserve it through
> the
> > generic PNP0c02 driver even if your host controller driver (and
> related
> > _DSD) is not supported in the kernel.
> 
> Right.  Assume you have two top-level devices:
> 
>   PNP0A03  PCI host bridge
>     _CRS     describes windows
>     ????     describes ECAM space consumed
>   PNPxxxx  another ACPI device, currently disabled
>     _PRS     specifies possible resource settings, may specify no
> restrictions
>     _SRS     assign resources and enable device
>     _CRS     empty until device enabled
> 
> When the OS enables PNPxxxx, it must first assign resources to it
> using _PRS and _SRS.  We evaluate _PRS to find out what the addresses
> PNPxxxx can support.  This tells us things like how wide the address
> decoder is, the size of the region required, and any alignment
> restrictions -- basically the same information we get by sizing a PCI
> BAR.
> 
> Now, how do we assign space for PNPxxxx?  In a few cases, _PRS has
> only a few specific possibilities, e.g., an x86 legacy serial port
> that can be at 0x3f8 or 0x2f8.  But in general, _PRS need not impose
> any restrictions.
> 
> So in general the OS can use any space that can be routed to PNPxxxx.
> If there's an upstream bridge, it may define windows that restrict the
> possibilities.  But in this case, there *is* no upstream bridge, so
> the possible choices are the entire physical address space of the
> platform, except for other things that are already allocated: RAM, the
> _CRS settings for other ACPI devices, things reserved by the E820
> table (at least on x86), etc.
> 
> If PNP0A03 consumes address space for ECAM, that space must be
> reported *somewhere* so the OS knows not to place PNPxxxx there.  This
> reporting must be generic (not device-specific like _DSD).  The ACPI
> core (not drivers) is responsible for managing this address space
> because:
> 
>   a) the OS is not guaranteed to have drivers for all devices, and
> 
>   b) even it *did* have drivers for all devices, the PNPxxxx space may
>   be assigned before drivers are initialized.
> 
> > I do not understand how PNP0c02 works, currently, by the way.
> >
> > If I read x86 code correctly, the unassigned PCI bus resources are
> > assigned in arch/x86/pci/i386.c (?)
> fs_initcall(pcibios_assign_resources),
> > with a comment:
> >
> > /**
> >  * called in fs_initcall (one below subsys_initcall),
> >  * give a chance for motherboard reserve resources
> >  */
> >
> > Problem is, motherboard resources are requested through (?):
> >
> > drivers/pnp/system.c
> >
> > which is also initialized at fs_initcall, so it might be called after
> > core x86 code reassign resources, defeating the purpose PNP0c02 was
> > designed for, namely, request motherboard regions before resources
> > are assigned, am I wrong ?
> 
> I think you're right.  This is a long-standing screwup in Linux.
> IMHO, ACPI resources should be parsed and reserved by the ACPI core,
> before any PCI resource management (since PCI host bridges are
> represented in ACPI).  But historically PCI devices have enumerated
> before ACPI got involved.  And the ACPI core doesn't really pay
> attention to _CRS for most devices (with the exception of PNP0C02).
> 
> IMO the PNP0C02 code in drivers/pnp/system.c should really be done in
> the ACPI core for all ACPI devices, similar to the way the PCI core
> reserves BAR space for all PCI devices, even if we don't have drivers
> for them.  I've tried to fix this in the past, but it is really a
> nightmare to unravel everything.
> 
> Because the ACPI core doesn't reserve resources for the _CRS of all
> ACPI devices, we're already vulnerable to the problem of placing a
> device on top of another ACPI device.  We don't see problems because
> on x86, at least, most ACPI devices are already configured by the BIOS
> to be enabled and non-overlapping.  But x86 has the advantage of
> having extensive test coverage courtesy of Windows, and as long as
> _CRS has the right stuff in it, we at least have the potential of
> fixing problems in Linux.
> 
> If the platform doesn't report resource usage correctly on ARM, we may
> not find problems (because we don't have the Windows test suite) and
> if we have resource assignment problems because _CRS is lacking, we'll
> have no way to fix them.
> 
> > As per last Tomasz's patchset, we claim and assign unassigned PCI
> > resources upon ACPI PCI host bridge probing (which happens at
> > subsys_initcall time, courtesy of ACPI current code); at that time
> the
> > kernel did not even register the PNP0c02 driver
> (drivers/pnp/system.c)
> > (it does that at fs_initcall). On the other hand, we insert MCFG
> > regions into the resource tree upon MCFG parsing, so I do not
> > see why we need to rely on PNP0c02 to do that for us (granted, the
> > mechanism is part of the PCI fw specs, which are x86 centric anyway
> > ie we can't certainly rely on Int15 e820 to detect reserved memory
> > on ARM :D)
> >
> > There is lots of legacy x86 here and Bjorn definitely has more
> > visibility into that than I have, the ARM world must understand
> > how this works to make sure we have an agreement.
> 
> As you say, there is lots of unpleasant x86 legacy here.  Possibly ARM
> has a chance to clean this up and do it more sanely; I'm not sure
> whether it's feasible to reverse the ACPI/PCI init order there or not.
> 
> Rafael, any thoughts on this whole thing?
> 
> Bjorn
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lorenzo Pieralisi Feb. 29, 2016, 11:38 a.m. UTC | #14
> > > > > I think the relevant spec is the PCI Firmware Spec, r3.0, sec
> > 4.1.2.
> > > > > Note 2 in that section says the address range of an MMCFG region
> > > > > must be reserved by declaring a motherboard resource, i.e.,
> > included
> > > > > in the _CRS of a PNP0C02 or similar device.
> > > >
> > > > I had a look a this. So yes the specs says that we should use the
> > > > PNP0C02 device if MCFG is not supported.
> > >
> > > AFAIK, PNP0C02 is a resource reservation mechanism, the spec says
> > that
> > > MCFG regions must be reserved using PNP0C02, even though its
> > > current usage on x86 is a bit unfathomable to me, in particular
> > > in relation to MCFG resources retrieved for hotpluggable bridges (ie
> > > through _CBA, which I think consider the MCFG region as reserved
> > > by default, regardless of PNP0c02):
> > >
> > > see pci_mmcfg_check_reserved() arch/x86/pci/mmconfig-shared.c
> 
> Yes I checked this and it seems to check if an area of memory from
> MCFG is overlapping with any area of memory specified by PNP0C02 
> _CRS...
> 
> However (maybe I am wrong) it looks to me that this part works
> independently of the PNP0c02 driver. It seems that goes directly
> to walk the ACPI namespace and look for PNP0C02 HID; as it finds it,
> it checks the range of memory specified in the _CRS method and see
> if it overlaps with the MCFG resource...am I missing something? 

Well, if I understand the code correctly, the x86 MCFG code, for static
MCFG tables, check that the MCFG regions are part of motherboard
resources (by walking the ACPI namespace as you said). If that's the case,
it does not insert resources into the resource tree till a late_initcall,
(pci_mmcfg_late_insert_resources()) that should run after the PNP0C02 driver
was initialized (?) (I guess, that's a nightmare to understand these initcall
ordering unwritten rules dependencies, if they exist). If the MCFG region is
not part of motherboard resources it is discarded (ie pci_mmconfig_insert()
in arch/x86/pci/mmconfig-shared.c).

I really do not know why x86 code has been implemented like that
and whatever I say it is just speculation, honestly it is not easy
to understand.

> If my interpretation is correct, couldn't we just modify 
> pci_mmconfig_map_resource() in the latest Nowicki patchset and add
> a similar check before insert_resource_conflict() is called?

Yes, but I am not sure that would help much. We can prevent adding
MCFG resources to the resource tree if they are not declared as motherboard
resources, but if they do it is totally unclear to me what we should do.

Should we insert the MCFG region resources into the resource tree before
PNP0C02 driver has a chance to be probed ? Or do what x86 does (ie it does
not insert resources into the resource tree till late_initcall
pci_mmcfg_late_insert_resources()) ? I do not know. If the PNP0C02
driver stays as it is, whatever we do with PNP0C02 regions does not make
much sense to me, that's the gist of what I am saying, I don't know why
x86 code was implemented like that, I will go through the commits history
to find some light here.

Lorenzo

> 
> On the other side HiSilicon host bridge quirks could use the address
> retrieved by the _CRS method of PNP0C02 for our root complex config
> rd/wr...?
>  
> > 
> > I don't know how _CBA-related resources would be reserved.  I haven't
> > personally worked with any host bridges that supply _CBA, so I don't
> > know whether or how they handled it.
> > 
> > I think the spec intent was that the Consumer/Producer bit (Extended
> > Address Space Descriptor, General Flags Bit[0], see ACPI spec sec
> > 6.4.3.5.4) would be used.  Resources such as ECAM areas would be
> > marked "Consumer", meaning the bridge consumed that space itself, and
> > windows passed down to the PCI bus would be marked "Producer".
> > 
> > But BIOSes didn't use that bit consistently, so we couldn't rely on
> > it.  I verified experimentally that Windows didn't pay attention to
> > that bit either, at least for DWord descriptors:
> > https://bugzilla.kernel.org/show_bug.cgi?id=15701
> > 
> > It's conceivable that we could still use that bit in Extended Address
> > Space descriptors, or maybe some hack like pay attention if the bridge
> > has _CBA, or some such.  Or maybe a BIOS could add a PNP0C02 device
> > along with the PNP0A03 device that uses _CBA, with the PNP0C02 _CRS
> > describing the ECAM area referenced by _CBA.  Seeems hacky no matter
> > how we slice it.
> 
> Well about this I don't know much but, having looked at the bugzilla
> and considering the current mechanism used by pci_mmcfg_check_reserved()
> I have the feeling that this last one is easier to implement and it seems
> the one currently used (in mmconfig-shared.c )
> 
> Cheers
> 
> Gab
> 
> > 
> > > Have a look at drivers/pnp/system.c for PNP0c02
> > >
> > > > So probably I can use acpi_get_devices("PNP0C02",...) to retrieve
> > it
> > > > from the quirk match function, I will look into this...
> > > >
> > > > >
> > > > > > On the other side, since this is an exception only for the
> > config
> > > > > > space address of our host controller (as said before all the
> > buses
> > > > > > below the root one support ECAM), I think that it is right to
> > put
> > > > > > this address as a device specific data (in fact the rest of the
> > > > > > config space addresses will be parsed from MCFG).
> > > > >
> > > > > A kernel with no support for your host controller (and thus no
> > > > > knowledge of its _DSD) should still be able to operate the rest
> > of the
> > > > > system correctly.  That means we must have a generic way to learn
> > what
> > > > > address space is consumed by the host controller so we don't try
> > to
> > > > > assign it to other devices.
> > > >
> > > > This is something I don't understand much...
> > > > Are you talking about a scenario where we have a Kernel image
> > compiled
> > > > without our host controller support and running on our platform?
> > >
> > > I *think* the point here is that your host controller config space
> > should be
> > > reserved through PNP0c02 so that the kernel will reserve it through
> > the
> > > generic PNP0c02 driver even if your host controller driver (and
> > related
> > > _DSD) is not supported in the kernel.
> > 
> > Right.  Assume you have two top-level devices:
> > 
> >   PNP0A03  PCI host bridge
> >     _CRS     describes windows
> >     ????     describes ECAM space consumed
> >   PNPxxxx  another ACPI device, currently disabled
> >     _PRS     specifies possible resource settings, may specify no
> > restrictions
> >     _SRS     assign resources and enable device
> >     _CRS     empty until device enabled
> > 
> > When the OS enables PNPxxxx, it must first assign resources to it
> > using _PRS and _SRS.  We evaluate _PRS to find out what the addresses
> > PNPxxxx can support.  This tells us things like how wide the address
> > decoder is, the size of the region required, and any alignment
> > restrictions -- basically the same information we get by sizing a PCI
> > BAR.
> > 
> > Now, how do we assign space for PNPxxxx?  In a few cases, _PRS has
> > only a few specific possibilities, e.g., an x86 legacy serial port
> > that can be at 0x3f8 or 0x2f8.  But in general, _PRS need not impose
> > any restrictions.
> > 
> > So in general the OS can use any space that can be routed to PNPxxxx.
> > If there's an upstream bridge, it may define windows that restrict the
> > possibilities.  But in this case, there *is* no upstream bridge, so
> > the possible choices are the entire physical address space of the
> > platform, except for other things that are already allocated: RAM, the
> > _CRS settings for other ACPI devices, things reserved by the E820
> > table (at least on x86), etc.
> > 
> > If PNP0A03 consumes address space for ECAM, that space must be
> > reported *somewhere* so the OS knows not to place PNPxxxx there.  This
> > reporting must be generic (not device-specific like _DSD).  The ACPI
> > core (not drivers) is responsible for managing this address space
> > because:
> > 
> >   a) the OS is not guaranteed to have drivers for all devices, and
> > 
> >   b) even it *did* have drivers for all devices, the PNPxxxx space may
> >   be assigned before drivers are initialized.
> > 
> > > I do not understand how PNP0c02 works, currently, by the way.
> > >
> > > If I read x86 code correctly, the unassigned PCI bus resources are
> > > assigned in arch/x86/pci/i386.c (?)
> > fs_initcall(pcibios_assign_resources),
> > > with a comment:
> > >
> > > /**
> > >  * called in fs_initcall (one below subsys_initcall),
> > >  * give a chance for motherboard reserve resources
> > >  */
> > >
> > > Problem is, motherboard resources are requested through (?):
> > >
> > > drivers/pnp/system.c
> > >
> > > which is also initialized at fs_initcall, so it might be called after
> > > core x86 code reassign resources, defeating the purpose PNP0c02 was
> > > designed for, namely, request motherboard regions before resources
> > > are assigned, am I wrong ?
> > 
> > I think you're right.  This is a long-standing screwup in Linux.
> > IMHO, ACPI resources should be parsed and reserved by the ACPI core,
> > before any PCI resource management (since PCI host bridges are
> > represented in ACPI).  But historically PCI devices have enumerated
> > before ACPI got involved.  And the ACPI core doesn't really pay
> > attention to _CRS for most devices (with the exception of PNP0C02).
> > 
> > IMO the PNP0C02 code in drivers/pnp/system.c should really be done in
> > the ACPI core for all ACPI devices, similar to the way the PCI core
> > reserves BAR space for all PCI devices, even if we don't have drivers
> > for them.  I've tried to fix this in the past, but it is really a
> > nightmare to unravel everything.
> > 
> > Because the ACPI core doesn't reserve resources for the _CRS of all
> > ACPI devices, we're already vulnerable to the problem of placing a
> > device on top of another ACPI device.  We don't see problems because
> > on x86, at least, most ACPI devices are already configured by the BIOS
> > to be enabled and non-overlapping.  But x86 has the advantage of
> > having extensive test coverage courtesy of Windows, and as long as
> > _CRS has the right stuff in it, we at least have the potential of
> > fixing problems in Linux.
> > 
> > If the platform doesn't report resource usage correctly on ARM, we may
> > not find problems (because we don't have the Windows test suite) and
> > if we have resource assignment problems because _CRS is lacking, we'll
> > have no way to fix them.
> > 
> > > As per last Tomasz's patchset, we claim and assign unassigned PCI
> > > resources upon ACPI PCI host bridge probing (which happens at
> > > subsys_initcall time, courtesy of ACPI current code); at that time
> > the
> > > kernel did not even register the PNP0c02 driver
> > (drivers/pnp/system.c)
> > > (it does that at fs_initcall). On the other hand, we insert MCFG
> > > regions into the resource tree upon MCFG parsing, so I do not
> > > see why we need to rely on PNP0c02 to do that for us (granted, the
> > > mechanism is part of the PCI fw specs, which are x86 centric anyway
> > > ie we can't certainly rely on Int15 e820 to detect reserved memory
> > > on ARM :D)
> > >
> > > There is lots of legacy x86 here and Bjorn definitely has more
> > > visibility into that than I have, the ARM world must understand
> > > how this works to make sure we have an agreement.
> > 
> > As you say, there is lots of unpleasant x86 legacy here.  Possibly ARM
> > has a chance to clean this up and do it more sanely; I'm not sure
> > whether it's feasible to reverse the ACPI/PCI init order there or not.
> > 
> > Rafael, any thoughts on this whole thing?
> > 
> > Bjorn
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Lorenzo Pieralisi March 1, 2016, 7:22 p.m. UTC | #15
Hi Bjorn,

On Thu, Feb 25, 2016 at 01:59:12PM -0600, Bjorn Helgaas wrote:
> On Thu, Feb 25, 2016 at 12:07:50PM +0000, Lorenzo Pieralisi wrote:
> > On Thu, Feb 25, 2016 at 03:01:19AM +0000, Gabriele Paoloni wrote:

[...]

> > I do not understand how PNP0c02 works, currently, by the way.
> > 
> > If I read x86 code correctly, the unassigned PCI bus resources are
> > assigned in arch/x86/pci/i386.c (?) fs_initcall(pcibios_assign_resources),
> > with a comment:
> > 
> > /**
> >  * called in fs_initcall (one below subsys_initcall),
> >  * give a chance for motherboard reserve resources
> >  */
> > 
> > Problem is, motherboard resources are requested through (?):
> > 
> > drivers/pnp/system.c
> > 
> > which is also initialized at fs_initcall, so it might be called after
> > core x86 code reassign resources, defeating the purpose PNP0c02 was
> > designed for, namely, request motherboard regions before resources
> > are assigned, am I wrong ?
> 
> I think you're right.  This is a long-standing screwup in Linux.
> IMHO, ACPI resources should be parsed and reserved by the ACPI core,
> before any PCI resource management (since PCI host bridges are
> represented in ACPI).  But historically PCI devices have enumerated
> before ACPI got involved.  And the ACPI core doesn't really pay
> attention to _CRS for most devices (with the exception of PNP0C02).
> 
> IMO the PNP0C02 code in drivers/pnp/system.c should really be done in
> the ACPI core for all ACPI devices, similar to the way the PCI core
> reserves BAR space for all PCI devices, even if we don't have drivers
> for them.  I've tried to fix this in the past, but it is really a
> nightmare to unravel everything.
> 
> Because the ACPI core doesn't reserve resources for the _CRS of all
> ACPI devices, we're already vulnerable to the problem of placing a
> device on top of another ACPI device.  We don't see problems because
> on x86, at least, most ACPI devices are already configured by the BIOS
> to be enabled and non-overlapping.  But x86 has the advantage of
> having extensive test coverage courtesy of Windows, and as long as
> _CRS has the right stuff in it, we at least have the potential of
> fixing problems in Linux.

Thank you for the explanation, that's very useful.

I think it is quite important for all ARM developers to understand this
discussion, so I have two questions.

By "fixing problems in Linux" above, you mean that, given that we
do have a validated _CRS space, we can request/reserve the region the _CRS
reports to prevent assigning those resources to other devices, correct ?

> If the platform doesn't report resource usage correctly on ARM, we may
> not find problems (because we don't have the Windows test suite) and
> if we have resource assignment problems because _CRS is lacking, we'll
> have no way to fix them.

And I think here you mean we can't prevent assigning resource space to
devices that do not necessarily own it because since some devices _CRS
are borked/missing we have no way to detect the address space allocated
to them and we may end up with resources conflicts.

Thank you in advance for the explanation, I find this discussion
extremely helpful.

Lorenzo

> > As per last Tomasz's patchset, we claim and assign unassigned PCI
> > resources upon ACPI PCI host bridge probing (which happens at
> > subsys_initcall time, courtesy of ACPI current code); at that time the
> > kernel did not even register the PNP0c02 driver (drivers/pnp/system.c)
> > (it does that at fs_initcall). On the other hand, we insert MCFG
> > regions into the resource tree upon MCFG parsing, so I do not
> > see why we need to rely on PNP0c02 to do that for us (granted, the
> > mechanism is part of the PCI fw specs, which are x86 centric anyway
> > ie we can't certainly rely on Int15 e820 to detect reserved memory
> > on ARM :D)
> > 
> > There is lots of legacy x86 here and Bjorn definitely has more
> > visibility into that than I have, the ARM world must understand
> > how this works to make sure we have an agreement.
> 
> As you say, there is lots of unpleasant x86 legacy here.  Possibly ARM
> has a chance to clean this up and do it more sanely; I'm not sure
> whether it's feasible to reverse the ACPI/PCI init order there or not.
> 
> Rafael, any thoughts on this whole thing?
> 
> Bjorn
>
Bjorn Helgaas March 2, 2016, 2:32 p.m. UTC | #16
On Thu, Feb 25, 2016 at 10:24:34PM +0100, Rafael J. Wysocki wrote:
> On Thursday, February 25, 2016 01:59:12 PM Bjorn Helgaas wrote:
> > On Thu, Feb 25, 2016 at 12:07:50PM +0000, Lorenzo Pieralisi wrote:
> > > On Thu, Feb 25, 2016 at 03:01:19AM +0000, Gabriele Paoloni wrote:
> > > 
> > > [...]
> > > 
> > > > > I think the relevant spec is the PCI Firmware Spec, r3.0, sec 4.1.2.
> > > > > Note 2 in that section says the address range of an MMCFG region
> > > > > must be reserved by declaring a motherboard resource, i.e., included
> > > > > in the _CRS of a PNP0C02 or similar device.
> > > > 
> > > > I had a look a this. So yes the specs says that we should use the 
> > > > PNP0C02 device if MCFG is not supported. 
> > > 
> > > AFAIK, PNP0C02 is a resource reservation mechanism, the spec says that
> > > MCFG regions must be reserved using PNP0C02, even though its
> > > current usage on x86 is a bit unfathomable to me, in particular
> > > in relation to MCFG resources retrieved for hotpluggable bridges (ie
> > > through _CBA, which I think consider the MCFG region as reserved
> > > by default, regardless of PNP0c02):
> > > 
> > > see pci_mmcfg_check_reserved() arch/x86/pci/mmconfig-shared.c
> > 
> > I don't know how _CBA-related resources would be reserved.  I haven't
> > personally worked with any host bridges that supply _CBA, so I don't
> > know whether or how they handled it.
> > 
> > I think the spec intent was that the Consumer/Producer bit (Extended
> > Address Space Descriptor, General Flags Bit[0], see ACPI spec sec
> > 6.4.3.5.4) would be used.  Resources such as ECAM areas would be
> > marked "Consumer", meaning the bridge consumed that space itself, and
> > windows passed down to the PCI bus would be marked "Producer".
> > 
> > But BIOSes didn't use that bit consistently, so we couldn't rely on
> > it.  I verified experimentally that Windows didn't pay attention to
> > that bit either, at least for DWord descriptors:
> > https://bugzilla.kernel.org/show_bug.cgi?id=15701
> > 
> > It's conceivable that we could still use that bit in Extended Address
> > Space descriptors, or maybe some hack like pay attention if the bridge
> > has _CBA, or some such.
> 
> Last time I asked the firmware vendor people in the ASWG about that,
> the answer was that the Consumer/Producer bit was useless as they had
> never paid any attention to it in general.
> 
> It may be good to update the spec to say something along these lines, actually.

Yes, we've tried updating the spec, but the update was kind of a
disaster.

ACPI 2.0b documented the Consumer/Producer bit in the General Flags
byte of QWORD, DWORD, and WORD Address Space Descriptors (6.4.3.5).

ACPI 2.0c removed that General Flags description, but left
ResourceConsumer in various examples and in the ASL grammar (16.1.3).

ACPI 3.0 added back the Consumer/Producer bit in General Flags (I
think this was an editing mistake) and added the Extended Address
Space Descriptor, also with a Consumer/Producer bit.

ACPI 5.0 removed the Consumer/Producer bit from QWORD, DWORD, and WORD
Address Space Descriptors, but kept it for Extended Address Space
Descriptors (the revision history says this change actually happened
in 4.0a, which I don't have a copy of: "4.0a Apr. 2010:
Consumer/Producer bit is ignored (Restored 2.0C change that had been
lost)").

ACPI 6.0 contains the following incorrect or misleading things:

  - 6.4.3.5.4 Extended Address Space Descriptor documents the
    Consumer/Producer bit in General Flags.

  - 19.2.8 ASL Resource Template Terms defines a ResourceUsage term
    for DWordIO, DWordMemory, DWordSpace, etc., even though I think
    there's no way to encode in in the AML.

  - 19.6.33 DWordIO and similar sections mention ResourceUsage, even
    though I think there's no way to encode it in the AML.

  - Various examples (5.2.21.10, 6.2.4, 9.12.4) show QWordMemory
    using ResourceConsumer.

It's sort of a mess.

Bjorn
Bjorn Helgaas March 2, 2016, 3:51 p.m. UTC | #17
On Tue, Mar 01, 2016 at 07:22:47PM +0000, Lorenzo Pieralisi wrote:
> Hi Bjorn,
> 
> On Thu, Feb 25, 2016 at 01:59:12PM -0600, Bjorn Helgaas wrote:
> > On Thu, Feb 25, 2016 at 12:07:50PM +0000, Lorenzo Pieralisi wrote:
> > > On Thu, Feb 25, 2016 at 03:01:19AM +0000, Gabriele Paoloni wrote:
> 
> [...]
> 
> > > I do not understand how PNP0c02 works, currently, by the way.
> > > 
> > > If I read x86 code correctly, the unassigned PCI bus resources are
> > > assigned in arch/x86/pci/i386.c (?) fs_initcall(pcibios_assign_resources),
> > > with a comment:
> > > 
> > > /**
> > >  * called in fs_initcall (one below subsys_initcall),
> > >  * give a chance for motherboard reserve resources
> > >  */
> > > 
> > > Problem is, motherboard resources are requested through (?):
> > > 
> > > drivers/pnp/system.c
> > > 
> > > which is also initialized at fs_initcall, so it might be called after
> > > core x86 code reassign resources, defeating the purpose PNP0c02 was
> > > designed for, namely, request motherboard regions before resources
> > > are assigned, am I wrong ?
> > 
> > I think you're right.  This is a long-standing screwup in Linux.
> > IMHO, ACPI resources should be parsed and reserved by the ACPI core,
> > before any PCI resource management (since PCI host bridges are
> > represented in ACPI).  But historically PCI devices have enumerated
> > before ACPI got involved.  And the ACPI core doesn't really pay
> > attention to _CRS for most devices (with the exception of PNP0C02).
> > 
> > IMO the PNP0C02 code in drivers/pnp/system.c should really be done in
> > the ACPI core for all ACPI devices, similar to the way the PCI core
> > reserves BAR space for all PCI devices, even if we don't have drivers
> > for them.  I've tried to fix this in the past, but it is really a
> > nightmare to unravel everything.
> > 
> > Because the ACPI core doesn't reserve resources for the _CRS of all
> > ACPI devices, we're already vulnerable to the problem of placing a
> > device on top of another ACPI device.  We don't see problems because
> > on x86, at least, most ACPI devices are already configured by the BIOS
> > to be enabled and non-overlapping.  But x86 has the advantage of
> > having extensive test coverage courtesy of Windows, and as long as
> > _CRS has the right stuff in it, we at least have the potential of
> > fixing problems in Linux.
> 
> ...
> By "fixing problems in Linux" above, you mean that, given that we
> do have a validated _CRS space, we can request/reserve the region the _CRS
> reports to prevent assigning those resources to other devices, correct ?

Yes.

I think part of what makes this difficult in Linux is that the
resource tree is too strict about overlapping resources.  We get
address space information from E820 (on x86), static ACPI tables like
MCFG, and dynamic things like ACPI _CRS.  There's no real requirement
that the BIOS should make all these consistent, but yet we try to jam
it all into the same resource tree.

For example, E820 might tell us that range A is reserved and
unavailable to Linux.  We stick it in the resource tree.  Then we
might have a _CRS that tells us about range B.  We *want* to put range
B in the resource tree, but if B overlaps part of A, the insert will
fail.

All we really need from E820 is the information that "you can't put
devices in A".  We don't need to enforce any relationship between A
and B, but the current resource tree imposes unnecessary hierarchical
requirements.

I think issues like this are the biggest reason why the ACPI core
doesn't reserve all _CRS space early on (Rafael may correct me here).

> > If the platform doesn't report resource usage correctly on ARM, we may
> > not find problems (because we don't have the Windows test suite) and
> > if we have resource assignment problems because _CRS is lacking, we'll
> > have no way to fix them.
> 
> And I think here you mean we can't prevent assigning resource space to
> devices that do not necessarily own it because since some devices _CRS
> are borked/missing we have no way to detect the address space allocated
> to them and we may end up with resources conflicts.

The ACPI core currently doesn't reserve the space consumed by ACPI
devices.  Some drivers, e.g., for PNP0C02 (motherboard) and PNP0A03
(PCI host bridge), do reserve their space, but the core itself does
not.  

If we have drivers for all the ACPI devices, those drivers will
probably use _CRS and reserve the space, and we'll probably notice any
_CRS errors.  But if we don't have drivers, e.g., for performance
monitors or other non-essential things, nothing will use _CRS, and
nothing will reserve the space used by the device, and it's hard to
find errors.

If we ever assign top-level resources, there's nothing to prevent us
from creating a conflict.  The only reason we don't trip over this is
that we usually don't assign top-level resources because firmware does
it for us.

> > > As per last Tomasz's patchset, we claim and assign unassigned PCI
> > > resources upon ACPI PCI host bridge probing (which happens at
> > > subsys_initcall time, courtesy of ACPI current code); at that time the
> > > kernel did not even register the PNP0c02 driver (drivers/pnp/system.c)
> > > (it does that at fs_initcall). On the other hand, we insert MCFG
> > > regions into the resource tree upon MCFG parsing, so I do not
> > > see why we need to rely on PNP0c02 to do that for us (granted, the
> > > mechanism is part of the PCI fw specs, which are x86 centric anyway
> > > ie we can't certainly rely on Int15 e820 to detect reserved memory
> > > on ARM :D)
> > > 
> > > There is lots of legacy x86 here and Bjorn definitely has more
> > > visibility into that than I have, the ARM world must understand
> > > how this works to make sure we have an agreement.
> > 
> > As you say, there is lots of unpleasant x86 legacy here.  Possibly ARM
> > has a chance to clean this up and do it more sanely; I'm not sure
> > whether it's feasible to reverse the ACPI/PCI init order there or not.
> > 
> > Rafael, any thoughts on this whole thing?
> > 
> > Bjorn
> > 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gabriele Paoloni March 3, 2016, 2:21 p.m. UTC | #18
Hi Lorenzo, many thanks for replying

> -----Original Message-----
> From: Lorenzo Pieralisi [mailto:lorenzo.pieralisi@arm.com]
> Sent: 29 February 2016 11:38
> To: Gabriele Paoloni
> Cc: Bjorn Helgaas; 'Mark Rutland'; Guohanjun (Hanjun Guo); Wangzhou
> (B); liudongdong (C); Linuxarm; qiujiang; 'bhelgaas@google.com';
> 'arnd@arndb.de'; 'tn@semihalf.com'; 'linux-pci@vger.kernel.org';
> 'linux-kernel@vger.kernel.org'; xuwei (O); 'linux-
> acpi@vger.kernel.org'; 'jcm@redhat.com'; zhangjukuo; Liguozhu
> (Kenneth); 'linux-arm-kernel@lists.infradead.org'
> Subject: Re: [RFC PATCH v3 3/3] PCI/ACPI: hisi: Add ACPI support for
> HiSilicon SoCs Host Controllers
> 
> > > > > > I think the relevant spec is the PCI Firmware Spec, r3.0, sec
> > > 4.1.2.
> > > > > > Note 2 in that section says the address range of an MMCFG
> region
> > > > > > must be reserved by declaring a motherboard resource, i.e.,
> > > included
> > > > > > in the _CRS of a PNP0C02 or similar device.
> > > > >
> > > > > I had a look a this. So yes the specs says that we should use
> the
> > > > > PNP0C02 device if MCFG is not supported.
> > > >
> > > > AFAIK, PNP0C02 is a resource reservation mechanism, the spec says
> > > that
> > > > MCFG regions must be reserved using PNP0C02, even though its
> > > > current usage on x86 is a bit unfathomable to me, in particular
> > > > in relation to MCFG resources retrieved for hotpluggable bridges
> (ie
> > > > through _CBA, which I think consider the MCFG region as reserved
> > > > by default, regardless of PNP0c02):
> > > >
> > > > see pci_mmcfg_check_reserved() arch/x86/pci/mmconfig-shared.c
> >
> > Yes I checked this and it seems to check if an area of memory from
> > MCFG is overlapping with any area of memory specified by PNP0C02
> > _CRS...
> >
> > However (maybe I am wrong) it looks to me that this part works
> > independently of the PNP0c02 driver. It seems that goes directly
> > to walk the ACPI namespace and look for PNP0C02 HID; as it finds it,
> > it checks the range of memory specified in the _CRS method and see
> > if it overlaps with the MCFG resource...am I missing something?
> 
> Well, if I understand the code correctly, the x86 MCFG code, for static
> MCFG tables, check that the MCFG regions are part of motherboard
> resources (by walking the ACPI namespace as you said). If that's the
> case,
> it does not insert resources into the resource tree till a
> late_initcall,
> (pci_mmcfg_late_insert_resources()) that should run after the PNP0C02
> driver
> was initialized (?) (I guess, that's a nightmare to understand these
> initcall
> ordering unwritten rules dependencies, if they exist). If the MCFG
> region is
> not part of motherboard resources it is discarded (ie
> pci_mmconfig_insert()
> in arch/x86/pci/mmconfig-shared.c)

Yes it looks like at boot time it parses the ACPI namespace, if a
cfg region is marked as reserved, then it is mapped (cfg->virt is
assigned) and is added to pci_mmcfg_list so that later on this list
can be used by pci_mmcfg_late_insert_resources() to insert the cfg->res
in the iomem_resource resource tree.

However there are some things that are not clear to me:
-  if an MCFG region is not marked as reserved, is it just discarded by
   the x86 framework?
   I see that in the Nowicki generic "pci_mmconfig_map_resource" we just
   map the mcfg config regions with no need for them to be reserved...?
-  I do not understand the role of the PNP system driver. It looks like
   this also inserts resources under iomem_resource resource tree (in
   Case of memory resources)...

Thanks

Gab

> 
> I really do not know why x86 code has been implemented like that
> and whatever I say it is just speculation, honestly it is not easy
> to understand.
> 
> > If my interpretation is correct, couldn't we just modify
> > pci_mmconfig_map_resource() in the latest Nowicki patchset and add
> > a similar check before insert_resource_conflict() is called?
> 
> Yes, but I am not sure that would help much. We can prevent adding
> MCFG resources to the resource tree if they are not declared as
> motherboard
> resources, but if they do it is totally unclear to me what we should
> do.
> 
> Should we insert the MCFG region resources into the resource tree
> before
> PNP0C02 driver has a chance to be probed ? Or do what x86 does (ie it
> does
> not insert resources into the resource tree till late_initcall
> pci_mmcfg_late_insert_resources()) ? I do not know. If the PNP0C02
> driver stays as it is, whatever we do with PNP0C02 regions does not
> make
> much sense to me, that's the gist of what I am saying, I don't know why
> x86 code was implemented like that, I will go through the commits
> history
> to find some light here.
> 
> Lorenzo
> 
> >
> > On the other side HiSilicon host bridge quirks could use the address
> > retrieved by the _CRS method of PNP0C02 for our root complex config
> > rd/wr...?
> >
> > >
> > > I don't know how _CBA-related resources would be reserved.  I
> haven't
> > > personally worked with any host bridges that supply _CBA, so I
> don't
> > > know whether or how they handled it.
> > >
> > > I think the spec intent was that the Consumer/Producer bit
> (Extended
> > > Address Space Descriptor, General Flags Bit[0], see ACPI spec sec
> > > 6.4.3.5.4) would be used.  Resources such as ECAM areas would be
> > > marked "Consumer", meaning the bridge consumed that space itself,
> and
> > > windows passed down to the PCI bus would be marked "Producer".
> > >
> > > But BIOSes didn't use that bit consistently, so we couldn't rely on
> > > it.  I verified experimentally that Windows didn't pay attention to
> > > that bit either, at least for DWord descriptors:
> > > https://bugzilla.kernel.org/show_bug.cgi?id=15701
> > >
> > > It's conceivable that we could still use that bit in Extended
> Address
> > > Space descriptors, or maybe some hack like pay attention if the
> bridge
> > > has _CBA, or some such.  Or maybe a BIOS could add a PNP0C02 device
> > > along with the PNP0A03 device that uses _CBA, with the PNP0C02 _CRS
> > > describing the ECAM area referenced by _CBA.  Seeems hacky no
> matter
> > > how we slice it.
> >
> > Well about this I don't know much but, having looked at the bugzilla
> > and considering the current mechanism used by
> pci_mmcfg_check_reserved()
> > I have the feeling that this last one is easier to implement and it
> seems
> > the one currently used (in mmconfig-shared.c )
> >
> > Cheers
> >
> > Gab
> >
> > >
> > > > Have a look at drivers/pnp/system.c for PNP0c02
> > > >
> > > > > So probably I can use acpi_get_devices("PNP0C02",...) to
> retrieve
> > > it
> > > > > from the quirk match function, I will look into this...
> > > > >
> > > > > >
> > > > > > > On the other side, since this is an exception only for the
> > > config
> > > > > > > space address of our host controller (as said before all
> the
> > > buses
> > > > > > > below the root one support ECAM), I think that it is right
> to
> > > put
> > > > > > > this address as a device specific data (in fact the rest of
> the
> > > > > > > config space addresses will be parsed from MCFG).
> > > > > >
> > > > > > A kernel with no support for your host controller (and thus
> no
> > > > > > knowledge of its _DSD) should still be able to operate the
> rest
> > > of the
> > > > > > system correctly.  That means we must have a generic way to
> learn
> > > what
> > > > > > address space is consumed by the host controller so we don't
> try
> > > to
> > > > > > assign it to other devices.
> > > > >
> > > > > This is something I don't understand much...
> > > > > Are you talking about a scenario where we have a Kernel image
> > > compiled
> > > > > without our host controller support and running on our
> platform?
> > > >
> > > > I *think* the point here is that your host controller config
> space
> > > should be
> > > > reserved through PNP0c02 so that the kernel will reserve it
> through
> > > the
> > > > generic PNP0c02 driver even if your host controller driver (and
> > > related
> > > > _DSD) is not supported in the kernel.
> > >
> > > Right.  Assume you have two top-level devices:
> > >
> > >   PNP0A03  PCI host bridge
> > >     _CRS     describes windows
> > >     ????     describes ECAM space consumed
> > >   PNPxxxx  another ACPI device, currently disabled
> > >     _PRS     specifies possible resource settings, may specify no
> > > restrictions
> > >     _SRS     assign resources and enable device
> > >     _CRS     empty until device enabled
> > >
> > > When the OS enables PNPxxxx, it must first assign resources to it
> > > using _PRS and _SRS.  We evaluate _PRS to find out what the
> addresses
> > > PNPxxxx can support.  This tells us things like how wide the
> address
> > > decoder is, the size of the region required, and any alignment
> > > restrictions -- basically the same information we get by sizing a
> PCI
> > > BAR.
> > >
> > > Now, how do we assign space for PNPxxxx?  In a few cases, _PRS has
> > > only a few specific possibilities, e.g., an x86 legacy serial port
> > > that can be at 0x3f8 or 0x2f8.  But in general, _PRS need not
> impose
> > > any restrictions.
> > >
> > > So in general the OS can use any space that can be routed to
> PNPxxxx.
> > > If there's an upstream bridge, it may define windows that restrict
> the
> > > possibilities.  But in this case, there *is* no upstream bridge, so
> > > the possible choices are the entire physical address space of the
> > > platform, except for other things that are already allocated: RAM,
> the
> > > _CRS settings for other ACPI devices, things reserved by the E820
> > > table (at least on x86), etc.
> > >
> > > If PNP0A03 consumes address space for ECAM, that space must be
> > > reported *somewhere* so the OS knows not to place PNPxxxx there.
> This
> > > reporting must be generic (not device-specific like _DSD).  The
> ACPI
> > > core (not drivers) is responsible for managing this address space
> > > because:
> > >
> > >   a) the OS is not guaranteed to have drivers for all devices, and
> > >
> > >   b) even it *did* have drivers for all devices, the PNPxxxx space
> may
> > >   be assigned before drivers are initialized.
> > >
> > > > I do not understand how PNP0c02 works, currently, by the way.
> > > >
> > > > If I read x86 code correctly, the unassigned PCI bus resources
> are
> > > > assigned in arch/x86/pci/i386.c (?)
> > > fs_initcall(pcibios_assign_resources),
> > > > with a comment:
> > > >
> > > > /**
> > > >  * called in fs_initcall (one below subsys_initcall),
> > > >  * give a chance for motherboard reserve resources
> > > >  */
> > > >
> > > > Problem is, motherboard resources are requested through (?):
> > > >
> > > > drivers/pnp/system.c
> > > >
> > > > which is also initialized at fs_initcall, so it might be called
> after
> > > > core x86 code reassign resources, defeating the purpose PNP0c02
> was
> > > > designed for, namely, request motherboard regions before
> resources
> > > > are assigned, am I wrong ?
> > >
> > > I think you're right.  This is a long-standing screwup in Linux.
> > > IMHO, ACPI resources should be parsed and reserved by the ACPI
> core,
> > > before any PCI resource management (since PCI host bridges are
> > > represented in ACPI).  But historically PCI devices have enumerated
> > > before ACPI got involved.  And the ACPI core doesn't really pay
> > > attention to _CRS for most devices (with the exception of PNP0C02).
> > >
> > > IMO the PNP0C02 code in drivers/pnp/system.c should really be done
> in
> > > the ACPI core for all ACPI devices, similar to the way the PCI core
> > > reserves BAR space for all PCI devices, even if we don't have
> drivers
> > > for them.  I've tried to fix this in the past, but it is really a
> > > nightmare to unravel everything.
> > >
> > > Because the ACPI core doesn't reserve resources for the _CRS of all
> > > ACPI devices, we're already vulnerable to the problem of placing a
> > > device on top of another ACPI device.  We don't see problems
> because
> > > on x86, at least, most ACPI devices are already configured by the
> BIOS
> > > to be enabled and non-overlapping.  But x86 has the advantage of
> > > having extensive test coverage courtesy of Windows, and as long as
> > > _CRS has the right stuff in it, we at least have the potential of
> > > fixing problems in Linux.
> > >
> > > If the platform doesn't report resource usage correctly on ARM, we
> may
> > > not find problems (because we don't have the Windows test suite)
> and
> > > if we have resource assignment problems because _CRS is lacking,
> we'll
> > > have no way to fix them.
> > >
> > > > As per last Tomasz's patchset, we claim and assign unassigned PCI
> > > > resources upon ACPI PCI host bridge probing (which happens at
> > > > subsys_initcall time, courtesy of ACPI current code); at that
> time
> > > the
> > > > kernel did not even register the PNP0c02 driver
> > > (drivers/pnp/system.c)
> > > > (it does that at fs_initcall). On the other hand, we insert MCFG
> > > > regions into the resource tree upon MCFG parsing, so I do not
> > > > see why we need to rely on PNP0c02 to do that for us (granted,
> the
> > > > mechanism is part of the PCI fw specs, which are x86 centric
> anyway
> > > > ie we can't certainly rely on Int15 e820 to detect reserved
> memory
> > > > on ARM :D)
> > > >
> > > > There is lots of legacy x86 here and Bjorn definitely has more
> > > > visibility into that than I have, the ARM world must understand
> > > > how this works to make sure we have an agreement.
> > >
> > > As you say, there is lots of unpleasant x86 legacy here.  Possibly
> ARM
> > > has a chance to clean this up and do it more sanely; I'm not sure
> > > whether it's feasible to reverse the ACPI/PCI init order there or
> not.
> > >
> > > Rafael, any thoughts on this whole thing?
> > >
> > > Bjorn
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-
> pci" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >
Gabriele Paoloni March 9, 2016, 7:41 a.m. UTC | #19
Hi Bjorn, Lorenzo

> -----Original Message-----
> From: Bjorn Helgaas [mailto:helgaas@kernel.org]
> Sent: 02 March 2016 15:51
> To: Lorenzo Pieralisi
> Cc: Gabriele Paoloni; 'Mark Rutland'; Guohanjun (Hanjun Guo); Wangzhou
> (B); liudongdong (C); Linuxarm; qiujiang; 'bhelgaas@google.com';
> 'arnd@arndb.de'; 'tn@semihalf.com'; 'linux-pci@vger.kernel.org';
> 'linux-kernel@vger.kernel.org'; xuwei (O); 'linux-
> acpi@vger.kernel.org'; 'jcm@redhat.com'; zhangjukuo; Liguozhu
> (Kenneth); 'linux-arm-kernel@lists.infradead.org'
> Subject: Re: [RFC PATCH v3 3/3] PCI/ACPI: hisi: Add ACPI support for
> HiSilicon SoCs Host Controllers
> 
> On Tue, Mar 01, 2016 at 07:22:47PM +0000, Lorenzo Pieralisi wrote:
> > Hi Bjorn,
> >
> > On Thu, Feb 25, 2016 at 01:59:12PM -0600, Bjorn Helgaas wrote:
> > > On Thu, Feb 25, 2016 at 12:07:50PM +0000, Lorenzo Pieralisi wrote:
> > > > On Thu, Feb 25, 2016 at 03:01:19AM +0000, Gabriele Paoloni wrote:
> >
> > [...]
> >
> > > > I do not understand how PNP0c02 works, currently, by the way.
> > > >
> > > > If I read x86 code correctly, the unassigned PCI bus resources
> are
> > > > assigned in arch/x86/pci/i386.c (?)
> fs_initcall(pcibios_assign_resources),
> > > > with a comment:
> > > >
> > > > /**
> > > >  * called in fs_initcall (one below subsys_initcall),
> > > >  * give a chance for motherboard reserve resources
> > > >  */
> > > >
> > > > Problem is, motherboard resources are requested through (?):
> > > >
> > > > drivers/pnp/system.c
> > > >
> > > > which is also initialized at fs_initcall, so it might be called
> after
> > > > core x86 code reassign resources, defeating the purpose PNP0c02
> was
> > > > designed for, namely, request motherboard regions before
> resources
> > > > are assigned, am I wrong ?
> > >
> > > I think you're right.  This is a long-standing screwup in Linux.
> > > IMHO, ACPI resources should be parsed and reserved by the ACPI
> core,
> > > before any PCI resource management (since PCI host bridges are
> > > represented in ACPI).  But historically PCI devices have enumerated
> > > before ACPI got involved.  And the ACPI core doesn't really pay
> > > attention to _CRS for most devices (with the exception of PNP0C02).
> > >
> > > IMO the PNP0C02 code in drivers/pnp/system.c should really be done
> in
> > > the ACPI core for all ACPI devices, similar to the way the PCI core
> > > reserves BAR space for all PCI devices, even if we don't have
> drivers
> > > for them.  I've tried to fix this in the past, but it is really a
> > > nightmare to unravel everything.
> > >
> > > Because the ACPI core doesn't reserve resources for the _CRS of all
> > > ACPI devices, we're already vulnerable to the problem of placing a
> > > device on top of another ACPI device.  We don't see problems
> because
> > > on x86, at least, most ACPI devices are already configured by the
> BIOS
> > > to be enabled and non-overlapping.  But x86 has the advantage of
> > > having extensive test coverage courtesy of Windows, and as long as
> > > _CRS has the right stuff in it, we at least have the potential of
> > > fixing problems in Linux.
> >
> > ...
> > By "fixing problems in Linux" above, you mean that, given that we
> > do have a validated _CRS space, we can request/reserve the region the
> _CRS
> > reports to prevent assigning those resources to other devices,
> correct ?
> 
> Yes.
> 
> I think part of what makes this difficult in Linux is that the
> resource tree is too strict about overlapping resources.  We get
> address space information from E820 (on x86), static ACPI tables like
> MCFG, and dynamic things like ACPI _CRS.  There's no real requirement
> that the BIOS should make all these consistent, but yet we try to jam
> it all into the same resource tree.
> 
> For example, E820 might tell us that range A is reserved and
> unavailable to Linux.  We stick it in the resource tree.  Then we
> might have a _CRS that tells us about range B.  We *want* to put range
> B in the resource tree, but if B overlaps part of A, the insert will
> fail.
> 
> All we really need from E820 is the information that "you can't put
> devices in A".  We don't need to enforce any relationship between A
> and B, but the current resource tree imposes unnecessary hierarchical
> requirements.
> 
> I think issues like this are the biggest reason why the ACPI core
> doesn't reserve all _CRS space early on (Rafael may correct me here).
> 
> > > If the platform doesn't report resource usage correctly on ARM, we
> may
> > > not find problems (because we don't have the Windows test suite)
> and
> > > if we have resource assignment problems because _CRS is lacking,
> we'll
> > > have no way to fix them.
> >
> > And I think here you mean we can't prevent assigning resource space
> to
> > devices that do not necessarily own it because since some devices
> _CRS
> > are borked/missing we have no way to detect the address space
> allocated
> > to them and we may end up with resources conflicts.
> 
> The ACPI core currently doesn't reserve the space consumed by ACPI
> devices.  Some drivers, e.g., for PNP0C02 (motherboard) and PNP0A03
> (PCI host bridge), do reserve their space, but the core itself does
> not.
> 
> If we have drivers for all the ACPI devices, those drivers will
> probably use _CRS and reserve the space, and we'll probably notice any
> _CRS errors.  But if we don't have drivers, e.g., for performance
> monitors or other non-essential things, nothing will use _CRS, and
> nothing will reserve the space used by the device, and it's hard to
> find errors.
> 
> If we ever assign top-level resources, there's nothing to prevent us
> from creating a conflict.  The only reason we don't trip over this is
> that we usually don't assign top-level resources because firmware does
> it for us.

It seems that in this thread we have touched quite few issues.

First is how to describe a PCI host controller config space resource:
as you highlighted before mentioning the specs, these resources should
be marked as "PNP0C02"; therefore I guess the current Nowicki patchset
must be reworked to check the resources to be motherboard reserved when
parsing the MCFG table.

Also with respect to the ACPI table for my specific PCIe controller I
would use the following approach:

	// PCIe Root bus
	Device (PCI1)
	{
		Name (_HID, "HISI0080") // PCI Express Root Bridge
		Name (_CID, "PNP0A03") // Compatible PCI Root Bridge
		Name(_SEG, 1) // Segment of this Root complex
		Name(_BBN, 0) // Base Bus Number
		Name(_CCA, 1)
		Method (_CRS, 0, Serialized) { // Root complex resources
			Name (RBUF, ResourceTemplate () {
				WordBusNumber ( // Bus numbers assigned to this root
					ResourceProducer, MinFixed, MaxFixed, PosDecode,
					0, // AddressGranularity
					0, // AddressMinimum - Minimum Bus Number
					63, // AddressMaximum - Maximum Bus Number
					0, // AddressTranslation - Set to 0
					64 // RangeLength - Number of Busses
				)
				QWordMemory ( // 64-bit BAR Windows
					ResourceProducer,
					PosDecode,
					MinFixed,
					MaxFixed,
					Cacheable,
					ReadWrite,
					0x0000000000000000, // Granularity
					0x00000000b0000000, // Min Base Address pci address
					0x00000000bbfeffff, // Max Base Address
					0x0000021f54000000, // Translate
					0x000000000bff0000 // Length
				)
				QWordIO (
					ResourceProducer,
					MinFixed,
					MaxFixed,
					PosDecode,
					EntireRange,
					0x0000000000000000, // Granularity
					0x0000000000000000, // Min Base Address
					0x000000000000ffff, // Max Base Address
					0x000002200fff0000, // Translate
					0x0000000000010000 // Length
				)
			}) // Name(RBUF)
			Return (RBUF)
		} // Method(_CRS)
		Device (RES0)
		{
			Name (_HID, "HISI0081") // HiSi PCIe RC config base address
			Name (_CID, "PNP0C02")  // Motherboard reserved resource
			Name (_CRS, ResourceTemplate (){
                         Memory32Fixed (ReadWrite, 0xb0080000 , 0x10000)
			})
			
		}
   
So in the table above I have a sub-device under the RC to pass the address
for the RC config space (the rest of the config space addresses for bus 1
to 63 are passed in the MCFG). As you can see the device _CID is PNP0C02
As per ACPI specs.
Do you see anything wrong with this approach?
 
The second issue is when and how to reserve HW resources. As per
conversation above this seems quite a tricky issue and probably needs to
consider different aspects...

I was wondering if we can take a gradual approach; maybe for the time being
we can just rework Nowicki patchset to check the MCFG resources to be
motherboard reserved and later on we can make an effort to fix the resource
insertion mechanism making sure that it works right on both x86 and ARM.

What do you think about?

Many Thanks

Gab

> 
> > > > As per last Tomasz's patchset, we claim and assign unassigned PCI
> > > > resources upon ACPI PCI host bridge probing (which happens at
> > > > subsys_initcall time, courtesy of ACPI current code); at that
> time the
> > > > kernel did not even register the PNP0c02 driver
> (drivers/pnp/system.c)
> > > > (it does that at fs_initcall). On the other hand, we insert MCFG
> > > > regions into the resource tree upon MCFG parsing, so I do not
> > > > see why we need to rely on PNP0c02 to do that for us (granted,
> the
> > > > mechanism is part of the PCI fw specs, which are x86 centric
> anyway
> > > > ie we can't certainly rely on Int15 e820 to detect reserved
> memory
> > > > on ARM :D)
> > > >
> > > > There is lots of legacy x86 here and Bjorn definitely has more
> > > > visibility into that than I have, the ARM world must understand
> > > > how this works to make sure we have an agreement.
> > >
> > > As you say, there is lots of unpleasant x86 legacy here.  Possibly
> ARM
> > > has a chance to clean this up and do it more sanely; I'm not sure
> > > whether it's feasible to reverse the ACPI/PCI init order there or
> not.
> > >
> > > Rafael, any thoughts on this whole thing?
> > >
> > > Bjorn
> > >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-acpi"
> in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bjorn Helgaas March 14, 2016, 7:16 p.m. UTC | #20
On Wed, Mar 09, 2016 at 07:41:01AM +0000, Gabriele Paoloni wrote:
> Hi Bjorn, Lorenzo
> 
> > -----Original Message-----
> > From: Bjorn Helgaas [mailto:helgaas@kernel.org]
> > Sent: 02 March 2016 15:51
> > To: Lorenzo Pieralisi
> > Cc: Gabriele Paoloni; 'Mark Rutland'; Guohanjun (Hanjun Guo); Wangzhou
> > (B); liudongdong (C); Linuxarm; qiujiang; 'bhelgaas@google.com';
> > 'arnd@arndb.de'; 'tn@semihalf.com'; 'linux-pci@vger.kernel.org';
> > 'linux-kernel@vger.kernel.org'; xuwei (O); 'linux-
> > acpi@vger.kernel.org'; 'jcm@redhat.com'; zhangjukuo; Liguozhu
> > (Kenneth); 'linux-arm-kernel@lists.infradead.org'
> > Subject: Re: [RFC PATCH v3 3/3] PCI/ACPI: hisi: Add ACPI support for
> > HiSilicon SoCs Host Controllers
> > 
> > On Tue, Mar 01, 2016 at 07:22:47PM +0000, Lorenzo Pieralisi wrote:
> > > Hi Bjorn,
> > >
> > > On Thu, Feb 25, 2016 at 01:59:12PM -0600, Bjorn Helgaas wrote:
> > > > On Thu, Feb 25, 2016 at 12:07:50PM +0000, Lorenzo Pieralisi wrote:
> > > > > On Thu, Feb 25, 2016 at 03:01:19AM +0000, Gabriele Paoloni wrote:
> > >
> > > [...]
> > >
> > > > > I do not understand how PNP0c02 works, currently, by the way.
> > > > >
> > > > > If I read x86 code correctly, the unassigned PCI bus resources
> > are
> > > > > assigned in arch/x86/pci/i386.c (?)
> > fs_initcall(pcibios_assign_resources),
> > > > > with a comment:
> > > > >
> > > > > /**
> > > > >  * called in fs_initcall (one below subsys_initcall),
> > > > >  * give a chance for motherboard reserve resources
> > > > >  */
> > > > >
> > > > > Problem is, motherboard resources are requested through (?):
> > > > >
> > > > > drivers/pnp/system.c
> > > > >
> > > > > which is also initialized at fs_initcall, so it might be called
> > after
> > > > > core x86 code reassign resources, defeating the purpose PNP0c02
> > was
> > > > > designed for, namely, request motherboard regions before
> > resources
> > > > > are assigned, am I wrong ?
> > > >
> > > > I think you're right.  This is a long-standing screwup in Linux.
> > > > IMHO, ACPI resources should be parsed and reserved by the ACPI
> > core,
> > > > before any PCI resource management (since PCI host bridges are
> > > > represented in ACPI).  But historically PCI devices have enumerated
> > > > before ACPI got involved.  And the ACPI core doesn't really pay
> > > > attention to _CRS for most devices (with the exception of PNP0C02).
> > > >
> > > > IMO the PNP0C02 code in drivers/pnp/system.c should really be done
> > in
> > > > the ACPI core for all ACPI devices, similar to the way the PCI core
> > > > reserves BAR space for all PCI devices, even if we don't have
> > drivers
> > > > for them.  I've tried to fix this in the past, but it is really a
> > > > nightmare to unravel everything.
> > > >
> > > > Because the ACPI core doesn't reserve resources for the _CRS of all
> > > > ACPI devices, we're already vulnerable to the problem of placing a
> > > > device on top of another ACPI device.  We don't see problems
> > because
> > > > on x86, at least, most ACPI devices are already configured by the
> > BIOS
> > > > to be enabled and non-overlapping.  But x86 has the advantage of
> > > > having extensive test coverage courtesy of Windows, and as long as
> > > > _CRS has the right stuff in it, we at least have the potential of
> > > > fixing problems in Linux.
> > >
> > > ...
> > > By "fixing problems in Linux" above, you mean that, given that we
> > > do have a validated _CRS space, we can request/reserve the region the
> > _CRS
> > > reports to prevent assigning those resources to other devices,
> > correct ?
> > 
> > Yes.
> > 
> > I think part of what makes this difficult in Linux is that the
> > resource tree is too strict about overlapping resources.  We get
> > address space information from E820 (on x86), static ACPI tables like
> > MCFG, and dynamic things like ACPI _CRS.  There's no real requirement
> > that the BIOS should make all these consistent, but yet we try to jam
> > it all into the same resource tree.
> > 
> > For example, E820 might tell us that range A is reserved and
> > unavailable to Linux.  We stick it in the resource tree.  Then we
> > might have a _CRS that tells us about range B.  We *want* to put range
> > B in the resource tree, but if B overlaps part of A, the insert will
> > fail.
> > 
> > All we really need from E820 is the information that "you can't put
> > devices in A".  We don't need to enforce any relationship between A
> > and B, but the current resource tree imposes unnecessary hierarchical
> > requirements.
> > 
> > I think issues like this are the biggest reason why the ACPI core
> > doesn't reserve all _CRS space early on (Rafael may correct me here).
> > 
> > > > If the platform doesn't report resource usage correctly on ARM, we
> > may
> > > > not find problems (because we don't have the Windows test suite)
> > and
> > > > if we have resource assignment problems because _CRS is lacking,
> > we'll
> > > > have no way to fix them.
> > >
> > > And I think here you mean we can't prevent assigning resource space
> > to
> > > devices that do not necessarily own it because since some devices
> > _CRS
> > > are borked/missing we have no way to detect the address space
> > allocated
> > > to them and we may end up with resources conflicts.
> > 
> > The ACPI core currently doesn't reserve the space consumed by ACPI
> > devices.  Some drivers, e.g., for PNP0C02 (motherboard) and PNP0A03
> > (PCI host bridge), do reserve their space, but the core itself does
> > not.
> > 
> > If we have drivers for all the ACPI devices, those drivers will
> > probably use _CRS and reserve the space, and we'll probably notice any
> > _CRS errors.  But if we don't have drivers, e.g., for performance
> > monitors or other non-essential things, nothing will use _CRS, and
> > nothing will reserve the space used by the device, and it's hard to
> > find errors.
> > 
> > If we ever assign top-level resources, there's nothing to prevent us
> > from creating a conflict.  The only reason we don't trip over this is
> > that we usually don't assign top-level resources because firmware does
> > it for us.
> 
> It seems that in this thread we have touched quite few issues.
> 
> First is how to describe a PCI host controller config space resource:
> as you highlighted before mentioning the specs, these resources should
> be marked as "PNP0C02"; therefore I guess the current Nowicki patchset
> must be reworked to check the resources to be motherboard reserved when
> parsing the MCFG table.

I think checking whether MCFG resources are reserved by motherboard
("PNP0C02") devices is a hack that was added on x86 because there were
issues getting ECAM to work reliably.  The theory at the time was that
the problem was BIOS bugs.  I don't know whether that's actually true
or not.

I'm not convinced that checking for PNP0C02 resources is something we
should do in generic code.  MCFG is a static table, and I don't think
we should add dependencies on the ACPI namespace, because the point of
static tables is to describe things we might need before the namespace
is available.

> Also with respect to the ACPI table for my specific PCIe controller I
> would use the following approach:
> 
> 	// PCIe Root bus
> 	Device (PCI1)
> 	{
> 		Name (_HID, "HISI0080") // PCI Express Root Bridge
> 		Name (_CID, "PNP0A03") // Compatible PCI Root Bridge
> 		Name(_SEG, 1) // Segment of this Root complex
> 		Name(_BBN, 0) // Base Bus Number
> 		Name(_CCA, 1)
> 		Method (_CRS, 0, Serialized) { // Root complex resources
> 			Name (RBUF, ResourceTemplate () {
> 				WordBusNumber ( // Bus numbers assigned to this root
> 					ResourceProducer, MinFixed, MaxFixed, PosDecode,
> 					0, // AddressGranularity
> 					0, // AddressMinimum - Minimum Bus Number
> 					63, // AddressMaximum - Maximum Bus Number
> 					0, // AddressTranslation - Set to 0
> 					64 // RangeLength - Number of Busses
> 				)
> 				QWordMemory ( // 64-bit BAR Windows
> 					ResourceProducer,
> 					PosDecode,
> 					MinFixed,
> 					MaxFixed,
> 					Cacheable,
> 					ReadWrite,
> 					0x0000000000000000, // Granularity
> 					0x00000000b0000000, // Min Base Address pci address
> 					0x00000000bbfeffff, // Max Base Address
> 					0x0000021f54000000, // Translate
> 					0x000000000bff0000 // Length
> 				)
> 				QWordIO (
> 					ResourceProducer,
> 					MinFixed,
> 					MaxFixed,
> 					PosDecode,
> 					EntireRange,
> 					0x0000000000000000, // Granularity
> 					0x0000000000000000, // Min Base Address
> 					0x000000000000ffff, // Max Base Address
> 					0x000002200fff0000, // Translate
> 					0x0000000000010000 // Length
> 				)
> 			}) // Name(RBUF)
> 			Return (RBUF)
> 		} // Method(_CRS)
> 		Device (RES0)
> 		{
> 			Name (_HID, "HISI0081") // HiSi PCIe RC config base address
> 			Name (_CID, "PNP0C02")  // Motherboard reserved resource
> 			Name (_CRS, ResourceTemplate (){
>                          Memory32Fixed (ReadWrite, 0xb0080000 , 0x10000)
> 			})
> 			
> 		}
>    
> So in the table above I have a sub-device under the RC to pass the address
> for the RC config space (the rest of the config space addresses for bus 1
> to 63 are passed in the MCFG). As you can see the device _CID is PNP0C02
> As per ACPI specs.
> Do you see anything wrong with this approach?

It looks OK to me.  The PCI Firmware spec r3.0, sec 4.1.2, does say
that the motherboard resource would usually appear at the root of the
namespace (under \_SB).  That's not an absolute statement, but I don't
know why it would be included at all unless the authors thought it was
important for some reason.

> The second issue is when and how to reserve HW resources. As per
> conversation above this seems quite a tricky issue and probably needs to
> consider different aspects...

I don't think resources should be reserved based on MCFG.  Maybe we
need to reserve MCFG areas on x86 for legacy reasons, but I don't
think we should do it on arm64.

> I was wondering if we can take a gradual approach; maybe for the time being
> we can just rework Nowicki patchset to check the MCFG resources to be
> motherboard reserved and later on we can make an effort to fix the resource
> insertion mechanism making sure that it works right on both x86 and ARM.
> 
> What do you think about?
> 
> Many Thanks
> 
> Gab
> 
> > 
> > > > > As per last Tomasz's patchset, we claim and assign unassigned PCI
> > > > > resources upon ACPI PCI host bridge probing (which happens at
> > > > > subsys_initcall time, courtesy of ACPI current code); at that
> > time the
> > > > > kernel did not even register the PNP0c02 driver
> > (drivers/pnp/system.c)
> > > > > (it does that at fs_initcall). On the other hand, we insert MCFG
> > > > > regions into the resource tree upon MCFG parsing, so I do not
> > > > > see why we need to rely on PNP0c02 to do that for us (granted,
> > the
> > > > > mechanism is part of the PCI fw specs, which are x86 centric
> > anyway
> > > > > ie we can't certainly rely on Int15 e820 to detect reserved
> > memory
> > > > > on ARM :D)
> > > > >
> > > > > There is lots of legacy x86 here and Bjorn definitely has more
> > > > > visibility into that than I have, the ARM world must understand
> > > > > how this works to make sure we have an agreement.
> > > >
> > > > As you say, there is lots of unpleasant x86 legacy here.  Possibly
> > ARM
> > > > has a chance to clean this up and do it more sanely; I'm not sure
> > > > whether it's feasible to reverse the ACPI/PCI init order there or
> > not.
> > > >
> > > > Rafael, any thoughts on this whole thing?
> > > >
> > > > Bjorn
> > > >
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-acpi"
> > in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gabriele Paoloni March 15, 2016, 11:13 a.m. UTC | #21
Hi Bjorn, many thanks for coming back on this

> -----Original Message-----
> From: Bjorn Helgaas [mailto:helgaas@kernel.org]
> Sent: 14 March 2016 19:17
> To: Gabriele Paoloni
> Cc: Lorenzo Pieralisi; 'Mark Rutland'; Guohanjun (Hanjun Guo); Wangzhou
> (B); liudongdong (C); Linuxarm; qiujiang; 'bhelgaas@google.com';
> 'arnd@arndb.de'; 'tn@semihalf.com'; 'linux-pci@vger.kernel.org';
> 'linux-kernel@vger.kernel.org'; xuwei (O); 'linux-
> acpi@vger.kernel.org'; 'jcm@redhat.com'; zhangjukuo; Liguozhu
> (Kenneth); 'linux-arm-kernel@lists.infradead.org'
> Subject: Re: [RFC PATCH v3 3/3] PCI/ACPI: hisi: Add ACPI support for
> HiSilicon SoCs Host Controllers
> 
> On Wed, Mar 09, 2016 at 07:41:01AM +0000, Gabriele Paoloni wrote:
> > Hi Bjorn, Lorenzo
> >
> > > -----Original Message-----
> > > From: Bjorn Helgaas [mailto:helgaas@kernel.org]
> > > Sent: 02 March 2016 15:51
> > > To: Lorenzo Pieralisi
> > > Cc: Gabriele Paoloni; 'Mark Rutland'; Guohanjun (Hanjun Guo);
> Wangzhou
> > > (B); liudongdong (C); Linuxarm; qiujiang; 'bhelgaas@google.com';
> > > 'arnd@arndb.de'; 'tn@semihalf.com'; 'linux-pci@vger.kernel.org';
> > > 'linux-kernel@vger.kernel.org'; xuwei (O); 'linux-
> > > acpi@vger.kernel.org'; 'jcm@redhat.com'; zhangjukuo; Liguozhu
> > > (Kenneth); 'linux-arm-kernel@lists.infradead.org'
> > > Subject: Re: [RFC PATCH v3 3/3] PCI/ACPI: hisi: Add ACPI support
> for
> > > HiSilicon SoCs Host Controllers
> > >
> > > On Tue, Mar 01, 2016 at 07:22:47PM +0000, Lorenzo Pieralisi wrote:
> > > > Hi Bjorn,
> > > >
> > > > On Thu, Feb 25, 2016 at 01:59:12PM -0600, Bjorn Helgaas wrote:
> > > > > On Thu, Feb 25, 2016 at 12:07:50PM +0000, Lorenzo Pieralisi
> wrote:
> > > > > > On Thu, Feb 25, 2016 at 03:01:19AM +0000, Gabriele Paoloni
> wrote:
> > > >
> > > > [...]
> > > >
> > > > > > I do not understand how PNP0c02 works, currently, by the way.
> > > > > >
> > > > > > If I read x86 code correctly, the unassigned PCI bus
> resources
> > > are
> > > > > > assigned in arch/x86/pci/i386.c (?)
> > > fs_initcall(pcibios_assign_resources),
> > > > > > with a comment:
> > > > > >
> > > > > > /**
> > > > > >  * called in fs_initcall (one below subsys_initcall),
> > > > > >  * give a chance for motherboard reserve resources
> > > > > >  */
> > > > > >
> > > > > > Problem is, motherboard resources are requested through (?):
> > > > > >
> > > > > > drivers/pnp/system.c
> > > > > >
> > > > > > which is also initialized at fs_initcall, so it might be
> called
> > > after
> > > > > > core x86 code reassign resources, defeating the purpose
> PNP0c02
> > > was
> > > > > > designed for, namely, request motherboard regions before
> > > resources
> > > > > > are assigned, am I wrong ?
> > > > >
> > > > > I think you're right.  This is a long-standing screwup in
> Linux.
> > > > > IMHO, ACPI resources should be parsed and reserved by the ACPI
> > > core,
> > > > > before any PCI resource management (since PCI host bridges are
> > > > > represented in ACPI).  But historically PCI devices have
> enumerated
> > > > > before ACPI got involved.  And the ACPI core doesn't really pay
> > > > > attention to _CRS for most devices (with the exception of
> PNP0C02).
> > > > >
> > > > > IMO the PNP0C02 code in drivers/pnp/system.c should really be
> done
> > > in
> > > > > the ACPI core for all ACPI devices, similar to the way the PCI
> core
> > > > > reserves BAR space for all PCI devices, even if we don't have
> > > drivers
> > > > > for them.  I've tried to fix this in the past, but it is really
> a
> > > > > nightmare to unravel everything.
> > > > >
> > > > > Because the ACPI core doesn't reserve resources for the _CRS of
> all
> > > > > ACPI devices, we're already vulnerable to the problem of
> placing a
> > > > > device on top of another ACPI device.  We don't see problems
> > > because
> > > > > on x86, at least, most ACPI devices are already configured by
> the
> > > BIOS
> > > > > to be enabled and non-overlapping.  But x86 has the advantage
> of
> > > > > having extensive test coverage courtesy of Windows, and as long
> as
> > > > > _CRS has the right stuff in it, we at least have the potential
> of
> > > > > fixing problems in Linux.
> > > >
> > > > ...
> > > > By "fixing problems in Linux" above, you mean that, given that we
> > > > do have a validated _CRS space, we can request/reserve the region
> the
> > > _CRS
> > > > reports to prevent assigning those resources to other devices,
> > > correct ?
> > >
> > > Yes.
> > >
> > > I think part of what makes this difficult in Linux is that the
> > > resource tree is too strict about overlapping resources.  We get
> > > address space information from E820 (on x86), static ACPI tables
> like
> > > MCFG, and dynamic things like ACPI _CRS.  There's no real
> requirement
> > > that the BIOS should make all these consistent, but yet we try to
> jam
> > > it all into the same resource tree.
> > >
> > > For example, E820 might tell us that range A is reserved and
> > > unavailable to Linux.  We stick it in the resource tree.  Then we
> > > might have a _CRS that tells us about range B.  We *want* to put
> range
> > > B in the resource tree, but if B overlaps part of A, the insert
> will
> > > fail.
> > >
> > > All we really need from E820 is the information that "you can't put
> > > devices in A".  We don't need to enforce any relationship between A
> > > and B, but the current resource tree imposes unnecessary
> hierarchical
> > > requirements.
> > >
> > > I think issues like this are the biggest reason why the ACPI core
> > > doesn't reserve all _CRS space early on (Rafael may correct me
> here).
> > >
> > > > > If the platform doesn't report resource usage correctly on ARM,
> we
> > > may
> > > > > not find problems (because we don't have the Windows test
> suite)
> > > and
> > > > > if we have resource assignment problems because _CRS is
> lacking,
> > > we'll
> > > > > have no way to fix them.
> > > >
> > > > And I think here you mean we can't prevent assigning resource
> space
> > > to
> > > > devices that do not necessarily own it because since some devices
> > > _CRS
> > > > are borked/missing we have no way to detect the address space
> > > allocated
> > > > to them and we may end up with resources conflicts.
> > >
> > > The ACPI core currently doesn't reserve the space consumed by ACPI
> > > devices.  Some drivers, e.g., for PNP0C02 (motherboard) and PNP0A03
> > > (PCI host bridge), do reserve their space, but the core itself does
> > > not.
> > >
> > > If we have drivers for all the ACPI devices, those drivers will
> > > probably use _CRS and reserve the space, and we'll probably notice
> any
> > > _CRS errors.  But if we don't have drivers, e.g., for performance
> > > monitors or other non-essential things, nothing will use _CRS, and
> > > nothing will reserve the space used by the device, and it's hard to
> > > find errors.
> > >
> > > If we ever assign top-level resources, there's nothing to prevent
> us
> > > from creating a conflict.  The only reason we don't trip over this
> is
> > > that we usually don't assign top-level resources because firmware
> does
> > > it for us.
> >
> > It seems that in this thread we have touched quite few issues.
> >
> > First is how to describe a PCI host controller config space resource:
> > as you highlighted before mentioning the specs, these resources
> should
> > be marked as "PNP0C02"; therefore I guess the current Nowicki
> patchset
> > must be reworked to check the resources to be motherboard reserved
> when
> > parsing the MCFG table.
> 
> I think checking whether MCFG resources are reserved by motherboard
> ("PNP0C02") devices is a hack that was added on x86 because there were
> issues getting ECAM to work reliably.  The theory at the time was that
> the problem was BIOS bugs.  I don't know whether that's actually true
> or not.
> 
> I'm not convinced that checking for PNP0C02 resources is something we
> should do in generic code.  MCFG is a static table, and I don't think
> we should add dependencies on the ACPI namespace, because the point of
> static tables is to describe things we might need before the namespace
> is available.

Ok fine, then the current Nowicki patchset is good from this perspective

> 
> > Also with respect to the ACPI table for my specific PCIe controller I
> > would use the following approach:
> >
> > 	// PCIe Root bus
> > 	Device (PCI1)
> > 	{
> > 		Name (_HID, "HISI0080") // PCI Express Root Bridge
> > 		Name (_CID, "PNP0A03") // Compatible PCI Root Bridge
> > 		Name(_SEG, 1) // Segment of this Root complex
> > 		Name(_BBN, 0) // Base Bus Number
> > 		Name(_CCA, 1)
> > 		Method (_CRS, 0, Serialized) { // Root complex resources
> > 			Name (RBUF, ResourceTemplate () {
> > 				WordBusNumber ( // Bus numbers assigned to this
> root
> > 					ResourceProducer, MinFixed, MaxFixed,
> PosDecode,
> > 					0, // AddressGranularity
> > 					0, // AddressMinimum - Minimum Bus Number
> > 					63, // AddressMaximum - Maximum Bus
> Number
> > 					0, // AddressTranslation - Set to 0
> > 					64 // RangeLength - Number of Busses
> > 				)
> > 				QWordMemory ( // 64-bit BAR Windows
> > 					ResourceProducer,
> > 					PosDecode,
> > 					MinFixed,
> > 					MaxFixed,
> > 					Cacheable,
> > 					ReadWrite,
> > 					0x0000000000000000, // Granularity
> > 					0x00000000b0000000, // Min Base Address
> pci address
> > 					0x00000000bbfeffff, // Max Base Address
> > 					0x0000021f54000000, // Translate
> > 					0x000000000bff0000 // Length
> > 				)
> > 				QWordIO (
> > 					ResourceProducer,
> > 					MinFixed,
> > 					MaxFixed,
> > 					PosDecode,
> > 					EntireRange,
> > 					0x0000000000000000, // Granularity
> > 					0x0000000000000000, // Min Base Address
> > 					0x000000000000ffff, // Max Base Address
> > 					0x000002200fff0000, // Translate
> > 					0x0000000000010000 // Length
> > 				)
> > 			}) // Name(RBUF)
> > 			Return (RBUF)
> > 		} // Method(_CRS)
> > 		Device (RES0)
> > 		{
> > 			Name (_HID, "HISI0081") // HiSi PCIe RC config base
> address
> > 			Name (_CID, "PNP0C02")  // Motherboard reserved
> resource
> > 			Name (_CRS, ResourceTemplate (){
> >                          Memory32Fixed (ReadWrite, 0xb0080000 ,
> 0x10000)
> > 			})
> >
> > 		}
> >
> > So in the table above I have a sub-device under the RC to pass the
> address
> > for the RC config space (the rest of the config space addresses for
> bus 1
> > to 63 are passed in the MCFG). As you can see the device _CID is
> PNP0C02
> > As per ACPI specs.
> > Do you see anything wrong with this approach?
> 
> It looks OK to me.  The PCI Firmware spec r3.0, sec 4.1.2, does say
> that the motherboard resource would usually appear at the root of the
> namespace (under \_SB).  That's not an absolute statement, but I don't
> know why it would be included at all unless the authors thought it was
> important for some reason.

Ok great so I'll start reworking my ACPI based quirk according to the
table above

Many Thanks

Gab

> 
> > The second issue is when and how to reserve HW resources. As per
> > conversation above this seems quite a tricky issue and probably needs
> to
> > consider different aspects...
> 
> I don't think resources should be reserved based on MCFG.  Maybe we
> need to reserve MCFG areas on x86 for legacy reasons, but I don't
> think we should do it on arm64.
> 
> > I was wondering if we can take a gradual approach; maybe for the time
> being
> > we can just rework Nowicki patchset to check the MCFG resources to be
> > motherboard reserved and later on we can make an effort to fix the
> resource
> > insertion mechanism making sure that it works right on both x86 and
> ARM.
> >
> > What do you think about?
> >
> > Many Thanks
> >
> > Gab
> >
> > >
> > > > > > As per last Tomasz's patchset, we claim and assign unassigned
> PCI
> > > > > > resources upon ACPI PCI host bridge probing (which happens at
> > > > > > subsys_initcall time, courtesy of ACPI current code); at that
> > > time the
> > > > > > kernel did not even register the PNP0c02 driver
> > > (drivers/pnp/system.c)
> > > > > > (it does that at fs_initcall). On the other hand, we insert
> MCFG
> > > > > > regions into the resource tree upon MCFG parsing, so I do not
> > > > > > see why we need to rely on PNP0c02 to do that for us
> (granted,
> > > the
> > > > > > mechanism is part of the PCI fw specs, which are x86 centric
> > > anyway
> > > > > > ie we can't certainly rely on Int15 e820 to detect reserved
> > > memory
> > > > > > on ARM :D)
> > > > > >
> > > > > > There is lots of legacy x86 here and Bjorn definitely has
> more
> > > > > > visibility into that than I have, the ARM world must
> understand
> > > > > > how this works to make sure we have an agreement.
> > > > >
> > > > > As you say, there is lots of unpleasant x86 legacy here.
> Possibly
> > > ARM
> > > > > has a chance to clean this up and do it more sanely; I'm not
> sure
> > > > > whether it's feasible to reverse the ACPI/PCI init order there
> or
> > > not.
> > > > >
> > > > > Rafael, any thoughts on this whole thing?
> > > > >
> > > > > Bjorn
> > > > >
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe linux-
> acpi"
> > > in
> > > > the body of a message to majordomo@vger.kernel.org
> > > > More majordomo info at  http://vger.kernel.org/majordomo-
> info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-acpi"
> in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index d69f436..f184c3e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8412,6 +8412,7 @@  F:	Documentation/devicetree/bindings/pci/hisilicon-pcie.txt
 F:	drivers/pci/host/pcie-hisi.h
 F:	drivers/pci/host/pcie-hisi.c
 F:	drivers/pci/host/pcie-hisi-common.c
+F:	drivers/pci/host/pcie-hisi-acpi.c
 
 PCIE DRIVER FOR QUALCOMM MSM
 M:     Stanimir Varbanov <svarbanov@mm-sol.com>
diff --git a/drivers/pci/host/Kconfig b/drivers/pci/host/Kconfig
index 75a6054..65b1add 100644
--- a/drivers/pci/host/Kconfig
+++ b/drivers/pci/host/Kconfig
@@ -181,6 +181,14 @@  config PCI_HISI
 	  Say Y here if you want PCIe controller support on HiSilicon
 	  Hip05 and Hip06 SoCs
 
+config PCI_HISI_ACPI
+	depends on ACPI
+	bool "HiSilicon Hip05 and Hip06 SoCs ACPI PCIe controllers"
+	select ACPI_PCI_HOST_GENERIC
+	help
+	  Say Y here if you want ACPI PCIe controller support on HiSilicon
+	  Hip05 and Hip06 SoCs
+
 config PCIE_QCOM
 	bool "Qualcomm PCIe controller"
 	depends on ARCH_QCOM && OF
diff --git a/drivers/pci/host/Makefile b/drivers/pci/host/Makefile
index 8c93c0f..57e4379 100644
--- a/drivers/pci/host/Makefile
+++ b/drivers/pci/host/Makefile
@@ -21,4 +21,5 @@  obj-$(CONFIG_PCIE_IPROC_BCMA) += pcie-iproc-bcma.o
 obj-$(CONFIG_PCIE_ALTERA) += pcie-altera.o
 obj-$(CONFIG_PCIE_ALTERA_MSI) += pcie-altera-msi.o
 obj-$(CONFIG_PCI_HISI) += pcie-hisi.o pcie-hisi-common.o
+obj-$(CONFIG_PCI_HISI_ACPI) += pcie-hisi-acpi.o pcie-hisi-common.o
 obj-$(CONFIG_PCIE_QCOM) += pcie-qcom.o
diff --git a/drivers/pci/host/pcie-hisi-acpi.c b/drivers/pci/host/pcie-hisi-acpi.c
new file mode 100644
index 0000000..3605260
--- /dev/null
+++ b/drivers/pci/host/pcie-hisi-acpi.c
@@ -0,0 +1,188 @@ 
+/*
+ * PCIe host controller driver for HiSilicon HipXX SoCs
+ *
+ * Copyright (C) 2016 HiSilicon Co., Ltd. http://www.hisilicon.com
+ *
+ * Author: Dongdong Liu <liudongdong3@huawei.com>
+ *         Gabriele Paoloni <gabriele.paoloni@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/interrupt.h>
+#include <linux/acpi.h>
+#include <linux/ecam.h>
+#include <linux/pci.h>
+#include <linux/pci-acpi.h>
+#include "pcie-hisi.h"
+
+#define GET_PCIE_LINK_STATUS  0x0
+
+/* uuid 6d30f553-836c-408e-b6ad-45bccc957949 */
+const u8 hisi_pcie_acpi_dsm_uuid[] = {
+	0x53, 0xf5, 0x30, 0x6d, 0x6c, 0x83, 0x8e, 0x40,
+	0xb6, 0xad, 0x45, 0xbc, 0xcc, 0x95, 0x79, 0x49
+};
+
+static const struct acpi_device_id hisi_pcie_ids[] = {
+	{"HISI0080", 0},
+	{"", 0},
+};
+
+static int hisi_pcie_get_addr(struct acpi_pci_root *root, const char *name,
+				void __iomem **addr)
+{
+	struct acpi_device *device;
+	u64 base;
+	u64 size;
+	u32 buf[4];
+	int ret;
+
+	device =  root->device;
+	ret = fwnode_property_read_u32_array(&device->fwnode, name,
+					buf, ARRAY_SIZE(buf));
+	if (ret) {
+		dev_err(&device->dev, "can't get %s\n", name);
+		return ret;
+	}
+
+	base = ((u64)buf[0] << 32) | buf[1];
+	size =  ((u64)buf[2] << 32) | buf[3];
+	*addr = devm_ioremap(&device->dev, base, size);
+	if (!(*addr)) {
+		dev_err(&device->dev, "error with ioremap\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+
+static int hisi_pcie_link_up_acpi(struct acpi_pci_root *root)
+{
+	u32 val;
+	struct acpi_device *device;
+	union acpi_object *obj;
+
+	device = root->device;
+	obj = acpi_evaluate_dsm(device->handle,
+		hisi_pcie_acpi_dsm_uuid, 0,
+		GET_PCIE_LINK_STATUS, NULL);
+
+	if (!obj  ||  obj->type != ACPI_TYPE_INTEGER)  {
+		dev_err(&device->dev, "can't get link status from _DSM\n");
+		return 0;
+	}
+	val = obj->integer.value;
+
+	return ((val & PCIE_LTSSM_STATE_MASK) == PCIE_LTSSM_LINKUP_STATE);
+
+}
+
+/*
+ * Retrieve rc_dbi base and size from _DSD
+ * Name (_DSD, Package () {
+ *	ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
+ *	Package () {
+ *	Package () {"rc-dbi", Package () { 0x0, 0xb0080000, 0x0, 0x10000 }},
+ *	}
+ *	})
+ */
+static int hisi_pcie_init(struct acpi_pci_root *root)
+{
+	int ret;
+	struct acpi_device *device;
+	void __iomem *reg_base;
+
+	device =  root->device;
+	ret = hisi_pcie_get_addr(root, "rc-dbi", &reg_base);
+	if (ret) {
+		dev_err(&device->dev, "can't get rc-dbi\n");
+		return ret;
+	}
+
+	root->sysdata = reg_base;
+	return 0;
+}
+
+static int hisi_pcie_match(struct pci_mcfg_fixup *fixup,
+			struct acpi_pci_root *root)
+{
+	int ret;
+	struct acpi_device *device;
+
+	device = root->device;
+	ret = acpi_match_device_ids(device, hisi_pcie_ids);
+	if (ret)
+		return 0;
+
+	ret = hisi_pcie_init(root);
+	if (ret)
+		dev_warn(&device->dev, "hisi pcie init fail\n");
+
+	return 1;
+}
+
+static int hisi_pcie_acpi_valid_config(struct acpi_pci_root *root,
+				struct pci_bus *bus, int dev)
+{
+	/* If there is no link, then there is no device */
+	if (bus->number != root->secondary.start) {
+		if (!hisi_pcie_link_up_acpi(root))
+			return 0;
+	}
+
+	/* access only one slot on each root port */
+	if (bus->number == root->secondary.start && dev > 0)
+		return 0;
+
+	/*
+	 * do not read more than one device on the bus directly attached
+	 * to RC's (Virtual Bridge's) DS side.
+	 */
+	if (bus->primary == root->secondary.start && dev > 0)
+		return 0;
+
+	return 1;
+}
+
+static int hisi_pcie_acpi_rd_conf(struct pci_bus *bus, u32 devfn, int where,
+		int size, u32 *val)
+{
+	struct acpi_pci_root *root = bus->sysdata;
+	void __iomem *reg_base = root->sysdata;
+
+	if (hisi_pcie_acpi_valid_config(root, bus, PCI_SLOT(devfn)) == 0)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	if (bus->number == root->secondary.start)
+		return hisi_pcie_common_cfg_read(reg_base, where, size, val);
+
+	return pci_generic_config_read(bus, devfn, where, size, val);
+}
+
+static int hisi_pcie_acpi_wr_conf(struct pci_bus *bus, u32 devfn,
+		int where, int size, u32 val)
+{
+	struct acpi_pci_root *root = bus->sysdata;
+	void __iomem *reg_base = root->sysdata;
+
+	if (hisi_pcie_acpi_valid_config(root, bus, PCI_SLOT(devfn)) == 0)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	if (bus->number == root->secondary.start)
+		return hisi_pcie_common_cfg_write(reg_base, where, size, val);
+
+	return pci_generic_config_write(bus, devfn, where, size, val);
+}
+
+struct pci_ops hisi_pcie_acpi_ops = {
+	.map_bus = pci_mcfg_dev_base,
+	.read = hisi_pcie_acpi_rd_conf,
+	.write = hisi_pcie_acpi_wr_conf,
+};
+
+
+DECLARE_ACPI_MCFG_FIXUP(NULL, hisi_pcie_match, &hisi_pcie_acpi_ops,
+	PCI_MCFG_DOMAIN_ANY, PCI_MCFG_BUS_ANY);
diff --git a/drivers/pci/host/pcie-hisi.c b/drivers/pci/host/pcie-hisi.c
index d3e2047..d1ef346 100644
--- a/drivers/pci/host/pcie-hisi.c
+++ b/drivers/pci/host/pcie-hisi.c
@@ -23,8 +23,6 @@ 
 #include "pcie-designware.h"
 #include "pcie-hisi.h"
 
-#define PCIE_LTSSM_LINKUP_STATE				0x11
-#define PCIE_LTSSM_STATE_MASK				0x3F
 #define PCIE_SUBCTRL_SYS_STATE4_REG			0x6818
 #define PCIE_SYS_STATE4						0x31c
 #define PCIE_HIP06_CTRL_OFF					0x1000
diff --git a/drivers/pci/host/pcie-hisi.h b/drivers/pci/host/pcie-hisi.h
index 29e0790..45cf0fd 100644
--- a/drivers/pci/host/pcie-hisi.h
+++ b/drivers/pci/host/pcie-hisi.h
@@ -14,6 +14,8 @@ 
 #ifndef PCIE_HISI_H_
 #define PCIE_HISI_H_
 
+#define PCIE_LTSSM_LINKUP_STATE				0x11
+#define PCIE_LTSSM_STATE_MASK				0x3F
 
 int hisi_pcie_common_cfg_read(void __iomem *reg_base, int where, int size,
 			      u32 *val);