diff mbox

[v2,3/7] PCI: aardvark: set host and device to the same MAX payload size

Message ID 20170928125838.11887-4-thomas.petazzoni@free-electrons.com (mailing list archive)
State New, archived
Headers show

Commit Message

Thomas Petazzoni Sept. 28, 2017, 12:58 p.m. UTC
From: Victor Gu <xigu@marvell.com>

Since the Aardvark does not implement a PCIe root bus, the Linux PCIe
subsystem will not align the MAX payload size between the host and the
device. This patch ensures that the host and device have the same MAX
payload size, fixing a number of problems with various PCIe devices.

This is part of fixing bug
https://bugzilla.kernel.org/show_bug.cgi?id=196339, this commit was
reported as the user to be important to get a Intel 7260 mini-PCIe
WiFi card working.

Fixes: Fixes: 8c39d710363c1 ("PCI: aardvark: Add Aardvark PCI host controller driver")
Signed-off-by: Victor Gu <xigu@marvell.com>
Reviewed-by: Evan Wang <xswang@marvell.com>
Reviewed-by: Nadav Haklai <nadavh@marvell.com>
[Thomas: tweak commit log.]
Signed-off-by: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
---
 drivers/pci/host/pci-aardvark.c | 60 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 59 insertions(+), 1 deletion(-)

Comments

Bjorn Helgaas Oct. 5, 2017, 5:31 p.m. UTC | #1
On Thu, Sep 28, 2017 at 02:58:34PM +0200, Thomas Petazzoni wrote:
> From: Victor Gu <xigu@marvell.com>
> 
> Since the Aardvark does not implement a PCIe root bus, 

What exactly do you mean by "does not implement a PCIe root bus"?  I
assume there is still a hierarchy of PCI buses, and I assume the
hierarchy has a top-most ("root") bus.

Maybe there's no Root Port?  There are other systems that don't have
Root Ports, and we've made changes to accommodate that, e.g.,

http://git.kernel.org/cgit/linux/kernel/git/helgaas/pci.git/commit/?id=1b8a6079015f

If we can make the generic code work for whatever the Aardvark
topology is, that would be better than adding Aardvark-specific code
here.

> the Linux PCIe
> subsystem will not align the MAX payload size between the host and the
> device. This patch ensures that the host and device have the same MAX
> payload size, fixing a number of problems with various PCIe devices.
> 
> This is part of fixing bug
> https://bugzilla.kernel.org/show_bug.cgi?id=196339, this commit was
> reported as the user to be important to get a Intel 7260 mini-PCIe
> WiFi card working.
> 
> Fixes: Fixes: 8c39d710363c1 ("PCI: aardvark: Add Aardvark PCI host controller driver")
> Signed-off-by: Victor Gu <xigu@marvell.com>
> Reviewed-by: Evan Wang <xswang@marvell.com>
> Reviewed-by: Nadav Haklai <nadavh@marvell.com>
> [Thomas: tweak commit log.]
> Signed-off-by: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
> ---
>  drivers/pci/host/pci-aardvark.c | 60 ++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 59 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/pci/host/pci-aardvark.c b/drivers/pci/host/pci-aardvark.c
> index af7a9c4a61a4..c8a97bad6c4c 100644
> --- a/drivers/pci/host/pci-aardvark.c
> +++ b/drivers/pci/host/pci-aardvark.c
> @@ -30,8 +30,10 @@
>  #define PCIE_CORE_DEV_CTRL_STATS_REG				0xc8
>  #define     PCIE_CORE_DEV_CTRL_STATS_RELAX_ORDER_DISABLE	(0 << 4)
>  #define     PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT	5
> +#define     PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ		0x2
>  #define     PCIE_CORE_DEV_CTRL_STATS_SNOOP_DISABLE		(0 << 11)
>  #define     PCIE_CORE_DEV_CTRL_STATS_MAX_RD_REQ_SIZE_SHIFT	12
> +#define     PCIE_CORE_MPS_UNIT_BYTE				128
>  #define PCIE_CORE_LINK_CTRL_STAT_REG				0xd0
>  #define     PCIE_CORE_LINK_L0S_ENTRY				BIT(0)
>  #define     PCIE_CORE_LINK_TRAINING				BIT(5)
> @@ -297,7 +299,8 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie)
>  
>  	/* Set PCIe Device Control and Status 1 PF0 register */
>  	reg = PCIE_CORE_DEV_CTRL_STATS_RELAX_ORDER_DISABLE |
> -		(7 << PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT) |
> +		(PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ <<
> +		 PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT) |
>  		PCIE_CORE_DEV_CTRL_STATS_SNOOP_DISABLE |
>  		PCIE_CORE_DEV_CTRL_STATS_MAX_RD_REQ_SIZE_SHIFT;
>  	advk_writel(pcie, reg, PCIE_CORE_DEV_CTRL_STATS_REG);
> @@ -879,6 +882,58 @@ static int advk_pcie_parse_request_of_pci_ranges(struct advk_pcie *pcie)
>  	return err;
>  }
>  
> +static int advk_pcie_find_smpss(struct pci_dev *dev, void *data)
> +{
> +	u8 *smpss = data;
> +
> +	if (!dev)
> +		return 0;
> +
> +	if (!pci_is_pcie(dev))
> +		return 0;
> +
> +	if (*smpss > dev->pcie_mpss)
> +		*smpss = dev->pcie_mpss;
> +
> +	return 0;
> +}
> +
> +static int advk_pcie_bus_configure_mps(struct pci_dev *dev, void *data)
> +{
> +	int mps;
> +
> +	if (!dev)
> +		return 0;
> +
> +	if (!pci_is_pcie(dev))
> +		return 0;
> +
> +	mps = PCIE_CORE_MPS_UNIT_BYTE << *(u8 *)data;
> +	pcie_set_mps(dev, mps);
> +
> +	return 0;
> +}
> +
> +static void advk_pcie_configure_mps(struct pci_bus *bus, struct advk_pcie *pcie)
> +{
> +	u8 smpss = PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ;
> +	u32 reg;
> +
> +	/* Find the minimal supported MAX payload size */
> +	advk_pcie_find_smpss(bus->self, &smpss);
> +	pci_walk_bus(bus, advk_pcie_find_smpss, &smpss);
> +
> +	/* Configure RC MAX payload size */
> +	reg = advk_readl(pcie, PCIE_CORE_DEV_CTRL_STATS_REG);
> +	reg &= ~PCI_EXP_DEVCTL_PAYLOAD;
> +	reg |= smpss << PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT;
> +	advk_writel(pcie, reg, PCIE_CORE_DEV_CTRL_STATS_REG);
> +
> +	/* Configure device MAX payload size */
> +	advk_pcie_bus_configure_mps(bus->self, &smpss);
> +	pci_walk_bus(bus, advk_pcie_bus_configure_mps, &smpss);
> +}
> +
>  static int advk_pcie_probe(struct platform_device *pdev)
>  {
>  	struct device *dev = &pdev->dev;
> @@ -950,6 +1005,9 @@ static int advk_pcie_probe(struct platform_device *pdev)
>  	list_for_each_entry(child, &bus->children, node)
>  		pcie_bus_configure_settings(child);
>  
> +	/* Configure the MAX pay load size */
> +	advk_pcie_configure_mps(bus, pcie);
> +
>  	pci_bus_add_devices(bus);
>  	return 0;
>  }
> -- 
> 2.13.5
>
Thomas Petazzoni Jan. 9, 2018, 3:39 p.m. UTC | #2
Hello Bjorn,

On Thu, 5 Oct 2017 12:31:02 -0500, Bjorn Helgaas wrote:
> On Thu, Sep 28, 2017 at 02:58:34PM +0200, Thomas Petazzoni wrote:
> > From: Victor Gu <xigu@marvell.com>
> > 
> > Since the Aardvark does not implement a PCIe root bus,   
> 
> What exactly do you mean by "does not implement a PCIe root bus"?  I
> assume there is still a hierarchy of PCI buses, and I assume the
> hierarchy has a top-most ("root") bus.
> 
> Maybe there's no Root Port?  There are other systems that don't have
> Root Ports, and we've made changes to accommodate that, e.g.,
> 
> http://git.kernel.org/cgit/linux/kernel/git/helgaas/pci.git/commit/?id=1b8a6079015f

I'm trying to get back (finally) to this topic. Unfortunately, your
branch has been rebased, and this commit no longer exists. Do you have
an updated pointer about what you suggest to use for systems that don't
have Root Ports ?

Thanks!

Thomas
Bjorn Helgaas Jan. 9, 2018, 10:14 p.m. UTC | #3
[+cc Lorenzo, who maintains this area now]

On Tue, Jan 09, 2018 at 04:39:18PM +0100, Thomas Petazzoni wrote:
> Hello Bjorn,
> 
> On Thu, 5 Oct 2017 12:31:02 -0500, Bjorn Helgaas wrote:
> > On Thu, Sep 28, 2017 at 02:58:34PM +0200, Thomas Petazzoni wrote:
> > > From: Victor Gu <xigu@marvell.com>
> > > 
> > > Since the Aardvark does not implement a PCIe root bus,   
> > 
> > What exactly do you mean by "does not implement a PCIe root bus"?  I
> > assume there is still a hierarchy of PCI buses, and I assume the
> > hierarchy has a top-most ("root") bus.
> > 
> > Maybe there's no Root Port?  There are other systems that don't have
> > Root Ports, and we've made changes to accommodate that, e.g.,
> > 
> > http://git.kernel.org/cgit/linux/kernel/git/helgaas/pci.git/commit/?id=1b8a6079015f
> 
> I'm trying to get back (finally) to this topic. Unfortunately, your
> branch has been rebased, and this commit no longer exists. Do you have
> an updated pointer about what you suggest to use for systems that don't
> have Root Ports ?

Sorry, about that; here's the upstream commit, FWIW:

http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=ee8bdfb6568d

If the OS sees no Root Port (I haven't seen the full lspci or kernel
enumeration log, so I don't know what the topology actually is), I
assume you probably have some Endpoints that have valid Link
Capabilities, Control, and Status registers.  Those refer to the
downstream end of the Link, and the Root Port would normally have
corresponding registers that refer to the upstream end.

The lack of the Root Port means we can't do any management of those
top-level Links, so no ASPM, no MPS, no link width/speed management,
etc.

I see that advk_pcie_probe() calls pcie_bus_configure_settings() like
all other drivers, and ideally we would try to make that work just
like it does on other platforms.  The code is:

  pci_scan_root_bus_bridge(bridge);
  bus = bridge->bus;
  list_for_each_entry(child, &bus->children, node)
    pcie_bus_configure_settings(child);

This MPS setting is all strictly in the PCIe domain (it's not in the
Aardvark domain and shouldn't have any Aardvark dependencies), so I
would expect the core code to just work, modulo some possible
confusion if it expects to find a Root Port but doesn't.

Can you collect "lspci -vv" output and details about what currently
goes wrong?  Then we'd have something more concrete to talk about.

Bjorn
Thomas Petazzoni Jan. 12, 2018, 10:14 a.m. UTC | #4
Hello,

On Tue, 9 Jan 2018 16:14:36 -0600, Bjorn Helgaas wrote:

> > I'm trying to get back (finally) to this topic. Unfortunately, your
> > branch has been rebased, and this commit no longer exists. Do you have
> > an updated pointer about what you suggest to use for systems that don't
> > have Root Ports ?  
> 
> Sorry, about that; here's the upstream commit, FWIW:
> 
> http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=ee8bdfb6568d

Thanks. I don't see how this commit can fix our problem though, see below.

> If the OS sees no Root Port (I haven't seen the full lspci or kernel
> enumeration log, so I don't know what the topology actually is), I
> assume you probably have some Endpoints that have valid Link
> Capabilities, Control, and Status registers.  Those refer to the
> downstream end of the Link, and the Root Port would normally have
> corresponding registers that refer to the upstream end.
> 
> The lack of the Root Port means we can't do any management of those
> top-level Links, so no ASPM, no MPS, no link width/speed management,
> etc.
> 
> I see that advk_pcie_probe() calls pcie_bus_configure_settings() like
> all other drivers, and ideally we would try to make that work just
> like it does on other platforms.  The code is:
> 
>   pci_scan_root_bus_bridge(bridge);
>   bus = bridge->bus;
>   list_for_each_entry(child, &bus->children, node)
>     pcie_bus_configure_settings(child);
> 
> This MPS setting is all strictly in the PCIe domain (it's not in the
> Aardvark domain and shouldn't have any Aardvark dependencies), so I
> would expect the core code to just work, modulo some possible
> confusion if it expects to find a Root Port but doesn't.
> 
> Can you collect "lspci -vv" output and details about what currently
> goes wrong?  Then we'd have something more concrete to talk about.

With an E1000E PCIe NIC connected, the entire lspci -vvv output is:

# lspci -vv
00:00.0 Ethernet controller: Intel Corporation 82572EI Gigabit Ethernet Controller (Copper) (rev 06)
	Subsystem: Intel Corporation PRO/1000 PT Server Adapter
	Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- DisINTx+
	Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
	Latency: 0, Cache Line Size: 32 bytes
	Interrupt: pin A routed to IRQ 40
	Region 0: Memory at e8000000 (32-bit, non-prefetchable) [size=128K]
	Region 1: Memory at e8020000 (32-bit, non-prefetchable) [size=128K]
	Region 2: I/O ports at 1000 [disabled] [size=32]
	Expansion ROM at e8040000 [disabled] [size=128K]
	Capabilities: [c8] Power Management version 2
		Flags: PMEClk- DSI+ D1- D2- AuxCurrent=0mA PME(D0+,D1-,D2-,D3hot+,D3cold+)
		Status: D0 NoSoftRst- PME-Enable- DSel=0 DScale=1 PME-
	Capabilities: [d0] MSI: Enable+ Count=1/1 Maskable- 64bit+
		Address: 000000001d1f6f68  Data: 0028
	Capabilities: [e0] Express (v1) Endpoint, MSI 00
		DevCap:	MaxPayload 256 bytes, PhantFunc 0, Latency L0s <512ns, L1 <64us
			ExtTag- AttnBtn- AttnInd- PwrInd- RBE- FLReset- SlotPowerLimit 0.000W
		DevCtl:	Report errors: Correctable+ Non-Fatal+ Fatal+ Unsupported+
			RlxdOrd+ ExtTag- PhantFunc- AuxPwr- NoSnoop+
			MaxPayload 256 bytes, MaxReadReq 512 bytes
		DevSta:	CorrErr- UncorrErr- FatalErr- UnsuppReq- AuxPwr+ TransPend-
		LnkCap:	Port #0, Speed 2.5GT/s, Width x1, ASPM L0s, Exit Latency L0s <4us
			ClockPM- Surprise- LLActRep- BwNot- ASPMOptComp-
		LnkCtl:	ASPM Disabled; RCB 64 bytes Disabled- CommClk-
			ExtSynch- ClockPM- AutWidDis- BWInt- AutBWInt-
		LnkSta:	Speed 2.5GT/s, Width x1, TrErr- Train- SlotClk+ DLActive- BWMgmt- ABWMgmt-
	Capabilities: [100 v1] Advanced Error Reporting
		UESta:	DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt- RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
		UEMsk:	DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt- RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
		UESvrt:	DLP+ SDES- TLP- FCP+ CmpltTO- CmpltAbrt- UnxCmplt- RxOF+ MalfTLP+ ECRC- UnsupReq- ACSViol-
		CESta:	RxErr- BadTLP- BadDLLP- Rollover- Timeout- NonFatalErr-
		CEMsk:	RxErr- BadTLP- BadDLLP- Rollover- Timeout- NonFatalErr-
		AERCap:	First Error Pointer: 00, ECRCGenCap- ECRCGenEn- ECRCChkCap- ECRCChkEn-
			MultHdrRecCap- MultHdrRecEn- TLPPfxPres- HdrLogCap-
		HeaderLog: 00000000 00000000 00000000 00000000
	Capabilities: [140 v1] Device Serial Number 00-1b-21-ff-ff-c1-c4-fe
	Kernel driver in use: e1000e

I.e, there is no Root Port. Therefore, I don't see how the kernel
can know what is the maximum allowed payload size of the PCIe
controller, nor how to adjust the payload size to use. Same for the L0s
configuration.

This is why we need those changes, one to update the PCIe controller
MPS according to the Maximum Payload Size acceptable by the endpoint,
and one to disable L0s entirely to avoid issues with non-L0s compliant
devices.

Does that make more sense ?

Best regards,

Thomas
Bjorn Helgaas Jan. 12, 2018, 2:40 p.m. UTC | #5
On Fri, Jan 12, 2018 at 11:14:48AM +0100, Thomas Petazzoni wrote:
> Hello,
> 
> On Tue, 9 Jan 2018 16:14:36 -0600, Bjorn Helgaas wrote:
> 
> > > I'm trying to get back (finally) to this topic. Unfortunately, your
> > > branch has been rebased, and this commit no longer exists. Do you have
> > > an updated pointer about what you suggest to use for systems that don't
> > > have Root Ports ?  
> > 
> > Sorry, about that; here's the upstream commit, FWIW:
> > 
> > http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=ee8bdfb6568d
> 
> Thanks. I don't see how this commit can fix our problem though, see below.

Sorry, I didn't mean that commit would fix your problem.  It's just an
example of another case where generic code incorrectly assumed a Root
Port would always be present.

> > If the OS sees no Root Port (I haven't seen the full lspci or kernel
> > enumeration log, so I don't know what the topology actually is), I
> > assume you probably have some Endpoints that have valid Link
> > Capabilities, Control, and Status registers.  Those refer to the
> > downstream end of the Link, and the Root Port would normally have
> > corresponding registers that refer to the upstream end.
> > 
> > The lack of the Root Port means we can't do any management of those
> > top-level Links, so no ASPM, no MPS, no link width/speed management,
> > etc.
> > 
> > I see that advk_pcie_probe() calls pcie_bus_configure_settings() like
> > all other drivers, and ideally we would try to make that work just
> > like it does on other platforms.  The code is:
> > 
> >   pci_scan_root_bus_bridge(bridge);
> >   bus = bridge->bus;
> >   list_for_each_entry(child, &bus->children, node)
> >     pcie_bus_configure_settings(child);
> > 
> > This MPS setting is all strictly in the PCIe domain (it's not in the
> > Aardvark domain and shouldn't have any Aardvark dependencies), so I
> > would expect the core code to just work, modulo some possible
> > confusion if it expects to find a Root Port but doesn't.
> > 
> > Can you collect "lspci -vv" output and details about what currently
> > goes wrong?  Then we'd have something more concrete to talk about.
> 
> With an E1000E PCIe NIC connected, the entire lspci -vvv output is:
> 
> # lspci -vv
> 00:00.0 Ethernet controller: Intel Corporation 82572EI Gigabit Ethernet Controller (Copper) (rev 06)
> 	Subsystem: Intel Corporation PRO/1000 PT Server Adapter
> 	Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- DisINTx+
> 	Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
> 	Latency: 0, Cache Line Size: 32 bytes
> 	Interrupt: pin A routed to IRQ 40
> 	Region 0: Memory at e8000000 (32-bit, non-prefetchable) [size=128K]
> 	Region 1: Memory at e8020000 (32-bit, non-prefetchable) [size=128K]
> 	Region 2: I/O ports at 1000 [disabled] [size=32]
> 	Expansion ROM at e8040000 [disabled] [size=128K]
> 	Capabilities: [c8] Power Management version 2
> 		Flags: PMEClk- DSI+ D1- D2- AuxCurrent=0mA PME(D0+,D1-,D2-,D3hot+,D3cold+)
> 		Status: D0 NoSoftRst- PME-Enable- DSel=0 DScale=1 PME-
> 	Capabilities: [d0] MSI: Enable+ Count=1/1 Maskable- 64bit+
> 		Address: 000000001d1f6f68  Data: 0028
> 	Capabilities: [e0] Express (v1) Endpoint, MSI 00
> 		DevCap:	MaxPayload 256 bytes, PhantFunc 0, Latency L0s <512ns, L1 <64us
> 			ExtTag- AttnBtn- AttnInd- PwrInd- RBE- FLReset- SlotPowerLimit 0.000W
> 		DevCtl:	Report errors: Correctable+ Non-Fatal+ Fatal+ Unsupported+
> 			RlxdOrd+ ExtTag- PhantFunc- AuxPwr- NoSnoop+
> 			MaxPayload 256 bytes, MaxReadReq 512 bytes
> 		DevSta:	CorrErr- UncorrErr- FatalErr- UnsuppReq- AuxPwr+ TransPend-
> 		LnkCap:	Port #0, Speed 2.5GT/s, Width x1, ASPM L0s, Exit Latency L0s <4us
> 			ClockPM- Surprise- LLActRep- BwNot- ASPMOptComp-
> 		LnkCtl:	ASPM Disabled; RCB 64 bytes Disabled- CommClk-
> 			ExtSynch- ClockPM- AutWidDis- BWInt- AutBWInt-
> 		LnkSta:	Speed 2.5GT/s, Width x1, TrErr- Train- SlotClk+ DLActive- BWMgmt- ABWMgmt-
> 	Capabilities: [100 v1] Advanced Error Reporting
> 		UESta:	DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt- RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
> 		UEMsk:	DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt- RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
> 		UESvrt:	DLP+ SDES- TLP- FCP+ CmpltTO- CmpltAbrt- UnxCmplt- RxOF+ MalfTLP+ ECRC- UnsupReq- ACSViol-
> 		CESta:	RxErr- BadTLP- BadDLLP- Rollover- Timeout- NonFatalErr-
> 		CEMsk:	RxErr- BadTLP- BadDLLP- Rollover- Timeout- NonFatalErr-
> 		AERCap:	First Error Pointer: 00, ECRCGenCap- ECRCGenEn- ECRCChkCap- ECRCChkEn-
> 			MultHdrRecCap- MultHdrRecEn- TLPPfxPres- HdrLogCap-
> 		HeaderLog: 00000000 00000000 00000000 00000000
> 	Capabilities: [140 v1] Device Serial Number 00-1b-21-ff-ff-c1-c4-fe
> 	Kernel driver in use: e1000e
> 
> I.e, there is no Root Port. Therefore, I don't see how the kernel
> can know what is the maximum allowed payload size of the PCIe
> controller, nor how to adjust the payload size to use. Same for the L0s
> configuration.

The Device Control MPS field defaults to 128 bytes.  Generic software
can only change that default if it knows that every element that might
receive a packet from the device can handle it.  In this case, we have
no information about what the invisible Root Port can handle, so I
would argue that we cannot change MPS.

In the lspci above, MPS is set to 256 bytes.  If that was done by
firmware, it might be safe because it knows things about the Root Port
that Linux doesn't.  But I don't think the Linux PCI core could set it
to 256.

ASPM L0s is similar.  We should only enable L0s if we can tell that
both ends of the link support it.  If there's no Root Port, we don't
have any ASPM capability information for the upstream end of the link,
so we shouldn't enable ASPM at all.

> This is why we need those changes, one to update the PCIe controller
> MPS according to the Maximum Payload Size acceptable by the endpoint,
> and one to disable L0s entirely to avoid issues with non-L0s compliant
> devices.

The generic core code should perform minimal, guaranteed-to-work
configuration using the least information and fewest assumptions
possible.  That may not lead to optimal performance, but it should at
least be functional.  This should work even if there is no Root Port.

Once we have that figured out, then we can worry about whether we can
or should do platform-specific tweaks to improve performance, e.g.,
increase MPS if we know Root Port capabilities implicitly.

I had the impression that these patches were required for correct
functionality, not just to improve performance.  But maybe I
misunderstood?

Bjorn
Thomas Petazzoni Jan. 12, 2018, 3:46 p.m. UTC | #6
Hello Bjorn,

Thanks again for this discussion!

On Fri, 12 Jan 2018 08:40:24 -0600, Bjorn Helgaas wrote:

> Sorry, I didn't mean that commit would fix your problem.  It's just an
> example of another case where generic code incorrectly assumed a Root
> Port would always be present.

OK, understood.

> > I.e, there is no Root Port. Therefore, I don't see how the kernel
> > can know what is the maximum allowed payload size of the PCIe
> > controller, nor how to adjust the payload size to use. Same for the L0s
> > configuration.  
> 
> The Device Control MPS field defaults to 128 bytes.  Generic software
> can only change that default if it knows that every element that might
> receive a packet from the device can handle it.  In this case, we have
> no information about what the invisible Root Port can handle, so I
> would argue that we cannot change MPS.
> 
> In the lspci above, MPS is set to 256 bytes.  If that was done by
> firmware, it might be safe because it knows things about the Root Port
> that Linux doesn't.  But I don't think the Linux PCI core could set it
> to 256.

So you're suspecting that the firmware/bootloader has configured the
MPS on the E1000E device to 256 bytes ?

Isn't it dangerous for the kernel to rely on the firmware/bootloader
configuration ? Indeed, the firmware/bootloader might have configured
MPS to X bytes on the endpoint, but when the kernel boots and
initializes the PCIe controller, its sets the PCIe controller MPS to Y
bytes, with Y > X.

> ASPM L0s is similar.  We should only enable L0s if we can tell that
> both ends of the link support it.  If there's no Root Port, we don't
> have any ASPM capability information for the upstream end of the link,
> so we shouldn't enable ASPM at all.

Well, even without the Root Port, we are able to use the endpoint
configuration space to figure out whether it supports L0s, and adjust
the root complex configuration accordingly. This is what our patch is
doing for MPS, and which could be done similarly for L0s, no?

> 
> > This is why we need those changes, one to update the PCIe controller
> > MPS according to the Maximum Payload Size acceptable by the endpoint,
> > and one to disable L0s entirely to avoid issues with non-L0s compliant
> > devices.  
> 
> The generic core code should perform minimal, guaranteed-to-work
> configuration using the least information and fewest assumptions
> possible.  That may not lead to optimal performance, but it should at
> least be functional.  This should work even if there is no Root Port.
> 
> Once we have that figured out, then we can worry about whether we can
> or should do platform-specific tweaks to improve performance, e.g.,
> increase MPS if we know Root Port capabilities implicitly.
> 
> I had the impression that these patches were required for correct
> functionality, not just to improve performance.  But maybe I
> misunderstood?

I don't myself have the device that wasn't working, and that this patch
got to work, so I can't double check myself. However, indeed, I was
told that without this fix, some devices would not work.

One question: is it valid/working to have the root complex configured
with MPS = 128 bytes but the endpoint configured with MPS = 256 or 512
bytes ? Or should the MPS value be strictly equal on both sides ?

Depending on your answer, there are two options:

 - It is a valid situation to have a root complex MPS lower than the
   endpoint MPS. In this case, we could for now simply unconditionally
   set the MPS to 128 bytes in the root complex, as a fix to get all
   devices working. And then separately, work on improving performance
   by increasing the MPS according to the endpoint capabilities.

 - It is not valid for the root complex MPS to be different than the
   endpoint MPS. In this case, then I don't see how we can do things
   differently than the proposed patch: we have to see what the
   endpoint MPS is, and adjust the root complex MPS accordingly.
   Indeed, the bootloader/firmware might have changed the endpoint MPS
   so that it is no longer the default of 128 bytes.

Best regards,

Thomas
Bjorn Helgaas Jan. 12, 2018, 7:39 p.m. UTC | #7
On Fri, Jan 12, 2018 at 04:46:44PM +0100, Thomas Petazzoni wrote:
> On Fri, 12 Jan 2018 08:40:24 -0600, Bjorn Helgaas wrote:

> > The Device Control MPS field defaults to 128 bytes.  Generic software
> > can only change that default if it knows that every element that might
> > receive a packet from the device can handle it.  In this case, we have
> > no information about what the invisible Root Port can handle, so I
> > would argue that we cannot change MPS.
> > 
> > In the lspci above, MPS is set to 256 bytes.  If that was done by
> > firmware, it might be safe because it knows things about the Root Port
> > that Linux doesn't.  But I don't think the Linux PCI core could set it
> > to 256.
> 
> So you're suspecting that the firmware/bootloader has configured the
> MPS on the E1000E device to 256 bytes ?

I didn't word that very well.  It looks like *something* set it to
256, but I don't know what.  It's possible Linux did, but I think that
would be a bug and should be fixed.  We'd have to instrument the code
or analyze it more closely than I have.

> Isn't it dangerous for the kernel to rely on the firmware/bootloader
> configuration ? Indeed, the firmware/bootloader might have configured
> MPS to X bytes on the endpoint, but when the kernel boots and
> initializes the PCIe controller, its sets the PCIe controller MPS to Y
> bytes, with Y > X.
> 
> > ASPM L0s is similar.  We should only enable L0s if we can tell that
> > both ends of the link support it.  If there's no Root Port, we don't
> > have any ASPM capability information for the upstream end of the link,
> > so we shouldn't enable ASPM at all.
> 
> Well, even without the Root Port, we are able to use the endpoint
> configuration space to figure out whether it supports L0s, and adjust
> the root complex configuration accordingly. This is what our patch is
> doing for MPS, and which could be done similarly for L0s, no?

Yes, this is where it would get machine-specific.  I don't know how
that should be structured, or even whether it's really worthwhile.  I
think the core should make it *work* with the least-common-denominator
approach, but I'm not convinced that a lot of effort should be put
into optimizing a topology that doesn't follow the spec.  A driver
could do this outside the core, but I think it would be better to put
the effort into making the topology more standard.

Why exactly *doesn't* Aardvark expose the Root Port?  I assume it does
actually exist, since there is actually a link leading to the slot,
and there has to be *something* at the upstream end of that link.

> > I had the impression that these patches were required for correct
> > functionality, not just to improve performance.  But maybe I
> > misunderstood?
> 
> I don't myself have the device that wasn't working, and that this patch
> got to work, so I can't double check myself. However, indeed, I was
> told that without this fix, some devices would not work.
> 
> One question: is it valid/working to have the root complex configured
> with MPS = 128 bytes but the endpoint configured with MPS = 256 or 512
> bytes ? Or should the MPS value be strictly equal on both sides ?

Per PCIe r4.0, sec 2.2.2, a device cannot transmit a TLP with a
payload larger than its MPS.  A device receiving a TLP with a payload
larger than its MPS must treat the TLP as malformed.

I think that means MPS really should be set the same on both sides so
the device can do both DMA reads and writes safely.

> Depending on your answer, there are two options:
> 
>  - It is a valid situation to have a root complex MPS lower than the
>    endpoint MPS. In this case, we could for now simply unconditionally
>    set the MPS to 128 bytes in the root complex, as a fix to get all
>    devices working. And then separately, work on improving performance
>    by increasing the MPS according to the endpoint capabilities.
> 
>  - It is not valid for the root complex MPS to be different than the
>    endpoint MPS. In this case, then I don't see how we can do things
>    differently than the proposed patch: we have to see what the
>    endpoint MPS is, and adjust the root complex MPS accordingly.
>    Indeed, the bootloader/firmware might have changed the endpoint MPS
>    so that it is no longer the default of 128 bytes.

All devices are guaranteed to support MPS = 128 bytes, so if you set
the Root Port to that in the driver, we should be able to make the PCI
core leave (or set, if necessary) all devices with MPS = 128.

I think we should start with that first, then worry about performance
optimizations separately.  I guess I'm still just shaking my head over
the invisible Root Port mystery.  I think the other cases I know about
are related to virtualization, where I can sort of understand why the
Root Port is missing, but I don't think that's the situation here.

Bjorn
diff mbox

Patch

diff --git a/drivers/pci/host/pci-aardvark.c b/drivers/pci/host/pci-aardvark.c
index af7a9c4a61a4..c8a97bad6c4c 100644
--- a/drivers/pci/host/pci-aardvark.c
+++ b/drivers/pci/host/pci-aardvark.c
@@ -30,8 +30,10 @@ 
 #define PCIE_CORE_DEV_CTRL_STATS_REG				0xc8
 #define     PCIE_CORE_DEV_CTRL_STATS_RELAX_ORDER_DISABLE	(0 << 4)
 #define     PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT	5
+#define     PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ		0x2
 #define     PCIE_CORE_DEV_CTRL_STATS_SNOOP_DISABLE		(0 << 11)
 #define     PCIE_CORE_DEV_CTRL_STATS_MAX_RD_REQ_SIZE_SHIFT	12
+#define     PCIE_CORE_MPS_UNIT_BYTE				128
 #define PCIE_CORE_LINK_CTRL_STAT_REG				0xd0
 #define     PCIE_CORE_LINK_L0S_ENTRY				BIT(0)
 #define     PCIE_CORE_LINK_TRAINING				BIT(5)
@@ -297,7 +299,8 @@  static void advk_pcie_setup_hw(struct advk_pcie *pcie)
 
 	/* Set PCIe Device Control and Status 1 PF0 register */
 	reg = PCIE_CORE_DEV_CTRL_STATS_RELAX_ORDER_DISABLE |
-		(7 << PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT) |
+		(PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ <<
+		 PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT) |
 		PCIE_CORE_DEV_CTRL_STATS_SNOOP_DISABLE |
 		PCIE_CORE_DEV_CTRL_STATS_MAX_RD_REQ_SIZE_SHIFT;
 	advk_writel(pcie, reg, PCIE_CORE_DEV_CTRL_STATS_REG);
@@ -879,6 +882,58 @@  static int advk_pcie_parse_request_of_pci_ranges(struct advk_pcie *pcie)
 	return err;
 }
 
+static int advk_pcie_find_smpss(struct pci_dev *dev, void *data)
+{
+	u8 *smpss = data;
+
+	if (!dev)
+		return 0;
+
+	if (!pci_is_pcie(dev))
+		return 0;
+
+	if (*smpss > dev->pcie_mpss)
+		*smpss = dev->pcie_mpss;
+
+	return 0;
+}
+
+static int advk_pcie_bus_configure_mps(struct pci_dev *dev, void *data)
+{
+	int mps;
+
+	if (!dev)
+		return 0;
+
+	if (!pci_is_pcie(dev))
+		return 0;
+
+	mps = PCIE_CORE_MPS_UNIT_BYTE << *(u8 *)data;
+	pcie_set_mps(dev, mps);
+
+	return 0;
+}
+
+static void advk_pcie_configure_mps(struct pci_bus *bus, struct advk_pcie *pcie)
+{
+	u8 smpss = PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ;
+	u32 reg;
+
+	/* Find the minimal supported MAX payload size */
+	advk_pcie_find_smpss(bus->self, &smpss);
+	pci_walk_bus(bus, advk_pcie_find_smpss, &smpss);
+
+	/* Configure RC MAX payload size */
+	reg = advk_readl(pcie, PCIE_CORE_DEV_CTRL_STATS_REG);
+	reg &= ~PCI_EXP_DEVCTL_PAYLOAD;
+	reg |= smpss << PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT;
+	advk_writel(pcie, reg, PCIE_CORE_DEV_CTRL_STATS_REG);
+
+	/* Configure device MAX payload size */
+	advk_pcie_bus_configure_mps(bus->self, &smpss);
+	pci_walk_bus(bus, advk_pcie_bus_configure_mps, &smpss);
+}
+
 static int advk_pcie_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
@@ -950,6 +1005,9 @@  static int advk_pcie_probe(struct platform_device *pdev)
 	list_for_each_entry(child, &bus->children, node)
 		pcie_bus_configure_settings(child);
 
+	/* Configure the MAX pay load size */
+	advk_pcie_configure_mps(bus, pcie);
+
 	pci_bus_add_devices(bus);
 	return 0;
 }