Use maximum latency when determining L1/L0s ASPM v2
diff mbox series

Message ID 20200803145832.11234-1-ian.kumlien@gmail.com
State Changes Requested
Delegated to: Bjorn Helgaas
Headers show
Series
  • Use maximum latency when determining L1/L0s ASPM v2
Related show

Commit Message

Ian Kumlien Aug. 3, 2020, 2:58 p.m. UTC
Changes:
* Handle L0s correclty as well, making it per direction
* Moved the switch cost in to the if statement since a non L1 switch has
  no additional cost.

For L0s:
We sumarize the entire latency per direction to see if it's acceptable
for the PCIe endpoint.

If it's not, we clear the link for the path that had too large latency.

For L1:
Currently we check the maximum latency of upstream and downstream
per link, not the maximum for the path

This would work if all links have the same latency, but:
endpoint -> c -> b -> a -> root  (in the order we walk the path)

If c or b has the higest latency, it will not register

Fix this by maintaining the maximum latency value for the path

This change fixes a regression introduced (but not caused) by:
66ff14e59e8a (PCI/ASPM: Allow ASPM on links to PCIe-to-PCI/PCI-X Bridges)

Signed-off-by: Ian Kumlien <ian.kumlien@gmail.com>
---
 drivers/pci/pcie/aspm.c | 41 ++++++++++++++++++++++++++---------------
 1 file changed, 26 insertions(+), 15 deletions(-)

Comments

Ian Kumlien Aug. 15, 2020, 7:39 p.m. UTC | #1
Hi again,

Just trying to bump and also asking a question ...

That 1 us, could that be related to L0s latency - so should we add a
potential workaround by doing
max_t(u32, 1000, L0s.up + L0s.dw) to ensure that the time is never
greater on any hardware ;)

And also warn about it... =)

On Mon, Aug 3, 2020 at 4:58 PM Ian Kumlien <ian.kumlien@gmail.com> wrote:
>
> Changes:
> * Handle L0s correclty as well, making it per direction
> * Moved the switch cost in to the if statement since a non L1 switch has
>   no additional cost.
>
> For L0s:
> We sumarize the entire latency per direction to see if it's acceptable
> for the PCIe endpoint.
>
> If it's not, we clear the link for the path that had too large latency.
>
> For L1:
> Currently we check the maximum latency of upstream and downstream
> per link, not the maximum for the path
>
> This would work if all links have the same latency, but:
> endpoint -> c -> b -> a -> root  (in the order we walk the path)
>
> If c or b has the higest latency, it will not register
>
> Fix this by maintaining the maximum latency value for the path
>
> This change fixes a regression introduced (but not caused) by:
> 66ff14e59e8a (PCI/ASPM: Allow ASPM on links to PCIe-to-PCI/PCI-X Bridges)
>
> Signed-off-by: Ian Kumlien <ian.kumlien@gmail.com>
> ---
>  drivers/pci/pcie/aspm.c | 41 ++++++++++++++++++++++++++---------------
>  1 file changed, 26 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
> index b17e5ffd31b1..bc512e217258 100644
> --- a/drivers/pci/pcie/aspm.c
> +++ b/drivers/pci/pcie/aspm.c
> @@ -434,7 +434,8 @@ static void pcie_get_aspm_reg(struct pci_dev *pdev,
>
>  static void pcie_aspm_check_latency(struct pci_dev *endpoint)
>  {
> -       u32 latency, l1_switch_latency = 0;
> +       u32 latency, l1_max_latency = 0, l1_switch_latency = 0,
> +               l0s_latency_up = 0, l0s_latency_dw = 0;
>         struct aspm_latency *acceptable;
>         struct pcie_link_state *link;
>
> @@ -447,15 +448,22 @@ static void pcie_aspm_check_latency(struct pci_dev *endpoint)
>         acceptable = &link->acceptable[PCI_FUNC(endpoint->devfn)];
>
>         while (link) {
> -               /* Check upstream direction L0s latency */
> -               if ((link->aspm_capable & ASPM_STATE_L0S_UP) &&
> -                   (link->latency_up.l0s > acceptable->l0s))
> -                       link->aspm_capable &= ~ASPM_STATE_L0S_UP;
> -
> -               /* Check downstream direction L0s latency */
> -               if ((link->aspm_capable & ASPM_STATE_L0S_DW) &&
> -                   (link->latency_dw.l0s > acceptable->l0s))
> -                       link->aspm_capable &= ~ASPM_STATE_L0S_DW;
> +               if (link->aspm_capable & ASPM_STATE_L0S) {
> +                       /* Check upstream direction L0s latency */
> +                       if (link->aspm_capable & ASPM_STATE_L0S_UP) {
> +                               l0s_latency_up += link->latency_up.l0s;
> +                               if (l0s_latency_up > acceptable->l0s)
> +                                       link->aspm_capable &= ~ASPM_STATE_L0S_UP;
> +                       }
> +
> +                       /* Check downstream direction L0s latency */
> +                       if (link->aspm_capable & ASPM_STATE_L0S_DW) {
> +                               l0s_latency_dw += link->latency_dw.l0s;
> +                               if (l0s_latency_dw > acceptable->l0s)
> +                                       link->aspm_capable &= ~ASPM_STATE_L0S_DW;
> +                       }
> +               }
> +
>                 /*
>                  * Check L1 latency.
>                  * Every switch on the path to root complex need 1
> @@ -469,11 +477,14 @@ static void pcie_aspm_check_latency(struct pci_dev *endpoint)
>                  * L1 exit latencies advertised by a device include L1
>                  * substate latencies (and hence do not do any check).
>                  */
> -               latency = max_t(u32, link->latency_up.l1, link->latency_dw.l1);
> -               if ((link->aspm_capable & ASPM_STATE_L1) &&
> -                   (latency + l1_switch_latency > acceptable->l1))
> -                       link->aspm_capable &= ~ASPM_STATE_L1;
> -               l1_switch_latency += 1000;
> +               if (link->aspm_capable & ASPM_STATE_L1) {
> +                       latency = max_t(u32, link->latency_up.l1, link->latency_dw.l1);
> +                       l1_max_latency = max_t(u32, latency, l1_max_latency);
> +                       if (l1_max_latency + l1_switch_latency > acceptable->l1)
> +                               link->aspm_capable &= ~ASPM_STATE_L1;
> +
> +                       l1_switch_latency += 1000;
> +               }
>
>                 link = link->parent;
>         }
> --
> 2.28.0
>
Ian Kumlien Sept. 18, 2020, 10:47 p.m. UTC | #2
Any comments on this? I'm still patching my kernels...

On Mon, Aug 3, 2020 at 4:58 PM Ian Kumlien <ian.kumlien@gmail.com> wrote:
>
> Changes:
> * Handle L0s correclty as well, making it per direction
> * Moved the switch cost in to the if statement since a non L1 switch has
>   no additional cost.
>
> For L0s:
> We sumarize the entire latency per direction to see if it's acceptable
> for the PCIe endpoint.
>
> If it's not, we clear the link for the path that had too large latency.
>
> For L1:
> Currently we check the maximum latency of upstream and downstream
> per link, not the maximum for the path
>
> This would work if all links have the same latency, but:
> endpoint -> c -> b -> a -> root  (in the order we walk the path)
>
> If c or b has the higest latency, it will not register
>
> Fix this by maintaining the maximum latency value for the path
>
> This change fixes a regression introduced (but not caused) by:
> 66ff14e59e8a (PCI/ASPM: Allow ASPM on links to PCIe-to-PCI/PCI-X Bridges)
>
> Signed-off-by: Ian Kumlien <ian.kumlien@gmail.com>
> ---
>  drivers/pci/pcie/aspm.c | 41 ++++++++++++++++++++++++++---------------
>  1 file changed, 26 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
> index b17e5ffd31b1..bc512e217258 100644
> --- a/drivers/pci/pcie/aspm.c
> +++ b/drivers/pci/pcie/aspm.c
> @@ -434,7 +434,8 @@ static void pcie_get_aspm_reg(struct pci_dev *pdev,
>
>  static void pcie_aspm_check_latency(struct pci_dev *endpoint)
>  {
> -       u32 latency, l1_switch_latency = 0;
> +       u32 latency, l1_max_latency = 0, l1_switch_latency = 0,
> +               l0s_latency_up = 0, l0s_latency_dw = 0;
>         struct aspm_latency *acceptable;
>         struct pcie_link_state *link;
>
> @@ -447,15 +448,22 @@ static void pcie_aspm_check_latency(struct pci_dev *endpoint)
>         acceptable = &link->acceptable[PCI_FUNC(endpoint->devfn)];
>
>         while (link) {
> -               /* Check upstream direction L0s latency */
> -               if ((link->aspm_capable & ASPM_STATE_L0S_UP) &&
> -                   (link->latency_up.l0s > acceptable->l0s))
> -                       link->aspm_capable &= ~ASPM_STATE_L0S_UP;
> -
> -               /* Check downstream direction L0s latency */
> -               if ((link->aspm_capable & ASPM_STATE_L0S_DW) &&
> -                   (link->latency_dw.l0s > acceptable->l0s))
> -                       link->aspm_capable &= ~ASPM_STATE_L0S_DW;
> +               if (link->aspm_capable & ASPM_STATE_L0S) {
> +                       /* Check upstream direction L0s latency */
> +                       if (link->aspm_capable & ASPM_STATE_L0S_UP) {
> +                               l0s_latency_up += link->latency_up.l0s;
> +                               if (l0s_latency_up > acceptable->l0s)
> +                                       link->aspm_capable &= ~ASPM_STATE_L0S_UP;
> +                       }
> +
> +                       /* Check downstream direction L0s latency */
> +                       if (link->aspm_capable & ASPM_STATE_L0S_DW) {
> +                               l0s_latency_dw += link->latency_dw.l0s;
> +                               if (l0s_latency_dw > acceptable->l0s)
> +                                       link->aspm_capable &= ~ASPM_STATE_L0S_DW;
> +                       }
> +               }
> +
>                 /*
>                  * Check L1 latency.
>                  * Every switch on the path to root complex need 1
> @@ -469,11 +477,14 @@ static void pcie_aspm_check_latency(struct pci_dev *endpoint)
>                  * L1 exit latencies advertised by a device include L1
>                  * substate latencies (and hence do not do any check).
>                  */
> -               latency = max_t(u32, link->latency_up.l1, link->latency_dw.l1);
> -               if ((link->aspm_capable & ASPM_STATE_L1) &&
> -                   (latency + l1_switch_latency > acceptable->l1))
> -                       link->aspm_capable &= ~ASPM_STATE_L1;
> -               l1_switch_latency += 1000;
> +               if (link->aspm_capable & ASPM_STATE_L1) {
> +                       latency = max_t(u32, link->latency_up.l1, link->latency_dw.l1);
> +                       l1_max_latency = max_t(u32, latency, l1_max_latency);
> +                       if (l1_max_latency + l1_switch_latency > acceptable->l1)
> +                               link->aspm_capable &= ~ASPM_STATE_L1;
> +
> +                       l1_switch_latency += 1000;
> +               }
>
>                 link = link->parent;
>         }
> --
> 2.28.0
>
Bjorn Helgaas Sept. 22, 2020, 8:19 p.m. UTC | #3
[+cc Saheed, Puranjay]

On Mon, Aug 03, 2020 at 04:58:32PM +0200, Ian Kumlien wrote:
> Changes:
> * Handle L0s correclty as well, making it per direction
> * Moved the switch cost in to the if statement since a non L1 switch has
>   no additional cost.
> 
> For L0s:
> We sumarize the entire latency per direction to see if it's acceptable
> for the PCIe endpoint.
> 
> If it's not, we clear the link for the path that had too large latency.
> 
> For L1:
> Currently we check the maximum latency of upstream and downstream
> per link, not the maximum for the path
> 
> This would work if all links have the same latency, but:
> endpoint -> c -> b -> a -> root  (in the order we walk the path)
> 
> If c or b has the higest latency, it will not register
> 
> Fix this by maintaining the maximum latency value for the path
> 
> This change fixes a regression introduced (but not caused) by:
> 66ff14e59e8a (PCI/ASPM: Allow ASPM on links to PCIe-to-PCI/PCI-X Bridges)

We need to include some info about the problem here, e.g., the sort of
info hinted at in
https://lore.kernel.org/r/CAA85sZvJQge6ETwF1GkdvK1Mpwazh_cYJcmeZVAohmt0FjbMZg@mail.gmail.com

It would be *really* nice to have "lspci -vv" output for the system
when broken and when working correctly.  If they were attached to a
bugzilla.kernel.org entry, we could include the URL to that here.

> Signed-off-by: Ian Kumlien <ian.kumlien@gmail.com>
> ---
>  drivers/pci/pcie/aspm.c | 41 ++++++++++++++++++++++++++---------------
>  1 file changed, 26 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
> index b17e5ffd31b1..bc512e217258 100644
> --- a/drivers/pci/pcie/aspm.c
> +++ b/drivers/pci/pcie/aspm.c
> @@ -434,7 +434,8 @@ static void pcie_get_aspm_reg(struct pci_dev *pdev,
>  
>  static void pcie_aspm_check_latency(struct pci_dev *endpoint)
>  {
> -	u32 latency, l1_switch_latency = 0;
> +	u32 latency, l1_max_latency = 0, l1_switch_latency = 0,
> +		l0s_latency_up = 0, l0s_latency_dw = 0;
>  	struct aspm_latency *acceptable;
>  	struct pcie_link_state *link;
>  
> @@ -447,15 +448,22 @@ static void pcie_aspm_check_latency(struct pci_dev *endpoint)
>  	acceptable = &link->acceptable[PCI_FUNC(endpoint->devfn)];
>  
>  	while (link) {
> -		/* Check upstream direction L0s latency */
> -		if ((link->aspm_capable & ASPM_STATE_L0S_UP) &&
> -		    (link->latency_up.l0s > acceptable->l0s))
> -			link->aspm_capable &= ~ASPM_STATE_L0S_UP;
> -
> -		/* Check downstream direction L0s latency */
> -		if ((link->aspm_capable & ASPM_STATE_L0S_DW) &&
> -		    (link->latency_dw.l0s > acceptable->l0s))
> -			link->aspm_capable &= ~ASPM_STATE_L0S_DW;
> +		if (link->aspm_capable & ASPM_STATE_L0S) {
> +			/* Check upstream direction L0s latency */
> +			if (link->aspm_capable & ASPM_STATE_L0S_UP) {
> +				l0s_latency_up += link->latency_up.l0s;
> +				if (l0s_latency_up > acceptable->l0s)
> +					link->aspm_capable &= ~ASPM_STATE_L0S_UP;
> +			}
> +
> +			/* Check downstream direction L0s latency */
> +			if (link->aspm_capable & ASPM_STATE_L0S_DW) {
> +				l0s_latency_dw += link->latency_dw.l0s;
> +				if (l0s_latency_dw > acceptable->l0s)
> +					link->aspm_capable &= ~ASPM_STATE_L0S_DW;
> +			}
> +		}

The main point of the above is to accumulate l0s_latency_up and
l0s_latency_dw along the entire path.  I don't understand the
additional:

  if (link->aspm_capable & ASPM_STATE_L0S)

around the whole block.  I don't think it's *wrong*, but unless I'm
missing something it's unnecessary since we already check for
ASPM_STATE_L0S_UP and ASPM_STATE_L0S_DW.  It does make it arguably
more parallel with the ASPM_STATE_L1 case below, but it complicates
the diff enough that I'm not sure it's worth it.

I think this could also be split off as a separate patch to fix the
L0s latency checking.

>  		/*
>  		 * Check L1 latency.
>  		 * Every switch on the path to root complex need 1

Let's take the opportunity to update the comment to add the spec
citation for this additional 1 usec of L1 exit latency for every
switch.  I think it is PCIe r5.0, sec 5.4.1.2.2, which says:

  A Switch is required to initiate an L1 exit transition on its
  Upstream Port Link after no more than 1 μs from the beginning of an
  L1 exit transition on any of its Downstream Port Links.

> @@ -469,11 +477,14 @@ static void pcie_aspm_check_latency(struct pci_dev *endpoint)
>  		 * L1 exit latencies advertised by a device include L1
>  		 * substate latencies (and hence do not do any check).
>  		 */
> -		latency = max_t(u32, link->latency_up.l1, link->latency_dw.l1);
> -		if ((link->aspm_capable & ASPM_STATE_L1) &&
> -		    (latency + l1_switch_latency > acceptable->l1))
> -			link->aspm_capable &= ~ASPM_STATE_L1;
> -		l1_switch_latency += 1000;
> +		if (link->aspm_capable & ASPM_STATE_L1) {
> +			latency = max_t(u32, link->latency_up.l1, link->latency_dw.l1);
> +			l1_max_latency = max_t(u32, latency, l1_max_latency);
> +			if (l1_max_latency + l1_switch_latency > acceptable->l1)
> +				link->aspm_capable &= ~ASPM_STATE_L1;
> +
> +			l1_switch_latency += 1000;
> +		}

This accumulates the 1 usec delays for a Switch to propagate the exit
transition from its Downstream Port to its Upstream Port, but it
doesn't accumulate the L1 exit latencies themselves for the entire
path, does it?  I.e., we don't accumulate "latency" for the whole
path.  Don't we need that?

>  		link = link->parent;
>  	}
> -- 
> 2.28.0
>
Ian Kumlien Sept. 22, 2020, 9:02 p.m. UTC | #4
On Tue, Sep 22, 2020 at 10:19 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> [+cc Saheed, Puranjay]
>
> On Mon, Aug 03, 2020 at 04:58:32PM +0200, Ian Kumlien wrote:
> > Changes:
> > * Handle L0s correclty as well, making it per direction
> > * Moved the switch cost in to the if statement since a non L1 switch has
> >   no additional cost.
> >
> > For L0s:
> > We sumarize the entire latency per direction to see if it's acceptable
> > for the PCIe endpoint.
> >
> > If it's not, we clear the link for the path that had too large latency.
> >
> > For L1:
> > Currently we check the maximum latency of upstream and downstream
> > per link, not the maximum for the path
> >
> > This would work if all links have the same latency, but:
> > endpoint -> c -> b -> a -> root  (in the order we walk the path)
> >
> > If c or b has the higest latency, it will not register
> >
> > Fix this by maintaining the maximum latency value for the path
> >
> > This change fixes a regression introduced (but not caused) by:
> > 66ff14e59e8a (PCI/ASPM: Allow ASPM on links to PCIe-to-PCI/PCI-X Bridges)
>
> We need to include some info about the problem here, e.g., the sort of
> info hinted at in
> https://lore.kernel.org/r/CAA85sZvJQge6ETwF1GkdvK1Mpwazh_cYJcmeZVAohmt0FjbMZg@mail.gmail.com
>
> It would be *really* nice to have "lspci -vv" output for the system
> when broken and when working correctly.  If they were attached to a
> bugzilla.kernel.org entry, we could include the URL to that here.

I did create a bugzilla entry, and i see what you mean, will fix

> > Signed-off-by: Ian Kumlien <ian.kumlien@gmail.com>
> > ---
> >  drivers/pci/pcie/aspm.c | 41 ++++++++++++++++++++++++++---------------
> >  1 file changed, 26 insertions(+), 15 deletions(-)
> >
> > diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
> > index b17e5ffd31b1..bc512e217258 100644
> > --- a/drivers/pci/pcie/aspm.c
> > +++ b/drivers/pci/pcie/aspm.c
> > @@ -434,7 +434,8 @@ static void pcie_get_aspm_reg(struct pci_dev *pdev,
> >
> >  static void pcie_aspm_check_latency(struct pci_dev *endpoint)
> >  {
> > -     u32 latency, l1_switch_latency = 0;
> > +     u32 latency, l1_max_latency = 0, l1_switch_latency = 0,
> > +             l0s_latency_up = 0, l0s_latency_dw = 0;
> >       struct aspm_latency *acceptable;
> >       struct pcie_link_state *link;
> >
> > @@ -447,15 +448,22 @@ static void pcie_aspm_check_latency(struct pci_dev *endpoint)
> >       acceptable = &link->acceptable[PCI_FUNC(endpoint->devfn)];
> >
> >       while (link) {
> > -             /* Check upstream direction L0s latency */
> > -             if ((link->aspm_capable & ASPM_STATE_L0S_UP) &&
> > -                 (link->latency_up.l0s > acceptable->l0s))
> > -                     link->aspm_capable &= ~ASPM_STATE_L0S_UP;
> > -
> > -             /* Check downstream direction L0s latency */
> > -             if ((link->aspm_capable & ASPM_STATE_L0S_DW) &&
> > -                 (link->latency_dw.l0s > acceptable->l0s))
> > -                     link->aspm_capable &= ~ASPM_STATE_L0S_DW;
> > +             if (link->aspm_capable & ASPM_STATE_L0S) {
> > +                     /* Check upstream direction L0s latency */
> > +                     if (link->aspm_capable & ASPM_STATE_L0S_UP) {
> > +                             l0s_latency_up += link->latency_up.l0s;
> > +                             if (l0s_latency_up > acceptable->l0s)
> > +                                     link->aspm_capable &= ~ASPM_STATE_L0S_UP;
> > +                     }
> > +
> > +                     /* Check downstream direction L0s latency */
> > +                     if (link->aspm_capable & ASPM_STATE_L0S_DW) {
> > +                             l0s_latency_dw += link->latency_dw.l0s;
> > +                             if (l0s_latency_dw > acceptable->l0s)
> > +                                     link->aspm_capable &= ~ASPM_STATE_L0S_DW;
> > +                     }
> > +             }
>
> The main point of the above is to accumulate l0s_latency_up and
> l0s_latency_dw along the entire path.  I don't understand the
> additional:
>
>   if (link->aspm_capable & ASPM_STATE_L0S)
>
> around the whole block.  I don't think it's *wrong*, but unless I'm
> missing something it's unnecessary since we already check for
> ASPM_STATE_L0S_UP and ASPM_STATE_L0S_DW.  It does make it arguably
> more parallel with the ASPM_STATE_L1 case below, but it complicates
> the diff enough that I'm not sure it's worth it.

Yeah it's a leftover from a earlier version of the patch actually,
sorry about that

> I think this could also be split off as a separate patch to fix the
> L0s latency checking.

Ok, will do!

> >               /*
> >                * Check L1 latency.
> >                * Every switch on the path to root complex need 1
>
> Let's take the opportunity to update the comment to add the spec
> citation for this additional 1 usec of L1 exit latency for every
> switch.  I think it is PCIe r5.0, sec 5.4.1.2.2, which says:
>
>   A Switch is required to initiate an L1 exit transition on its
>   Upstream Port Link after no more than 1 μs from the beginning of an
>   L1 exit transition on any of its Downstream Port Links.

Will do!

> > @@ -469,11 +477,14 @@ static void pcie_aspm_check_latency(struct pci_dev *endpoint)
> >                * L1 exit latencies advertised by a device include L1
> >                * substate latencies (and hence do not do any check).
> >                */
> > -             latency = max_t(u32, link->latency_up.l1, link->latency_dw.l1);
> > -             if ((link->aspm_capable & ASPM_STATE_L1) &&
> > -                 (latency + l1_switch_latency > acceptable->l1))
> > -                     link->aspm_capable &= ~ASPM_STATE_L1;
> > -             l1_switch_latency += 1000;
> > +             if (link->aspm_capable & ASPM_STATE_L1) {
> > +                     latency = max_t(u32, link->latency_up.l1, link->latency_dw.l1);
> > +                     l1_max_latency = max_t(u32, latency, l1_max_latency);
> > +                     if (l1_max_latency + l1_switch_latency > acceptable->l1)
> > +                             link->aspm_capable &= ~ASPM_STATE_L1;
> > +
> > +                     l1_switch_latency += 1000;
> > +             }
>
> This accumulates the 1 usec delays for a Switch to propagate the exit
> transition from its Downstream Port to its Upstream Port, but it
> doesn't accumulate the L1 exit latencies themselves for the entire
> path, does it?  I.e., we don't accumulate "latency" for the whole
> path.  Don't we need that?

Not for L1's apparently, from what I gather the maximum link latency
is "largest latency" + 1us * number-of-hops

Ie, just like the comment above states - the L1 total time might be
more but  1us is all that is needed to "start" and that propagates
over the link.

> >               link = link->parent;
> >       }
> > --
> > 2.28.0
> >

Included inline for discussion, will send it separately as well - when
i know it's ok :)


So this now became:
commit dde058f731f97b6f86600eb6578c8b02b8720edb (HEAD)
Author: Ian Kumlien <ian.kumlien@gmail.com>
Date:   Tue Sep 22 22:58:19 2020 +0200

    PCIe L0s latency is cumulative from the root

    From what I have been able to figure out, L0s latency is
    cumulative from the root and should be treated as such.

    Signed-off-by: Ian Kumlien <ian.kumlien@gmail.com>

diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index 893b37669087..15d64832a988 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -434,7 +434,8 @@ static void pcie_get_aspm_reg(struct pci_dev *pdev,

 static void pcie_aspm_check_latency(struct pci_dev *endpoint)
 {
-       u32 latency, l1_max_latency = 0, l1_switch_latency = 0;
+       u32 latency, l1_max_latency = 0, l1_switch_latency = 0,
+               l0s_latency_up = 0, l0s_latency_dw = 0;
        struct aspm_latency *acceptable;
        struct pcie_link_state *link;

@@ -448,14 +449,18 @@ static void pcie_aspm_check_latency(struct
pci_dev *endpoint)

        while (link) {
                /* Check upstream direction L0s latency */
-               if ((link->aspm_capable & ASPM_STATE_L0S_UP) &&
-                   (link->latency_up.l0s > acceptable->l0s))
-                       link->aspm_capable &= ~ASPM_STATE_L0S_UP;
+               if (link->aspm_capable & ASPM_STATE_L0S_UP) {
+                       l0s_latency_up += link->latency_up.l0s;
+                       if (l0s_latency_up > acceptable->l0s)
+                               link->aspm_capable &= ~ASPM_STATE_L0S_UP;
+               }

                /* Check downstream direction L0s latency */
-               if ((link->aspm_capable & ASPM_STATE_L0S_DW) &&
-                   (link->latency_dw.l0s > acceptable->l0s))
-                       link->aspm_capable &= ~ASPM_STATE_L0S_DW;
+               if (link->aspm_capable & ASPM_STATE_L0S_DW) {
+                       l0s_latency_dw += link->latency_dw.l0s;
+                       if (l0s_latency_dw > acceptable->l0s)
+                               link->aspm_capable &= ~ASPM_STATE_L0S_DW;
+               }

                /*
                 * Check L1 latency.
--------------------------

And:
commit db3d9c4baf4ab177d87b5cd41f624f5901e7390f
Author: Ian Kumlien <ian.kumlien@gmail.com>
Date:   Sun Jul 26 16:01:15 2020 +0200

    Use maximum latency when determining L1 ASPM

    If it's not, we clear the link for the path that had too large latency.

    Currently we check the maximum latency of upstream and downstream
    per link, not the maximum for the path

    This would work if all links have the same latency, but:
    endpoint -> c -> b -> a -> root  (in the order we walk the path)

    If c or b has the higest latency, it will not register

    Fix this by maintaining the maximum latency value for the path

    See this bugzilla for more information:
    https://bugzilla.kernel.org/show_bug.cgi?id=208741

    This fixes an issue for me where my desktops machines maximum bandwidth
    for remote connections dropped from 933 MBit to ~40 MBit.

    The bug became obvious once we enabled ASPM on all links:
    66ff14e59e8a (PCI/ASPM: Allow ASPM on links to PCIe-to-PCI/PCI-X Bridges)

    Signed-off-by: Ian Kumlien <ian.kumlien@gmail.com>

diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index 253c30cc1967..893b37669087 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -434,7 +434,7 @@ static void pcie_get_aspm_reg(struct pci_dev *pdev,

 static void pcie_aspm_check_latency(struct pci_dev *endpoint)
 {
-       u32 latency, l1_switch_latency = 0;
+       u32 latency, l1_max_latency = 0, l1_switch_latency = 0;
        struct aspm_latency *acceptable;
        struct pcie_link_state *link;

@@ -456,10 +456,14 @@ static void pcie_aspm_check_latency(struct
pci_dev *endpoint)
                if ((link->aspm_capable & ASPM_STATE_L0S_DW) &&
                    (link->latency_dw.l0s > acceptable->l0s))
                        link->aspm_capable &= ~ASPM_STATE_L0S_DW;
+
                /*
                 * Check L1 latency.
-                * Every switch on the path to root complex need 1
-                * more microsecond for L1. Spec doesn't mention L0s.
+                *
+                * PCIe r5.0, sec 5.4.1.2.2 states:
+                * A Switch is required to initiate an L1 exit transition on its
+                * Upstream Port Link after no more than 1 μs from the
beginning of an
+                * L1 exit transition on any of its Downstream Port Links.
                 *
                 * The exit latencies for L1 substates are not advertised
                 * by a device.  Since the spec also doesn't mention a way
@@ -469,11 +473,14 @@ static void pcie_aspm_check_latency(struct
pci_dev *endpoint)
                 * L1 exit latencies advertised by a device include L1
                 * substate latencies (and hence do not do any check).
                 */
-               latency = max_t(u32, link->latency_up.l1, link->latency_dw.l1);
-               if ((link->aspm_capable & ASPM_STATE_L1) &&
-                   (latency + l1_switch_latency > acceptable->l1))
-                       link->aspm_capable &= ~ASPM_STATE_L1;
-               l1_switch_latency += 1000;
+               if (link->aspm_capable & ASPM_STATE_L1) {
+                       latency = max_t(u32, link->latency_up.l1,
link->latency_dw.l1);
+                       l1_max_latency = max_t(u32, latency, l1_max_latency);
+                       if (l1_max_latency + l1_switch_latency > acceptable->l1)
+                               link->aspm_capable &= ~ASPM_STATE_L1;
+
+                       l1_switch_latency += 1000;
+               }

                link = link->parent;
        }
----------------------
Bjorn Helgaas Sept. 22, 2020, 11 p.m. UTC | #5
[+cc Alexander]

On Tue, Sep 22, 2020 at 11:02:35PM +0200, Ian Kumlien wrote:
> On Tue, Sep 22, 2020 at 10:19 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > On Mon, Aug 03, 2020 at 04:58:32PM +0200, Ian Kumlien wrote:

> > > @@ -469,11 +477,14 @@ static void pcie_aspm_check_latency(struct pci_dev *endpoint)
> > >                * L1 exit latencies advertised by a device include L1
> > >                * substate latencies (and hence do not do any check).
> > >                */
> > > -             latency = max_t(u32, link->latency_up.l1, link->latency_dw.l1);
> > > -             if ((link->aspm_capable & ASPM_STATE_L1) &&
> > > -                 (latency + l1_switch_latency > acceptable->l1))
> > > -                     link->aspm_capable &= ~ASPM_STATE_L1;
> > > -             l1_switch_latency += 1000;
> > > +             if (link->aspm_capable & ASPM_STATE_L1) {
> > > +                     latency = max_t(u32, link->latency_up.l1, link->latency_dw.l1);
> > > +                     l1_max_latency = max_t(u32, latency, l1_max_latency);
> > > +                     if (l1_max_latency + l1_switch_latency > acceptable->l1)
> > > +                             link->aspm_capable &= ~ASPM_STATE_L1;
> > > +
> > > +                     l1_switch_latency += 1000;
> > > +             }
> >
> > This accumulates the 1 usec delays for a Switch to propagate the exit
> > transition from its Downstream Port to its Upstream Port, but it
> > doesn't accumulate the L1 exit latencies themselves for the entire
> > path, does it?  I.e., we don't accumulate "latency" for the whole
> > path.  Don't we need that?
> 
> Not for L1's apparently, from what I gather the maximum link latency
> is "largest latency" + 1us * number-of-hops
> 
> Ie, just like the comment above states - the L1 total time might be
> more but  1us is all that is needed to "start" and that propagates
> over the link.

Ah, you're right!  I don't think this is clear from the existing code
comment, but it *is* clear from the example in sec 5.4.1.2.2 (Figure
5-8) of the spec.

> @@ -448,14 +449,18 @@ static void pcie_aspm_check_latency(struct
> pci_dev *endpoint)
> 
>         while (link) {
>                 /* Check upstream direction L0s latency */
> -               if ((link->aspm_capable & ASPM_STATE_L0S_UP) &&
> -                   (link->latency_up.l0s > acceptable->l0s))
> -                       link->aspm_capable &= ~ASPM_STATE_L0S_UP;
> +               if (link->aspm_capable & ASPM_STATE_L0S_UP) {
> +                       l0s_latency_up += link->latency_up.l0s;

It's pretty clear from sec 5.4.1.2.2 that we *don't* need to
accumulate the L1 exit latencies.  Unfortunately sec 5.4.1.1.2 about
L0s exit doesn't have such a nice example.

The L0s *language* is similar though:

  5.4.1.1.2: If the Upstream component is a Switch (i.e., it is not
  the Root Complex), then it must initiate a transition on its
  Upstream Port Transmit Lanes (if the Upstream Port's Transmit Lanes
  are in a low-power state) as soon as it detects an exit from L0s on
  any of its Downstream Ports.

  5.4.1.2.1: A Switch is required to initiate an L1 exit transition on
  its Upstream Port Link after no more than 1 μs from the beginning of
  an L1 exit transition on any of its Downstream Port Links.  during
  L1 exit.

So a switch must start upstream L0s exit "as soon as" it sees L0s exit
on any downstream port, while it must start L1 exit "no more than 1 μs"
after seeing an L1 exit.

And I really can't tell from the spec whether we need to accumulate
the L0s exit latencies or not.  Maybe somebody can clarify this.

> commit db3d9c4baf4ab177d87b5cd41f624f5901e7390f
> Author: Ian Kumlien <ian.kumlien@gmail.com>
> Date:   Sun Jul 26 16:01:15 2020 +0200
> 
>     Use maximum latency when determining L1 ASPM
> 
>     If it's not, we clear the link for the path that had too large latency.
> 
>     Currently we check the maximum latency of upstream and downstream
>     per link, not the maximum for the path
> 
>     This would work if all links have the same latency, but:
>     endpoint -> c -> b -> a -> root  (in the order we walk the path)
> 
>     If c or b has the higest latency, it will not register
> 
>     Fix this by maintaining the maximum latency value for the path
> 
>     See this bugzilla for more information:
>     https://bugzilla.kernel.org/show_bug.cgi?id=208741
> 
>     This fixes an issue for me where my desktops machines maximum bandwidth
>     for remote connections dropped from 933 MBit to ~40 MBit.
> 
>     The bug became obvious once we enabled ASPM on all links:
>     66ff14e59e8a (PCI/ASPM: Allow ASPM on links to PCIe-to-PCI/PCI-X Bridges)

I can't connect the dots here yet.  I don't see a PCIe-to-PCI/PCI-X
bridge in your lspci, so I can't figure out why this commit would make
a difference for you.

IIUC, the problem device is 03:00.0, the Intel I211 NIC.  Here's the
path to it:

  00:01.2 Root Port              to [bus 01-07]
  01:00.0 Switch Upstream Port   to [bus 02-07]
  02:03.0 Switch Downstream Port to [bus 03]
  03:00.0 Endpoint (Intel I211 NIC)

And I think this is the relevant info:

						    LnkCtl    LnkCtl
	   ------DevCap-------  ----LnkCap-------  -Before-  -After--
  00:01.2                                L1 <32us       L1+       L1-
  01:00.0                                L1 <32us       L1+       L1-
  02:03.0                                L1 <32us       L1+       L1+
  03:00.0  L0s <512ns L1 <64us  L0s <2us L1 <16us  L0s- L1-  L0s- L1-

The NIC says it can tolerate at most 512ns of L0s exit latency and at
most 64us of L1 exit latency.

02:03.0 doesn't support L0s, and the NIC itself can't exit L0s that
fast anyway (it can only do <2us), so L0s should be out of the picture
completely.

Before your patch, apparently we (or BIOS) enabled L1 on the link from
00:01.2 to 01:00.0, and partially enabled it on the link from 02:03.0
to 03:00.0.

It looks like we *should* be able to enable L1 on both links since the
exit latency should be <33us (first link starts exit at T=0, completes
by T=32; second link starts exit at T=1, completes by T=33), and
03:00.0 can tolerate up to 64us.

I guess the effect of your patch is to disable L1 on the 00:01.2 -
01:00.0 link?  And that makes the NIC work better?  I am obviously
missing something because I don't understand why the patch does that
or why it works better.

I added Alexander to cc since it sounds like he's helped debug this,
too.

>     Signed-off-by: Ian Kumlien <ian.kumlien@gmail.com>
> 
> diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
> index 253c30cc1967..893b37669087 100644
> --- a/drivers/pci/pcie/aspm.c
> +++ b/drivers/pci/pcie/aspm.c
> @@ -434,7 +434,7 @@ static void pcie_get_aspm_reg(struct pci_dev *pdev,
> 
>  static void pcie_aspm_check_latency(struct pci_dev *endpoint)
>  {
> -       u32 latency, l1_switch_latency = 0;
> +       u32 latency, l1_max_latency = 0, l1_switch_latency = 0;
>         struct aspm_latency *acceptable;
>         struct pcie_link_state *link;
> 
> @@ -456,10 +456,14 @@ static void pcie_aspm_check_latency(struct
> pci_dev *endpoint)
>                 if ((link->aspm_capable & ASPM_STATE_L0S_DW) &&
>                     (link->latency_dw.l0s > acceptable->l0s))
>                         link->aspm_capable &= ~ASPM_STATE_L0S_DW;
> +
>                 /*
>                  * Check L1 latency.
> -                * Every switch on the path to root complex need 1
> -                * more microsecond for L1. Spec doesn't mention L0s.
> +                *
> +                * PCIe r5.0, sec 5.4.1.2.2 states:
> +                * A Switch is required to initiate an L1 exit transition on its
> +                * Upstream Port Link after no more than 1 μs from the
> beginning of an
> +                * L1 exit transition on any of its Downstream Port Links.
>                  *
>                  * The exit latencies for L1 substates are not advertised
>                  * by a device.  Since the spec also doesn't mention a way
> @@ -469,11 +473,14 @@ static void pcie_aspm_check_latency(struct
> pci_dev *endpoint)
>                  * L1 exit latencies advertised by a device include L1
>                  * substate latencies (and hence do not do any check).
>                  */
> -               latency = max_t(u32, link->latency_up.l1, link->latency_dw.l1);
> -               if ((link->aspm_capable & ASPM_STATE_L1) &&
> -                   (latency + l1_switch_latency > acceptable->l1))
> -                       link->aspm_capable &= ~ASPM_STATE_L1;
> -               l1_switch_latency += 1000;
> +               if (link->aspm_capable & ASPM_STATE_L1) {
> +                       latency = max_t(u32, link->latency_up.l1,
> link->latency_dw.l1);
> +                       l1_max_latency = max_t(u32, latency, l1_max_latency);
> +                       if (l1_max_latency + l1_switch_latency > acceptable->l1)
> +                               link->aspm_capable &= ~ASPM_STATE_L1;
> +
> +                       l1_switch_latency += 1000;
> +               }
> 
>                 link = link->parent;
>         }
> ----------------------
Ian Kumlien Sept. 22, 2020, 11:29 p.m. UTC | #6
On Wed, Sep 23, 2020 at 1:00 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> [+cc Alexander]
>
> On Tue, Sep 22, 2020 at 11:02:35PM +0200, Ian Kumlien wrote:
> > On Tue, Sep 22, 2020 at 10:19 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > > On Mon, Aug 03, 2020 at 04:58:32PM +0200, Ian Kumlien wrote:
>
> > > > @@ -469,11 +477,14 @@ static void pcie_aspm_check_latency(struct pci_dev *endpoint)
> > > >                * L1 exit latencies advertised by a device include L1
> > > >                * substate latencies (and hence do not do any check).
> > > >                */
> > > > -             latency = max_t(u32, link->latency_up.l1, link->latency_dw.l1);
> > > > -             if ((link->aspm_capable & ASPM_STATE_L1) &&
> > > > -                 (latency + l1_switch_latency > acceptable->l1))
> > > > -                     link->aspm_capable &= ~ASPM_STATE_L1;
> > > > -             l1_switch_latency += 1000;
> > > > +             if (link->aspm_capable & ASPM_STATE_L1) {
> > > > +                     latency = max_t(u32, link->latency_up.l1, link->latency_dw.l1);
> > > > +                     l1_max_latency = max_t(u32, latency, l1_max_latency);
> > > > +                     if (l1_max_latency + l1_switch_latency > acceptable->l1)
> > > > +                             link->aspm_capable &= ~ASPM_STATE_L1;
> > > > +
> > > > +                     l1_switch_latency += 1000;
> > > > +             }
> > >
> > > This accumulates the 1 usec delays for a Switch to propagate the exit
> > > transition from its Downstream Port to its Upstream Port, but it
> > > doesn't accumulate the L1 exit latencies themselves for the entire
> > > path, does it?  I.e., we don't accumulate "latency" for the whole
> > > path.  Don't we need that?
> >
> > Not for L1's apparently, from what I gather the maximum link latency
> > is "largest latency" + 1us * number-of-hops
> >
> > Ie, just like the comment above states - the L1 total time might be
> > more but  1us is all that is needed to "start" and that propagates
> > over the link.
>
> Ah, you're right!  I don't think this is clear from the existing code
> comment, but it *is* clear from the example in sec 5.4.1.2.2 (Figure
> 5-8) of the spec.
>
> > @@ -448,14 +449,18 @@ static void pcie_aspm_check_latency(struct
> > pci_dev *endpoint)
> >
> >         while (link) {
> >                 /* Check upstream direction L0s latency */
> > -               if ((link->aspm_capable & ASPM_STATE_L0S_UP) &&
> > -                   (link->latency_up.l0s > acceptable->l0s))
> > -                       link->aspm_capable &= ~ASPM_STATE_L0S_UP;
> > +               if (link->aspm_capable & ASPM_STATE_L0S_UP) {
> > +                       l0s_latency_up += link->latency_up.l0s;
>
> It's pretty clear from sec 5.4.1.2.2 that we *don't* need to
> accumulate the L1 exit latencies.  Unfortunately sec 5.4.1.1.2 about
> L0s exit doesn't have such a nice example.
>
> The L0s *language* is similar though:
>
>   5.4.1.1.2: If the Upstream component is a Switch (i.e., it is not
>   the Root Complex), then it must initiate a transition on its
>   Upstream Port Transmit Lanes (if the Upstream Port's Transmit Lanes
>   are in a low-power state) as soon as it detects an exit from L0s on
>   any of its Downstream Ports.

"it detects an exit"

>   5.4.1.2.1: A Switch is required to initiate an L1 exit transition on
>   its Upstream Port Link after no more than 1 μs from the beginning of
>   an L1 exit transition on any of its Downstream Port Links.  during
>   L1 exit.

vs "from the beginning of"

So to me, this looks like edge triggering - only sense i could make of
it would be cumulative

(you should also note that i have no L0s issues, but I suspect that
the code is wrong currently)

> So a switch must start upstream L0s exit "as soon as" it sees L0s exit
> on any downstream port, while it must start L1 exit "no more than 1 μs"
> after seeing an L1 exit.
>
> And I really can't tell from the spec whether we need to accumulate
> the L0s exit latencies or not.  Maybe somebody can clarify this.
>
> > commit db3d9c4baf4ab177d87b5cd41f624f5901e7390f
> > Author: Ian Kumlien <ian.kumlien@gmail.com>
> > Date:   Sun Jul 26 16:01:15 2020 +0200
> >
> >     Use maximum latency when determining L1 ASPM
> >
> >     If it's not, we clear the link for the path that had too large latency.
> >
> >     Currently we check the maximum latency of upstream and downstream
> >     per link, not the maximum for the path
> >
> >     This would work if all links have the same latency, but:
> >     endpoint -> c -> b -> a -> root  (in the order we walk the path)
> >
> >     If c or b has the higest latency, it will not register
> >
> >     Fix this by maintaining the maximum latency value for the path
> >
> >     See this bugzilla for more information:
> >     https://bugzilla.kernel.org/show_bug.cgi?id=208741
> >
> >     This fixes an issue for me where my desktops machines maximum bandwidth
> >     for remote connections dropped from 933 MBit to ~40 MBit.
> >
> >     The bug became obvious once we enabled ASPM on all links:
> >     66ff14e59e8a (PCI/ASPM: Allow ASPM on links to PCIe-to-PCI/PCI-X Bridges)
>
> I can't connect the dots here yet.  I don't see a PCIe-to-PCI/PCI-X
> bridge in your lspci, so I can't figure out why this commit would make
> a difference for you.
>
> IIUC, the problem device is 03:00.0, the Intel I211 NIC.  Here's the
> path to it:
>
>   00:01.2 Root Port              to [bus 01-07]
>   01:00.0 Switch Upstream Port   to [bus 02-07]
>   02:03.0 Switch Downstream Port to [bus 03]
>   03:00.0 Endpoint (Intel I211 NIC)
>
> And I think this is the relevant info:
>
>                                                     LnkCtl    LnkCtl
>            ------DevCap-------  ----LnkCap-------  -Before-  -After--
>   00:01.2                                L1 <32us       L1+       L1-
>   01:00.0                                L1 <32us       L1+       L1-
>   02:03.0                                L1 <32us       L1+       L1+
>   03:00.0  L0s <512ns L1 <64us  L0s <2us L1 <16us  L0s- L1-  L0s- L1-
>
> The NIC says it can tolerate at most 512ns of L0s exit latency and at
> most 64us of L1 exit latency.
>
> 02:03.0 doesn't support L0s, and the NIC itself can't exit L0s that
> fast anyway (it can only do <2us), so L0s should be out of the picture
> completely.
>
> Before your patch, apparently we (or BIOS) enabled L1 on the link from
> 00:01.2 to 01:00.0, and partially enabled it on the link from 02:03.0
> to 03:00.0.

According to the spec, this is managed by the OS - which was the
change introduced...

> It looks like we *should* be able to enable L1 on both links since the
> exit latency should be <33us (first link starts exit at T=0, completes
> by T=32; second link starts exit at T=1, completes by T=33), and
> 03:00.0 can tolerate up to 64us.
>
> I guess the effect of your patch is to disable L1 on the 00:01.2 -
> 01:00.0 link?  And that makes the NIC work better?  I am obviously
> missing something because I don't understand why the patch does that
> or why it works better.

It makes it work like normal again, like if i disable ASPM on the nic itself...

I don't know which value that reflects, up or down - since we do max
of both values and
it actually disables ASPM.

What we can see is that the first device that passes the threshold is 01:00.0

How can I read more data from PCIe without needing to add kprint...

This is what lspci uses apparently:
#define  PCI_EXP_LNKCAP_L0S     0x07000 /* L0s Exit Latency */
#define  PCI_EXP_LNKCAP_L1      0x38000 /* L1 Exit Latency */

But which latencies are those? up or down?

> I added Alexander to cc since it sounds like he's helped debug this,
> too.
>
> >     Signed-off-by: Ian Kumlien <ian.kumlien@gmail.com>
> >
> > diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
> > index 253c30cc1967..893b37669087 100644
> > --- a/drivers/pci/pcie/aspm.c
> > +++ b/drivers/pci/pcie/aspm.c
> > @@ -434,7 +434,7 @@ static void pcie_get_aspm_reg(struct pci_dev *pdev,
> >
> >  static void pcie_aspm_check_latency(struct pci_dev *endpoint)
> >  {
> > -       u32 latency, l1_switch_latency = 0;
> > +       u32 latency, l1_max_latency = 0, l1_switch_latency = 0;
> >         struct aspm_latency *acceptable;
> >         struct pcie_link_state *link;
> >
> > @@ -456,10 +456,14 @@ static void pcie_aspm_check_latency(struct
> > pci_dev *endpoint)
> >                 if ((link->aspm_capable & ASPM_STATE_L0S_DW) &&
> >                     (link->latency_dw.l0s > acceptable->l0s))
> >                         link->aspm_capable &= ~ASPM_STATE_L0S_DW;
> > +
> >                 /*
> >                  * Check L1 latency.
> > -                * Every switch on the path to root complex need 1
> > -                * more microsecond for L1. Spec doesn't mention L0s.
> > +                *
> > +                * PCIe r5.0, sec 5.4.1.2.2 states:
> > +                * A Switch is required to initiate an L1 exit transition on its
> > +                * Upstream Port Link after no more than 1 μs from the
> > beginning of an
> > +                * L1 exit transition on any of its Downstream Port Links.
> >                  *
> >                  * The exit latencies for L1 substates are not advertised
> >                  * by a device.  Since the spec also doesn't mention a way
> > @@ -469,11 +473,14 @@ static void pcie_aspm_check_latency(struct
> > pci_dev *endpoint)
> >                  * L1 exit latencies advertised by a device include L1
> >                  * substate latencies (and hence do not do any check).
> >                  */
> > -               latency = max_t(u32, link->latency_up.l1, link->latency_dw.l1);
> > -               if ((link->aspm_capable & ASPM_STATE_L1) &&
> > -                   (latency + l1_switch_latency > acceptable->l1))
> > -                       link->aspm_capable &= ~ASPM_STATE_L1;
> > -               l1_switch_latency += 1000;
> > +               if (link->aspm_capable & ASPM_STATE_L1) {
> > +                       latency = max_t(u32, link->latency_up.l1,
> > link->latency_dw.l1);
> > +                       l1_max_latency = max_t(u32, latency, l1_max_latency);
> > +                       if (l1_max_latency + l1_switch_latency > acceptable->l1)
> > +                               link->aspm_capable &= ~ASPM_STATE_L1;
> > +
> > +                       l1_switch_latency += 1000;
> > +               }
> >
> >                 link = link->parent;
> >         }
> > ----------------------
Ian Kumlien Sept. 22, 2020, 11:31 p.m. UTC | #7
Actually, from reading the code, it should theoretically only be up...

since:
        /*
         * Re-read upstream/downstream components' register state
         * after clock configuration
         */
        pcie_get_aspm_reg(parent, &upreg);
        pcie_get_aspm_reg(child, &dwreg);
...

But the max was there before? And also, it fixes actual issues? I'm
off to bed.... :)

On Wed, Sep 23, 2020 at 1:29 AM Ian Kumlien <ian.kumlien@gmail.com> wrote:
>
> On Wed, Sep 23, 2020 at 1:00 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> >
> > [+cc Alexander]
> >
> > On Tue, Sep 22, 2020 at 11:02:35PM +0200, Ian Kumlien wrote:
> > > On Tue, Sep 22, 2020 at 10:19 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > > > On Mon, Aug 03, 2020 at 04:58:32PM +0200, Ian Kumlien wrote:
> >
> > > > > @@ -469,11 +477,14 @@ static void pcie_aspm_check_latency(struct pci_dev *endpoint)
> > > > >                * L1 exit latencies advertised by a device include L1
> > > > >                * substate latencies (and hence do not do any check).
> > > > >                */
> > > > > -             latency = max_t(u32, link->latency_up.l1, link->latency_dw.l1);
> > > > > -             if ((link->aspm_capable & ASPM_STATE_L1) &&
> > > > > -                 (latency + l1_switch_latency > acceptable->l1))
> > > > > -                     link->aspm_capable &= ~ASPM_STATE_L1;
> > > > > -             l1_switch_latency += 1000;
> > > > > +             if (link->aspm_capable & ASPM_STATE_L1) {
> > > > > +                     latency = max_t(u32, link->latency_up.l1, link->latency_dw.l1);
> > > > > +                     l1_max_latency = max_t(u32, latency, l1_max_latency);
> > > > > +                     if (l1_max_latency + l1_switch_latency > acceptable->l1)
> > > > > +                             link->aspm_capable &= ~ASPM_STATE_L1;
> > > > > +
> > > > > +                     l1_switch_latency += 1000;
> > > > > +             }
> > > >
> > > > This accumulates the 1 usec delays for a Switch to propagate the exit
> > > > transition from its Downstream Port to its Upstream Port, but it
> > > > doesn't accumulate the L1 exit latencies themselves for the entire
> > > > path, does it?  I.e., we don't accumulate "latency" for the whole
> > > > path.  Don't we need that?
> > >
> > > Not for L1's apparently, from what I gather the maximum link latency
> > > is "largest latency" + 1us * number-of-hops
> > >
> > > Ie, just like the comment above states - the L1 total time might be
> > > more but  1us is all that is needed to "start" and that propagates
> > > over the link.
> >
> > Ah, you're right!  I don't think this is clear from the existing code
> > comment, but it *is* clear from the example in sec 5.4.1.2.2 (Figure
> > 5-8) of the spec.
> >
> > > @@ -448,14 +449,18 @@ static void pcie_aspm_check_latency(struct
> > > pci_dev *endpoint)
> > >
> > >         while (link) {
> > >                 /* Check upstream direction L0s latency */
> > > -               if ((link->aspm_capable & ASPM_STATE_L0S_UP) &&
> > > -                   (link->latency_up.l0s > acceptable->l0s))
> > > -                       link->aspm_capable &= ~ASPM_STATE_L0S_UP;
> > > +               if (link->aspm_capable & ASPM_STATE_L0S_UP) {
> > > +                       l0s_latency_up += link->latency_up.l0s;
> >
> > It's pretty clear from sec 5.4.1.2.2 that we *don't* need to
> > accumulate the L1 exit latencies.  Unfortunately sec 5.4.1.1.2 about
> > L0s exit doesn't have such a nice example.
> >
> > The L0s *language* is similar though:
> >
> >   5.4.1.1.2: If the Upstream component is a Switch (i.e., it is not
> >   the Root Complex), then it must initiate a transition on its
> >   Upstream Port Transmit Lanes (if the Upstream Port's Transmit Lanes
> >   are in a low-power state) as soon as it detects an exit from L0s on
> >   any of its Downstream Ports.
>
> "it detects an exit"
>
> >   5.4.1.2.1: A Switch is required to initiate an L1 exit transition on
> >   its Upstream Port Link after no more than 1 μs from the beginning of
> >   an L1 exit transition on any of its Downstream Port Links.  during
> >   L1 exit.
>
> vs "from the beginning of"
>
> So to me, this looks like edge triggering - only sense i could make of
> it would be cumulative
>
> (you should also note that i have no L0s issues, but I suspect that
> the code is wrong currently)
>
> > So a switch must start upstream L0s exit "as soon as" it sees L0s exit
> > on any downstream port, while it must start L1 exit "no more than 1 μs"
> > after seeing an L1 exit.
> >
> > And I really can't tell from the spec whether we need to accumulate
> > the L0s exit latencies or not.  Maybe somebody can clarify this.
> >
> > > commit db3d9c4baf4ab177d87b5cd41f624f5901e7390f
> > > Author: Ian Kumlien <ian.kumlien@gmail.com>
> > > Date:   Sun Jul 26 16:01:15 2020 +0200
> > >
> > >     Use maximum latency when determining L1 ASPM
> > >
> > >     If it's not, we clear the link for the path that had too large latency.
> > >
> > >     Currently we check the maximum latency of upstream and downstream
> > >     per link, not the maximum for the path
> > >
> > >     This would work if all links have the same latency, but:
> > >     endpoint -> c -> b -> a -> root  (in the order we walk the path)
> > >
> > >     If c or b has the higest latency, it will not register
> > >
> > >     Fix this by maintaining the maximum latency value for the path
> > >
> > >     See this bugzilla for more information:
> > >     https://bugzilla.kernel.org/show_bug.cgi?id=208741
> > >
> > >     This fixes an issue for me where my desktops machines maximum bandwidth
> > >     for remote connections dropped from 933 MBit to ~40 MBit.
> > >
> > >     The bug became obvious once we enabled ASPM on all links:
> > >     66ff14e59e8a (PCI/ASPM: Allow ASPM on links to PCIe-to-PCI/PCI-X Bridges)
> >
> > I can't connect the dots here yet.  I don't see a PCIe-to-PCI/PCI-X
> > bridge in your lspci, so I can't figure out why this commit would make
> > a difference for you.
> >
> > IIUC, the problem device is 03:00.0, the Intel I211 NIC.  Here's the
> > path to it:
> >
> >   00:01.2 Root Port              to [bus 01-07]
> >   01:00.0 Switch Upstream Port   to [bus 02-07]
> >   02:03.0 Switch Downstream Port to [bus 03]
> >   03:00.0 Endpoint (Intel I211 NIC)
> >
> > And I think this is the relevant info:
> >
> >                                                     LnkCtl    LnkCtl
> >            ------DevCap-------  ----LnkCap-------  -Before-  -After--
> >   00:01.2                                L1 <32us       L1+       L1-
> >   01:00.0                                L1 <32us       L1+       L1-
> >   02:03.0                                L1 <32us       L1+       L1+
> >   03:00.0  L0s <512ns L1 <64us  L0s <2us L1 <16us  L0s- L1-  L0s- L1-
> >
> > The NIC says it can tolerate at most 512ns of L0s exit latency and at
> > most 64us of L1 exit latency.
> >
> > 02:03.0 doesn't support L0s, and the NIC itself can't exit L0s that
> > fast anyway (it can only do <2us), so L0s should be out of the picture
> > completely.
> >
> > Before your patch, apparently we (or BIOS) enabled L1 on the link from
> > 00:01.2 to 01:00.0, and partially enabled it on the link from 02:03.0
> > to 03:00.0.
>
> According to the spec, this is managed by the OS - which was the
> change introduced...
>
> > It looks like we *should* be able to enable L1 on both links since the
> > exit latency should be <33us (first link starts exit at T=0, completes
> > by T=32; second link starts exit at T=1, completes by T=33), and
> > 03:00.0 can tolerate up to 64us.
> >
> > I guess the effect of your patch is to disable L1 on the 00:01.2 -
> > 01:00.0 link?  And that makes the NIC work better?  I am obviously
> > missing something because I don't understand why the patch does that
> > or why it works better.
>
> It makes it work like normal again, like if i disable ASPM on the nic itself...
>
> I don't know which value that reflects, up or down - since we do max
> of both values and
> it actually disables ASPM.
>
> What we can see is that the first device that passes the threshold is 01:00.0
>
> How can I read more data from PCIe without needing to add kprint...
>
> This is what lspci uses apparently:
> #define  PCI_EXP_LNKCAP_L0S     0x07000 /* L0s Exit Latency */
> #define  PCI_EXP_LNKCAP_L1      0x38000 /* L1 Exit Latency */
>
> But which latencies are those? up or down?
>
> > I added Alexander to cc since it sounds like he's helped debug this,
> > too.
> >
> > >     Signed-off-by: Ian Kumlien <ian.kumlien@gmail.com>
> > >
> > > diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
> > > index 253c30cc1967..893b37669087 100644
> > > --- a/drivers/pci/pcie/aspm.c
> > > +++ b/drivers/pci/pcie/aspm.c
> > > @@ -434,7 +434,7 @@ static void pcie_get_aspm_reg(struct pci_dev *pdev,
> > >
> > >  static void pcie_aspm_check_latency(struct pci_dev *endpoint)
> > >  {
> > > -       u32 latency, l1_switch_latency = 0;
> > > +       u32 latency, l1_max_latency = 0, l1_switch_latency = 0;
> > >         struct aspm_latency *acceptable;
> > >         struct pcie_link_state *link;
> > >
> > > @@ -456,10 +456,14 @@ static void pcie_aspm_check_latency(struct
> > > pci_dev *endpoint)
> > >                 if ((link->aspm_capable & ASPM_STATE_L0S_DW) &&
> > >                     (link->latency_dw.l0s > acceptable->l0s))
> > >                         link->aspm_capable &= ~ASPM_STATE_L0S_DW;
> > > +
> > >                 /*
> > >                  * Check L1 latency.
> > > -                * Every switch on the path to root complex need 1
> > > -                * more microsecond for L1. Spec doesn't mention L0s.
> > > +                *
> > > +                * PCIe r5.0, sec 5.4.1.2.2 states:
> > > +                * A Switch is required to initiate an L1 exit transition on its
> > > +                * Upstream Port Link after no more than 1 μs from the
> > > beginning of an
> > > +                * L1 exit transition on any of its Downstream Port Links.
> > >                  *
> > >                  * The exit latencies for L1 substates are not advertised
> > >                  * by a device.  Since the spec also doesn't mention a way
> > > @@ -469,11 +473,14 @@ static void pcie_aspm_check_latency(struct
> > > pci_dev *endpoint)
> > >                  * L1 exit latencies advertised by a device include L1
> > >                  * substate latencies (and hence do not do any check).
> > >                  */
> > > -               latency = max_t(u32, link->latency_up.l1, link->latency_dw.l1);
> > > -               if ((link->aspm_capable & ASPM_STATE_L1) &&
> > > -                   (latency + l1_switch_latency > acceptable->l1))
> > > -                       link->aspm_capable &= ~ASPM_STATE_L1;
> > > -               l1_switch_latency += 1000;
> > > +               if (link->aspm_capable & ASPM_STATE_L1) {
> > > +                       latency = max_t(u32, link->latency_up.l1,
> > > link->latency_dw.l1);
> > > +                       l1_max_latency = max_t(u32, latency, l1_max_latency);
> > > +                       if (l1_max_latency + l1_switch_latency > acceptable->l1)
> > > +                               link->aspm_capable &= ~ASPM_STATE_L1;
> > > +
> > > +                       l1_switch_latency += 1000;
> > > +               }
> > >
> > >                 link = link->parent;
> > >         }
> > > ----------------------
Bjorn Helgaas Sept. 23, 2020, 9:23 p.m. UTC | #8
On Wed, Sep 23, 2020 at 01:29:18AM +0200, Ian Kumlien wrote:
> On Wed, Sep 23, 2020 at 1:00 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > On Tue, Sep 22, 2020 at 11:02:35PM +0200, Ian Kumlien wrote:

> > > commit db3d9c4baf4ab177d87b5cd41f624f5901e7390f
> > > Author: Ian Kumlien <ian.kumlien@gmail.com>
> > > Date:   Sun Jul 26 16:01:15 2020 +0200
> > >
> > >     Use maximum latency when determining L1 ASPM
> > >
> > >     If it's not, we clear the link for the path that had too large latency.
> > >
> > >     Currently we check the maximum latency of upstream and downstream
> > >     per link, not the maximum for the path
> > >
> > >     This would work if all links have the same latency, but:
> > >     endpoint -> c -> b -> a -> root  (in the order we walk the path)
> > >
> > >     If c or b has the higest latency, it will not register
> > >
> > >     Fix this by maintaining the maximum latency value for the path
> > >
> > >     See this bugzilla for more information:
> > >     https://bugzilla.kernel.org/show_bug.cgi?id=208741
> > >
> > >     This fixes an issue for me where my desktops machines maximum bandwidth
> > >     for remote connections dropped from 933 MBit to ~40 MBit.
> > >
> > >     The bug became obvious once we enabled ASPM on all links:
> > >     66ff14e59e8a (PCI/ASPM: Allow ASPM on links to PCIe-to-PCI/PCI-X Bridges)
> >
> > I can't connect the dots here yet.  I don't see a PCIe-to-PCI/PCI-X
> > bridge in your lspci, so I can't figure out why this commit would make
> > a difference for you.
> >
> > IIUC, the problem device is 03:00.0, the Intel I211 NIC.  Here's the
> > path to it:
> >
> >   00:01.2 Root Port              to [bus 01-07]
> >   01:00.0 Switch Upstream Port   to [bus 02-07]
> >   02:03.0 Switch Downstream Port to [bus 03]
> >   03:00.0 Endpoint (Intel I211 NIC)
> >
> > And I think this is the relevant info:
> >
> >                                                     LnkCtl    LnkCtl
> >            ------DevCap-------  ----LnkCap-------  -Before-  -After--
> >   00:01.2                                L1 <32us       L1+       L1-
> >   01:00.0                                L1 <32us       L1+       L1-
> >   02:03.0                                L1 <32us       L1+       L1+
> >   03:00.0  L0s <512ns L1 <64us  L0s <2us L1 <16us  L0s- L1-  L0s- L1-
> >
> > The NIC says it can tolerate at most 512ns of L0s exit latency and at
> > most 64us of L1 exit latency.
> >
> > 02:03.0 doesn't support L0s, and the NIC itself can't exit L0s that
> > fast anyway (it can only do <2us), so L0s should be out of the picture
> > completely.
> >
> > Before your patch, apparently we (or BIOS) enabled L1 on the link from
> > 00:01.2 to 01:00.0, and partially enabled it on the link from 02:03.0
> > to 03:00.0.
> 
> According to the spec, this is managed by the OS - which was the
> change introduced...

BIOS frequently enables ASPM and, if CONFIG_PCIEASPM_DEFAULT=y, I
don't think Linux touches it unless a user requests it via sysfs.

What does "grep ASPM .config" tell you?

Boot with "pci=earlydump" and the dmesg will tell us what the BIOS
did.

If you do this in the unmodified kernel:

  # echo 1 > /sys/.../0000:03:00.0/l1_aspm

it should enable L1 for 03:00.0.  I'd like to know whether it actually
does, and whether the NIC behaves any differently when L1 is enabled
for the entire path instead of just the first three components.

If the above doesn't work, you should be able to enable ASPM manually:

  # setpci -s03:00.0 CAP_EXP+0x10.w=0x0042

> > It looks like we *should* be able to enable L1 on both links since the
> > exit latency should be <33us (first link starts exit at T=0, completes
> > by T=32; second link starts exit at T=1, completes by T=33), and
> > 03:00.0 can tolerate up to 64us.
> >
> > I guess the effect of your patch is to disable L1 on the 00:01.2 -
> > 01:00.0 link?  And that makes the NIC work better?  I am obviously
> > missing something because I don't understand why the patch does that
> > or why it works better.
> 
> It makes it work like normal again, like if i disable ASPM on the
> nic itself...

I wonder if ASPM is just broken on this device.
__e1000e_disable_aspm() mentions hardware errata on a different Intel
NIC.

> I don't know which value that reflects, up or down - since we do max
> of both values and
> it actually disables ASPM.
> 
> What we can see is that the first device that passes the threshold
> is 01:00.0

I don't understand this.  03:00.0 claims to be able to tolerate 64us
of L1 exit latency.  The path should only have <33us of latency, so
it's not even close to the 64us threshold.

> How can I read more data from PCIe without needing to add kprint...
> 
> This is what lspci uses apparently:
> #define  PCI_EXP_LNKCAP_L0S     0x07000 /* L0s Exit Latency */
> #define  PCI_EXP_LNKCAP_L1      0x38000 /* L1 Exit Latency */
> 
> But which latencies are those? up or down?

I think the idea in aspm.c that exit latencies depend on which
direction traffic is going is incorrect.  The components at the
upstream and downstream ends of the link may have different exit
latencies, of course.
Ian Kumlien Sept. 23, 2020, 9:36 p.m. UTC | #9
On Wed, Sep 23, 2020 at 11:23 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> On Wed, Sep 23, 2020 at 01:29:18AM +0200, Ian Kumlien wrote:
> > On Wed, Sep 23, 2020 at 1:00 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > > On Tue, Sep 22, 2020 at 11:02:35PM +0200, Ian Kumlien wrote:
>
> > > > commit db3d9c4baf4ab177d87b5cd41f624f5901e7390f
> > > > Author: Ian Kumlien <ian.kumlien@gmail.com>
> > > > Date:   Sun Jul 26 16:01:15 2020 +0200
> > > >
> > > >     Use maximum latency when determining L1 ASPM
> > > >
> > > >     If it's not, we clear the link for the path that had too large latency.
> > > >
> > > >     Currently we check the maximum latency of upstream and downstream
> > > >     per link, not the maximum for the path
> > > >
> > > >     This would work if all links have the same latency, but:
> > > >     endpoint -> c -> b -> a -> root  (in the order we walk the path)
> > > >
> > > >     If c or b has the higest latency, it will not register
> > > >
> > > >     Fix this by maintaining the maximum latency value for the path
> > > >
> > > >     See this bugzilla for more information:
> > > >     https://bugzilla.kernel.org/show_bug.cgi?id=208741
> > > >
> > > >     This fixes an issue for me where my desktops machines maximum bandwidth
> > > >     for remote connections dropped from 933 MBit to ~40 MBit.
> > > >
> > > >     The bug became obvious once we enabled ASPM on all links:
> > > >     66ff14e59e8a (PCI/ASPM: Allow ASPM on links to PCIe-to-PCI/PCI-X Bridges)
> > >
> > > I can't connect the dots here yet.  I don't see a PCIe-to-PCI/PCI-X
> > > bridge in your lspci, so I can't figure out why this commit would make
> > > a difference for you.
> > >
> > > IIUC, the problem device is 03:00.0, the Intel I211 NIC.  Here's the
> > > path to it:
> > >
> > >   00:01.2 Root Port              to [bus 01-07]
> > >   01:00.0 Switch Upstream Port   to [bus 02-07]
> > >   02:03.0 Switch Downstream Port to [bus 03]
> > >   03:00.0 Endpoint (Intel I211 NIC)
> > >
> > > And I think this is the relevant info:
> > >
> > >                                                     LnkCtl    LnkCtl
> > >            ------DevCap-------  ----LnkCap-------  -Before-  -After--
> > >   00:01.2                                L1 <32us       L1+       L1-
> > >   01:00.0                                L1 <32us       L1+       L1-
> > >   02:03.0                                L1 <32us       L1+       L1+
> > >   03:00.0  L0s <512ns L1 <64us  L0s <2us L1 <16us  L0s- L1-  L0s- L1-
> > >
> > > The NIC says it can tolerate at most 512ns of L0s exit latency and at
> > > most 64us of L1 exit latency.
> > >
> > > 02:03.0 doesn't support L0s, and the NIC itself can't exit L0s that
> > > fast anyway (it can only do <2us), so L0s should be out of the picture
> > > completely.
> > >
> > > Before your patch, apparently we (or BIOS) enabled L1 on the link from
> > > 00:01.2 to 01:00.0, and partially enabled it on the link from 02:03.0
> > > to 03:00.0.
> >
> > According to the spec, this is managed by the OS - which was the
> > change introduced...
>
> BIOS frequently enables ASPM and, if CONFIG_PCIEASPM_DEFAULT=y, I
> don't think Linux touches it unless a user requests it via sysfs.

https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=66ff14e59e8a

"7d715a6c1ae5 ("PCI: add PCI Express ASPM support") added the ability for
Linux to enable ASPM, but for some undocumented reason, it didn't enable
ASPM on links where the downstream component is a PCIe-to-PCI/PCI-X Bridge.

Remove this exclusion so we can enable ASPM on these links."

> What does "grep ASPM .config" tell you?

CONFIG_PCIEASPM=y
CONFIG_PCIEASPM_POWERSAVE=y

And all of this worked before the commit above.

> Boot with "pci=earlydump" and the dmesg will tell us what the BIOS
> did.
>
> If you do this in the unmodified kernel:
>
>   # echo 1 > /sys/.../0000:03:00.0/l1_aspm
>
> it should enable L1 for 03:00.0.  I'd like to know whether it actually
> does, and whether the NIC behaves any differently when L1 is enabled
> for the entire path instead of just the first three components.

With an unmodified kernel, I have ~40 mbit bandwidth.

> If the above doesn't work, you should be able to enable ASPM manually:
>
>   # setpci -s03:00.0 CAP_EXP+0x10.w=0x0042

I used something similar to disable ASPM, which made the nic work again.

> > > It looks like we *should* be able to enable L1 on both links since the
> > > exit latency should be <33us (first link starts exit at T=0, completes
> > > by T=32; second link starts exit at T=1, completes by T=33), and
> > > 03:00.0 can tolerate up to 64us.
> > >
> > > I guess the effect of your patch is to disable L1 on the 00:01.2 -
> > > 01:00.0 link?  And that makes the NIC work better?  I am obviously
> > > missing something because I don't understand why the patch does that
> > > or why it works better.
> >
> > It makes it work like normal again, like if i disable ASPM on the
> > nic itself...
>
> I wonder if ASPM is just broken on this device.
> __e1000e_disable_aspm() mentions hardware errata on a different Intel
> NIC.

I doubt it. The total time seems to surpass the time it can handle
with it's buffers.

Note that the nic is running with ASPM now, and it is working =)

> > I don't know which value that reflects, up or down - since we do max
> > of both values and
> > it actually disables ASPM.
> >
> > What we can see is that the first device that passes the threshold
> > is 01:00.0
>
> I don't understand this.  03:00.0 claims to be able to tolerate 64us
> of L1 exit latency.  The path should only have <33us of latency, so
> it's not even close to the 64us threshold.

Well, this is why i dumped the raw data - i don't know how lspci reads it

> > How can I read more data from PCIe without needing to add kprint...
> >
> > This is what lspci uses apparently:
> > #define  PCI_EXP_LNKCAP_L0S     0x07000 /* L0s Exit Latency */
> > #define  PCI_EXP_LNKCAP_L1      0x38000 /* L1 Exit Latency */
> >
> > But which latencies are those? up or down?
>
> I think the idea in aspm.c that exit latencies depend on which
> direction traffic is going is incorrect.  The components at the
> upstream and downstream ends of the link may have different exit
> latencies, of course.

Yes, and the max latency is the maximum time the device can buffer before
the link has to be up, so maximum time to establish link must be
max(up, down) right?
Ian Kumlien Sept. 23, 2020, 9:48 p.m. UTC | #10
Added dmesg with pci=earlydump on the bugzilla

https://bugzilla.kernel.org/attachment.cgi?id=292601

On Wed, Sep 23, 2020 at 11:36 PM Ian Kumlien <ian.kumlien@gmail.com> wrote:
>
> On Wed, Sep 23, 2020 at 11:23 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
> >
> > On Wed, Sep 23, 2020 at 01:29:18AM +0200, Ian Kumlien wrote:
> > > On Wed, Sep 23, 2020 at 1:00 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > > > On Tue, Sep 22, 2020 at 11:02:35PM +0200, Ian Kumlien wrote:
> >
> > > > > commit db3d9c4baf4ab177d87b5cd41f624f5901e7390f
> > > > > Author: Ian Kumlien <ian.kumlien@gmail.com>
> > > > > Date:   Sun Jul 26 16:01:15 2020 +0200
> > > > >
> > > > >     Use maximum latency when determining L1 ASPM
> > > > >
> > > > >     If it's not, we clear the link for the path that had too large latency.
> > > > >
> > > > >     Currently we check the maximum latency of upstream and downstream
> > > > >     per link, not the maximum for the path
> > > > >
> > > > >     This would work if all links have the same latency, but:
> > > > >     endpoint -> c -> b -> a -> root  (in the order we walk the path)
> > > > >
> > > > >     If c or b has the higest latency, it will not register
> > > > >
> > > > >     Fix this by maintaining the maximum latency value for the path
> > > > >
> > > > >     See this bugzilla for more information:
> > > > >     https://bugzilla.kernel.org/show_bug.cgi?id=208741
> > > > >
> > > > >     This fixes an issue for me where my desktops machines maximum bandwidth
> > > > >     for remote connections dropped from 933 MBit to ~40 MBit.
> > > > >
> > > > >     The bug became obvious once we enabled ASPM on all links:
> > > > >     66ff14e59e8a (PCI/ASPM: Allow ASPM on links to PCIe-to-PCI/PCI-X Bridges)
> > > >
> > > > I can't connect the dots here yet.  I don't see a PCIe-to-PCI/PCI-X
> > > > bridge in your lspci, so I can't figure out why this commit would make
> > > > a difference for you.
> > > >
> > > > IIUC, the problem device is 03:00.0, the Intel I211 NIC.  Here's the
> > > > path to it:
> > > >
> > > >   00:01.2 Root Port              to [bus 01-07]
> > > >   01:00.0 Switch Upstream Port   to [bus 02-07]
> > > >   02:03.0 Switch Downstream Port to [bus 03]
> > > >   03:00.0 Endpoint (Intel I211 NIC)
> > > >
> > > > And I think this is the relevant info:
> > > >
> > > >                                                     LnkCtl    LnkCtl
> > > >            ------DevCap-------  ----LnkCap-------  -Before-  -After--
> > > >   00:01.2                                L1 <32us       L1+       L1-
> > > >   01:00.0                                L1 <32us       L1+       L1-
> > > >   02:03.0                                L1 <32us       L1+       L1+
> > > >   03:00.0  L0s <512ns L1 <64us  L0s <2us L1 <16us  L0s- L1-  L0s- L1-
> > > >
> > > > The NIC says it can tolerate at most 512ns of L0s exit latency and at
> > > > most 64us of L1 exit latency.
> > > >
> > > > 02:03.0 doesn't support L0s, and the NIC itself can't exit L0s that
> > > > fast anyway (it can only do <2us), so L0s should be out of the picture
> > > > completely.
> > > >
> > > > Before your patch, apparently we (or BIOS) enabled L1 on the link from
> > > > 00:01.2 to 01:00.0, and partially enabled it on the link from 02:03.0
> > > > to 03:00.0.
> > >
> > > According to the spec, this is managed by the OS - which was the
> > > change introduced...
> >
> > BIOS frequently enables ASPM and, if CONFIG_PCIEASPM_DEFAULT=y, I
> > don't think Linux touches it unless a user requests it via sysfs.
>
> https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=66ff14e59e8a
>
> "7d715a6c1ae5 ("PCI: add PCI Express ASPM support") added the ability for
> Linux to enable ASPM, but for some undocumented reason, it didn't enable
> ASPM on links where the downstream component is a PCIe-to-PCI/PCI-X Bridge.
>
> Remove this exclusion so we can enable ASPM on these links."
>
> > What does "grep ASPM .config" tell you?
>
> CONFIG_PCIEASPM=y
> CONFIG_PCIEASPM_POWERSAVE=y
>
> And all of this worked before the commit above.
>
> > Boot with "pci=earlydump" and the dmesg will tell us what the BIOS
> > did.
> >
> > If you do this in the unmodified kernel:
> >
> >   # echo 1 > /sys/.../0000:03:00.0/l1_aspm
> >
> > it should enable L1 for 03:00.0.  I'd like to know whether it actually
> > does, and whether the NIC behaves any differently when L1 is enabled
> > for the entire path instead of just the first three components.
>
> With an unmodified kernel, I have ~40 mbit bandwidth.
>
> > If the above doesn't work, you should be able to enable ASPM manually:
> >
> >   # setpci -s03:00.0 CAP_EXP+0x10.w=0x0042
>
> I used something similar to disable ASPM, which made the nic work again.
>
> > > > It looks like we *should* be able to enable L1 on both links since the
> > > > exit latency should be <33us (first link starts exit at T=0, completes
> > > > by T=32; second link starts exit at T=1, completes by T=33), and
> > > > 03:00.0 can tolerate up to 64us.
> > > >
> > > > I guess the effect of your patch is to disable L1 on the 00:01.2 -
> > > > 01:00.0 link?  And that makes the NIC work better?  I am obviously
> > > > missing something because I don't understand why the patch does that
> > > > or why it works better.
> > >
> > > It makes it work like normal again, like if i disable ASPM on the
> > > nic itself...
> >
> > I wonder if ASPM is just broken on this device.
> > __e1000e_disable_aspm() mentions hardware errata on a different Intel
> > NIC.
>
> I doubt it. The total time seems to surpass the time it can handle
> with it's buffers.
>
> Note that the nic is running with ASPM now, and it is working =)
>
> > > I don't know which value that reflects, up or down - since we do max
> > > of both values and
> > > it actually disables ASPM.
> > >
> > > What we can see is that the first device that passes the threshold
> > > is 01:00.0
> >
> > I don't understand this.  03:00.0 claims to be able to tolerate 64us
> > of L1 exit latency.  The path should only have <33us of latency, so
> > it's not even close to the 64us threshold.
>
> Well, this is why i dumped the raw data - i don't know how lspci reads it
>
> > > How can I read more data from PCIe without needing to add kprint...
> > >
> > > This is what lspci uses apparently:
> > > #define  PCI_EXP_LNKCAP_L0S     0x07000 /* L0s Exit Latency */
> > > #define  PCI_EXP_LNKCAP_L1      0x38000 /* L1 Exit Latency */
> > >
> > > But which latencies are those? up or down?
> >
> > I think the idea in aspm.c that exit latencies depend on which
> > direction traffic is going is incorrect.  The components at the
> > upstream and downstream ends of the link may have different exit
> > latencies, of course.
>
> Yes, and the max latency is the maximum time the device can buffer before
> the link has to be up, so maximum time to establish link must be
> max(up, down) right?
Bjorn Helgaas Sept. 24, 2020, 4:24 p.m. UTC | #11
On Wed, Sep 23, 2020 at 11:36:00PM +0200, Ian Kumlien wrote:

> https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=66ff14e59e8a
> 
> "7d715a6c1ae5 ("PCI: add PCI Express ASPM support") added the ability for
> Linux to enable ASPM, but for some undocumented reason, it didn't enable
> ASPM on links where the downstream component is a PCIe-to-PCI/PCI-X Bridge.
> 
> Remove this exclusion so we can enable ASPM on these links."
> ...

> And all of this worked before the commit above.

OK, really sorry, I got myself totally confused here, and I need to
start over from scratch.  Correct me when I go off the rails.

You're running 5.8.11+, and you get ~40 Mbit/s on the Intel I211 NIC.
Reverting 66ff14e59e8a ("PCI/ASPM: Allow ASPM on links to
PCIe-to-PCI/PCI-X Bridges") gets your bandwidth up to the 900+ Mbit/s
you expect.

66ff14e59e8a only makes a difference if you have a PCIe-to-PCI/PCI-X
Bridge (PCI_EXP_TYPE_PCI_BRIDGE) in your system.  But from your lspci
and pci=earlydump output, I don't see any of those.  The only bridges
I see are:

[    0.810346] pci 0000:00:01.2: [1022:1483] type 01 Root Port to [bus 01-07]
[    0.810587] pci 0000:00:03.1: [1022:1483] type 01 Root Port to [bus 08]
[    0.810587] pci 0000:00:03.2: [1022:1483] type 01 Root Port to [bus 09]
[    0.810837] pci 0000:00:07.1: [1022:1484] type 01 Root Port to [bus 0a]
[    0.811587] pci 0000:00:08.1: [1022:1484] type 01 Root Port to [bus 0b]
[    0.812586] pci 0000:01:00.0: [1022:57ad] type 01 Upstream Port to [bus 02-07]
[    0.812629] pci 0000:02:03.0: [1022:57a3] type 01 Downstream Port to [bus 03]
[    0.813584] pci 0000:02:04.0: [1022:57a3] type 01 Downstream Port to [bus 04]
[    0.814584] pci 0000:02:08.0: [1022:57a4] type 01 Downstream Port to [bus 05]
[    0.815584] pci 0000:02:09.0: [1022:57a4] type 01 Downstream Port to [bus 06]
[    0.815584] pci 0000:02:0a.0: [1022:57a4] type 01 Downstream Port to [bus 07]

So I'm lost right off the bat.  You have no PCI_EXP_TYPE_PCI_BRIDGE
device, so how can 66ff14e59e8a make a difference for you?

Can you add a printk there, e.g.,

        list_for_each_entry(child, &linkbus->devices, bus_list) {
                if (pci_pcie_type(child) == PCI_EXP_TYPE_PCI_BRIDGE) {
  +                     pci_info(child, "PCIe-to-PCI bridge, disabling ASPM\n");
                        link->aspm_disable = ASPM_STATE_ALL;
                        break;
                }
        }
Ian Kumlien Sept. 25, 2020, 8:06 a.m. UTC | #12
Ok so it might not be related to that change then...

but:
 ID] Interval           Transfer     Bitrate
[  5]   0.00-1.00   sec  4.34 MBytes  36.4 Mbits/sec
[  5]   1.00-2.00   sec  7.11 MBytes  59.6 Mbits/sec
[  5]   2.00-3.00   sec  4.76 MBytes  39.9 Mbits/sec
[  5]   3.00-4.00   sec  4.13 MBytes  34.7 Mbits/sec
[  5]   4.00-5.00   sec  4.19 MBytes  35.1 Mbits/sec
[  5]   5.00-6.00   sec  5.70 MBytes  47.8 Mbits/sec
[  5]   6.00-7.00   sec  5.96 MBytes  50.0 Mbits/sec
[  5]   7.00-8.00   sec  4.17 MBytes  35.0 Mbits/sec
[  5]   8.00-9.00   sec  4.14 MBytes  34.7 Mbits/sec
[  5]   9.00-10.00  sec  4.08 MBytes  34.3 Mbits/sec
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bitrate
[  5]   0.00-10.00  sec  48.6 MBytes  40.8 Mbits/sec                  receiver

The issue persists

On Thu, Sep 24, 2020 at 6:24 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> On Wed, Sep 23, 2020 at 11:36:00PM +0200, Ian Kumlien wrote:
>
> > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=66ff14e59e8a
> >
> > "7d715a6c1ae5 ("PCI: add PCI Express ASPM support") added the ability for
> > Linux to enable ASPM, but for some undocumented reason, it didn't enable
> > ASPM on links where the downstream component is a PCIe-to-PCI/PCI-X Bridge.
> >
> > Remove this exclusion so we can enable ASPM on these links."
> > ...
>
> > And all of this worked before the commit above.
>
> OK, really sorry, I got myself totally confused here, and I need to
> start over from scratch.  Correct me when I go off the rails.
>
> You're running 5.8.11+, and you get ~40 Mbit/s on the Intel I211 NIC.
> Reverting 66ff14e59e8a ("PCI/ASPM: Allow ASPM on links to
> PCIe-to-PCI/PCI-X Bridges") gets your bandwidth up to the 900+ Mbit/s
> you expect.
>
> 66ff14e59e8a only makes a difference if you have a PCIe-to-PCI/PCI-X
> Bridge (PCI_EXP_TYPE_PCI_BRIDGE) in your system.  But from your lspci
> and pci=earlydump output, I don't see any of those.  The only bridges
> I see are:
>
> [    0.810346] pci 0000:00:01.2: [1022:1483] type 01 Root Port to [bus 01-07]
> [    0.810587] pci 0000:00:03.1: [1022:1483] type 01 Root Port to [bus 08]
> [    0.810587] pci 0000:00:03.2: [1022:1483] type 01 Root Port to [bus 09]
> [    0.810837] pci 0000:00:07.1: [1022:1484] type 01 Root Port to [bus 0a]
> [    0.811587] pci 0000:00:08.1: [1022:1484] type 01 Root Port to [bus 0b]
> [    0.812586] pci 0000:01:00.0: [1022:57ad] type 01 Upstream Port to [bus 02-07]
> [    0.812629] pci 0000:02:03.0: [1022:57a3] type 01 Downstream Port to [bus 03]
> [    0.813584] pci 0000:02:04.0: [1022:57a3] type 01 Downstream Port to [bus 04]
> [    0.814584] pci 0000:02:08.0: [1022:57a4] type 01 Downstream Port to [bus 05]
> [    0.815584] pci 0000:02:09.0: [1022:57a4] type 01 Downstream Port to [bus 06]
> [    0.815584] pci 0000:02:0a.0: [1022:57a4] type 01 Downstream Port to [bus 07]
>
> So I'm lost right off the bat.  You have no PCI_EXP_TYPE_PCI_BRIDGE
> device, so how can 66ff14e59e8a make a difference for you?
>
> Can you add a printk there, e.g.,
>
>         list_for_each_entry(child, &linkbus->devices, bus_list) {
>                 if (pci_pcie_type(child) == PCI_EXP_TYPE_PCI_BRIDGE) {
>   +                     pci_info(child, "PCIe-to-PCI bridge, disabling ASPM\n");
>                         link->aspm_disable = ASPM_STATE_ALL;
>                         break;
>                 }
>         }
>

Patch
diff mbox series

diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index b17e5ffd31b1..bc512e217258 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -434,7 +434,8 @@  static void pcie_get_aspm_reg(struct pci_dev *pdev,
 
 static void pcie_aspm_check_latency(struct pci_dev *endpoint)
 {
-	u32 latency, l1_switch_latency = 0;
+	u32 latency, l1_max_latency = 0, l1_switch_latency = 0,
+		l0s_latency_up = 0, l0s_latency_dw = 0;
 	struct aspm_latency *acceptable;
 	struct pcie_link_state *link;
 
@@ -447,15 +448,22 @@  static void pcie_aspm_check_latency(struct pci_dev *endpoint)
 	acceptable = &link->acceptable[PCI_FUNC(endpoint->devfn)];
 
 	while (link) {
-		/* Check upstream direction L0s latency */
-		if ((link->aspm_capable & ASPM_STATE_L0S_UP) &&
-		    (link->latency_up.l0s > acceptable->l0s))
-			link->aspm_capable &= ~ASPM_STATE_L0S_UP;
-
-		/* Check downstream direction L0s latency */
-		if ((link->aspm_capable & ASPM_STATE_L0S_DW) &&
-		    (link->latency_dw.l0s > acceptable->l0s))
-			link->aspm_capable &= ~ASPM_STATE_L0S_DW;
+		if (link->aspm_capable & ASPM_STATE_L0S) {
+			/* Check upstream direction L0s latency */
+			if (link->aspm_capable & ASPM_STATE_L0S_UP) {
+				l0s_latency_up += link->latency_up.l0s;
+				if (l0s_latency_up > acceptable->l0s)
+					link->aspm_capable &= ~ASPM_STATE_L0S_UP;
+			}
+
+			/* Check downstream direction L0s latency */
+			if (link->aspm_capable & ASPM_STATE_L0S_DW) {
+				l0s_latency_dw += link->latency_dw.l0s;
+				if (l0s_latency_dw > acceptable->l0s)
+					link->aspm_capable &= ~ASPM_STATE_L0S_DW;
+			}
+		}
+
 		/*
 		 * Check L1 latency.
 		 * Every switch on the path to root complex need 1
@@ -469,11 +477,14 @@  static void pcie_aspm_check_latency(struct pci_dev *endpoint)
 		 * L1 exit latencies advertised by a device include L1
 		 * substate latencies (and hence do not do any check).
 		 */
-		latency = max_t(u32, link->latency_up.l1, link->latency_dw.l1);
-		if ((link->aspm_capable & ASPM_STATE_L1) &&
-		    (latency + l1_switch_latency > acceptable->l1))
-			link->aspm_capable &= ~ASPM_STATE_L1;
-		l1_switch_latency += 1000;
+		if (link->aspm_capable & ASPM_STATE_L1) {
+			latency = max_t(u32, link->latency_up.l1, link->latency_dw.l1);
+			l1_max_latency = max_t(u32, latency, l1_max_latency);
+			if (l1_max_latency + l1_switch_latency > acceptable->l1)
+				link->aspm_capable &= ~ASPM_STATE_L1;
+
+			l1_switch_latency += 1000;
+		}
 
 		link = link->parent;
 	}