diff mbox series

PCI: Fix active state requirement in PME polling

Message ID 20240123185548.1040096-1-alex.williamson@redhat.com (mailing list archive)
State Accepted
Delegated to: Bjorn Helgaas
Headers show
Series PCI: Fix active state requirement in PME polling | expand

Commit Message

Alex Williamson Jan. 23, 2024, 6:55 p.m. UTC
The commit noted in fixes added a bogus requirement that runtime PM
managed devices need to be in the RPM_ACTIVE state for PME polling.
In fact, only devices in low power states should be polled.

However there's still a requirement that the device config space must
be accessible, which has implications for both the current state of
the polled device and the parent bridge, when present.  It's not
sufficient to assume the bridge remains in D0 and cases have been
observed where the bridge passes the D0 test, but the PM state
indicates RPM_SUSPENDING and config space of the polled device becomes
inaccessible during pci_pme_wakeup().

Therefore, since the bridge is already effectively required to be in
the RPM_ACTIVE state, formalize this in the code and elevate the PM
usage count to maintain the state while polling the subordinate
device.

Cc: Lukas Wunner <lukas@wunner.de>
Cc: Mika Westerberg <mika.westerberg@linux.intel.com>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Fixes: d3fcd7360338 ("PCI: Fix runtime PM race with PME polling")
Reported-by: Sanath S <sanath.s@amd.com>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218360
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/pci/pci.c | 37 ++++++++++++++++++++++---------------
 1 file changed, 22 insertions(+), 15 deletions(-)

Comments

Rafael J. Wysocki Jan. 23, 2024, 7:40 p.m. UTC | #1
On Tue, Jan 23, 2024 at 7:56 PM Alex Williamson
<alex.williamson@redhat.com> wrote:
>
> The commit noted in fixes added a bogus requirement that runtime PM
> managed devices need to be in the RPM_ACTIVE state for PME polling.
> In fact, only devices in low power states should be polled.
>
> However there's still a requirement that the device config space must
> be accessible, which has implications for both the current state of
> the polled device and the parent bridge, when present.  It's not
> sufficient to assume the bridge remains in D0 and cases have been
> observed where the bridge passes the D0 test, but the PM state
> indicates RPM_SUSPENDING and config space of the polled device becomes
> inaccessible during pci_pme_wakeup().
>
> Therefore, since the bridge is already effectively required to be in
> the RPM_ACTIVE state, formalize this in the code and elevate the PM
> usage count to maintain the state while polling the subordinate
> device.
>
> Cc: Lukas Wunner <lukas@wunner.de>
> Cc: Mika Westerberg <mika.westerberg@linux.intel.com>
> Cc: Rafael J. Wysocki <rafael@kernel.org>
> Fixes: d3fcd7360338 ("PCI: Fix runtime PM race with PME polling")
> Reported-by: Sanath S <sanath.s@amd.com>
> Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218360
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
>  drivers/pci/pci.c | 37 ++++++++++++++++++++++---------------
>  1 file changed, 22 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index bdbf8a94b4d0..764d7c977ef4 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -2433,29 +2433,36 @@ static void pci_pme_list_scan(struct work_struct *work)
>                 if (pdev->pme_poll) {
>                         struct pci_dev *bridge = pdev->bus->self;
>                         struct device *dev = &pdev->dev;
> -                       int pm_status;
> +                       struct device *bdev = bridge ? &bridge->dev : NULL;
> +                       int bref = 0;
>
>                         /*
> -                        * If bridge is in low power state, the
> -                        * configuration space of subordinate devices
> -                        * may be not accessible
> +                        * If we have a bridge, it should be in an active/D0
> +                        * state or the configuration space of subordinate
> +                        * devices may not be accessible or stable over the
> +                        * course of the call.
>                          */
> -                       if (bridge && bridge->current_state != PCI_D0)
> -                               continue;
> +                       if (bdev) {
> +                               bref = pm_runtime_get_if_active(bdev, true);
> +                               if (!bref)

I would check bref <= 0 here.

> +                                       continue;
> +
> +                               if (bridge->current_state != PCI_D0)

Isn't the power state guaranteed to be PCI_D0 at this point?  If it
isn't, then why?

> +                                       goto put_bridge;
> +                       }
>
>                         /*
> -                        * If the device is in a low power state it
> -                        * should not be polled either.
> +                        * The device itself should be suspended but config
> +                        * space must be accessible, therefore it cannot be in
> +                        * D3cold.
>                          */
> -                       pm_status = pm_runtime_get_if_active(dev, true);
> -                       if (!pm_status)
> -                               continue;
> -
> -                       if (pdev->current_state != PCI_D3cold)
> +                       if (pm_runtime_suspended(dev) &&
> +                           pdev->current_state != PCI_D3cold)
>                                 pci_pme_wakeup(pdev, NULL);
>
> -                       if (pm_status > 0)
> -                               pm_runtime_put(dev);
> +put_bridge:
> +                       if (bref > 0)
> +                               pm_runtime_put(bdev);
>                 } else {
>                         list_del(&pme_dev->list);
>                         kfree(pme_dev);
> --
Alex Williamson Jan. 23, 2024, 7:50 p.m. UTC | #2
On Tue, 23 Jan 2024 20:40:32 +0100
"Rafael J. Wysocki" <rafael@kernel.org> wrote:

> On Tue, Jan 23, 2024 at 7:56 PM Alex Williamson
> <alex.williamson@redhat.com> wrote:
> >
> > The commit noted in fixes added a bogus requirement that runtime PM
> > managed devices need to be in the RPM_ACTIVE state for PME polling.
> > In fact, only devices in low power states should be polled.
> >
> > However there's still a requirement that the device config space must
> > be accessible, which has implications for both the current state of
> > the polled device and the parent bridge, when present.  It's not
> > sufficient to assume the bridge remains in D0 and cases have been
> > observed where the bridge passes the D0 test, but the PM state
> > indicates RPM_SUSPENDING and config space of the polled device becomes
> > inaccessible during pci_pme_wakeup().
> >
> > Therefore, since the bridge is already effectively required to be in
> > the RPM_ACTIVE state, formalize this in the code and elevate the PM
> > usage count to maintain the state while polling the subordinate
> > device.
> >
> > Cc: Lukas Wunner <lukas@wunner.de>
> > Cc: Mika Westerberg <mika.westerberg@linux.intel.com>
> > Cc: Rafael J. Wysocki <rafael@kernel.org>
> > Fixes: d3fcd7360338 ("PCI: Fix runtime PM race with PME polling")
> > Reported-by: Sanath S <sanath.s@amd.com>
> > Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218360
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > ---
> >  drivers/pci/pci.c | 37 ++++++++++++++++++++++---------------
> >  1 file changed, 22 insertions(+), 15 deletions(-)
> >
> > diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> > index bdbf8a94b4d0..764d7c977ef4 100644
> > --- a/drivers/pci/pci.c
> > +++ b/drivers/pci/pci.c
> > @@ -2433,29 +2433,36 @@ static void pci_pme_list_scan(struct work_struct *work)
> >                 if (pdev->pme_poll) {
> >                         struct pci_dev *bridge = pdev->bus->self;
> >                         struct device *dev = &pdev->dev;
> > -                       int pm_status;
> > +                       struct device *bdev = bridge ? &bridge->dev : NULL;
> > +                       int bref = 0;
> >
> >                         /*
> > -                        * If bridge is in low power state, the
> > -                        * configuration space of subordinate devices
> > -                        * may be not accessible
> > +                        * If we have a bridge, it should be in an active/D0
> > +                        * state or the configuration space of subordinate
> > +                        * devices may not be accessible or stable over the
> > +                        * course of the call.
> >                          */
> > -                       if (bridge && bridge->current_state != PCI_D0)
> > -                               continue;
> > +                       if (bdev) {
> > +                               bref = pm_runtime_get_if_active(bdev, true);
> > +                               if (!bref)  
> 
> I would check bref <= 0 here.
> 
> > +                                       continue;
> > +
> > +                               if (bridge->current_state != PCI_D0)  
> 
> Isn't the power state guaranteed to be PCI_D0 at this point?  If it
> isn't, then why?

Both of these seem necessary to support !CONFIG_PM, where bref would be
-EINVAL and provides no indication of the current_state.  Is that
incorrect?  Thanks,

Alex


> > +                                       goto put_bridge;
> > +                       }
> >
> >                         /*
> > -                        * If the device is in a low power state it
> > -                        * should not be polled either.
> > +                        * The device itself should be suspended but config
> > +                        * space must be accessible, therefore it cannot be in
> > +                        * D3cold.
> >                          */
> > -                       pm_status = pm_runtime_get_if_active(dev, true);
> > -                       if (!pm_status)
> > -                               continue;
> > -
> > -                       if (pdev->current_state != PCI_D3cold)
> > +                       if (pm_runtime_suspended(dev) &&
> > +                           pdev->current_state != PCI_D3cold)
> >                                 pci_pme_wakeup(pdev, NULL);
> >
> > -                       if (pm_status > 0)
> > -                               pm_runtime_put(dev);
> > +put_bridge:
> > +                       if (bref > 0)
> > +                               pm_runtime_put(bdev);
> >                 } else {
> >                         list_del(&pme_dev->list);
> >                         kfree(pme_dev);
> > --  
>
Rafael J. Wysocki Jan. 23, 2024, 7:59 p.m. UTC | #3
On Tue, Jan 23, 2024 at 8:51 PM Alex Williamson
<alex.williamson@redhat.com> wrote:
>
> On Tue, 23 Jan 2024 20:40:32 +0100
> "Rafael J. Wysocki" <rafael@kernel.org> wrote:
>
> > On Tue, Jan 23, 2024 at 7:56 PM Alex Williamson
> > <alex.williamson@redhat.com> wrote:
> > >
> > > The commit noted in fixes added a bogus requirement that runtime PM
> > > managed devices need to be in the RPM_ACTIVE state for PME polling.
> > > In fact, only devices in low power states should be polled.
> > >
> > > However there's still a requirement that the device config space must
> > > be accessible, which has implications for both the current state of
> > > the polled device and the parent bridge, when present.  It's not
> > > sufficient to assume the bridge remains in D0 and cases have been
> > > observed where the bridge passes the D0 test, but the PM state
> > > indicates RPM_SUSPENDING and config space of the polled device becomes
> > > inaccessible during pci_pme_wakeup().
> > >
> > > Therefore, since the bridge is already effectively required to be in
> > > the RPM_ACTIVE state, formalize this in the code and elevate the PM
> > > usage count to maintain the state while polling the subordinate
> > > device.
> > >
> > > Cc: Lukas Wunner <lukas@wunner.de>
> > > Cc: Mika Westerberg <mika.westerberg@linux.intel.com>
> > > Cc: Rafael J. Wysocki <rafael@kernel.org>
> > > Fixes: d3fcd7360338 ("PCI: Fix runtime PM race with PME polling")
> > > Reported-by: Sanath S <sanath.s@amd.com>
> > > Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218360
> > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > ---
> > >  drivers/pci/pci.c | 37 ++++++++++++++++++++++---------------
> > >  1 file changed, 22 insertions(+), 15 deletions(-)
> > >
> > > diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> > > index bdbf8a94b4d0..764d7c977ef4 100644
> > > --- a/drivers/pci/pci.c
> > > +++ b/drivers/pci/pci.c
> > > @@ -2433,29 +2433,36 @@ static void pci_pme_list_scan(struct work_struct *work)
> > >                 if (pdev->pme_poll) {
> > >                         struct pci_dev *bridge = pdev->bus->self;
> > >                         struct device *dev = &pdev->dev;
> > > -                       int pm_status;
> > > +                       struct device *bdev = bridge ? &bridge->dev : NULL;
> > > +                       int bref = 0;
> > >
> > >                         /*
> > > -                        * If bridge is in low power state, the
> > > -                        * configuration space of subordinate devices
> > > -                        * may be not accessible
> > > +                        * If we have a bridge, it should be in an active/D0
> > > +                        * state or the configuration space of subordinate
> > > +                        * devices may not be accessible or stable over the
> > > +                        * course of the call.
> > >                          */
> > > -                       if (bridge && bridge->current_state != PCI_D0)
> > > -                               continue;
> > > +                       if (bdev) {
> > > +                               bref = pm_runtime_get_if_active(bdev, true);
> > > +                               if (!bref)
> >
> > I would check bref <= 0 here.
> >
> > > +                                       continue;
> > > +
> > > +                               if (bridge->current_state != PCI_D0)
> >
> > Isn't the power state guaranteed to be PCI_D0 at this point?  If it
> > isn't, then why?
>
> Both of these seem necessary to support !CONFIG_PM, where bref would be
> -EINVAL and provides no indication of the current_state.  Is that
> incorrect?  Thanks,

Well, CONFIG_PCIE_PME depends on CONFIG_PM, so I'm not sure how
dev->pme_poll can be set without it.
Alex Williamson Jan. 23, 2024, 8:39 p.m. UTC | #4
On Tue, 23 Jan 2024 20:59:50 +0100
"Rafael J. Wysocki" <rafael@kernel.org> wrote:

> On Tue, Jan 23, 2024 at 8:51 PM Alex Williamson
> <alex.williamson@redhat.com> wrote:
> >
> > On Tue, 23 Jan 2024 20:40:32 +0100
> > "Rafael J. Wysocki" <rafael@kernel.org> wrote:
> >  
> > > On Tue, Jan 23, 2024 at 7:56 PM Alex Williamson
> > > <alex.williamson@redhat.com> wrote:  
> > > >
> > > > The commit noted in fixes added a bogus requirement that runtime PM
> > > > managed devices need to be in the RPM_ACTIVE state for PME polling.
> > > > In fact, only devices in low power states should be polled.
> > > >
> > > > However there's still a requirement that the device config space must
> > > > be accessible, which has implications for both the current state of
> > > > the polled device and the parent bridge, when present.  It's not
> > > > sufficient to assume the bridge remains in D0 and cases have been
> > > > observed where the bridge passes the D0 test, but the PM state
> > > > indicates RPM_SUSPENDING and config space of the polled device becomes
> > > > inaccessible during pci_pme_wakeup().
> > > >
> > > > Therefore, since the bridge is already effectively required to be in
> > > > the RPM_ACTIVE state, formalize this in the code and elevate the PM
> > > > usage count to maintain the state while polling the subordinate
> > > > device.
> > > >
> > > > Cc: Lukas Wunner <lukas@wunner.de>
> > > > Cc: Mika Westerberg <mika.westerberg@linux.intel.com>
> > > > Cc: Rafael J. Wysocki <rafael@kernel.org>
> > > > Fixes: d3fcd7360338 ("PCI: Fix runtime PM race with PME polling")
> > > > Reported-by: Sanath S <sanath.s@amd.com>
> > > > Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218360
> > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > ---
> > > >  drivers/pci/pci.c | 37 ++++++++++++++++++++++---------------
> > > >  1 file changed, 22 insertions(+), 15 deletions(-)
> > > >
> > > > diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> > > > index bdbf8a94b4d0..764d7c977ef4 100644
> > > > --- a/drivers/pci/pci.c
> > > > +++ b/drivers/pci/pci.c
> > > > @@ -2433,29 +2433,36 @@ static void pci_pme_list_scan(struct work_struct *work)
> > > >                 if (pdev->pme_poll) {
> > > >                         struct pci_dev *bridge = pdev->bus->self;
> > > >                         struct device *dev = &pdev->dev;
> > > > -                       int pm_status;
> > > > +                       struct device *bdev = bridge ? &bridge->dev : NULL;
> > > > +                       int bref = 0;
> > > >
> > > >                         /*
> > > > -                        * If bridge is in low power state, the
> > > > -                        * configuration space of subordinate devices
> > > > -                        * may be not accessible
> > > > +                        * If we have a bridge, it should be in an active/D0
> > > > +                        * state or the configuration space of subordinate
> > > > +                        * devices may not be accessible or stable over the
> > > > +                        * course of the call.
> > > >                          */
> > > > -                       if (bridge && bridge->current_state != PCI_D0)
> > > > -                               continue;
> > > > +                       if (bdev) {
> > > > +                               bref = pm_runtime_get_if_active(bdev, true);
> > > > +                               if (!bref)  
> > >
> > > I would check bref <= 0 here.
> > >  
> > > > +                                       continue;
> > > > +
> > > > +                               if (bridge->current_state != PCI_D0)  
> > >
> > > Isn't the power state guaranteed to be PCI_D0 at this point?  If it
> > > isn't, then why?  
> >
> > Both of these seem necessary to support !CONFIG_PM, where bref would be
> > -EINVAL and provides no indication of the current_state.  Is that
> > incorrect?  Thanks,  
> 
> Well, CONFIG_PCIE_PME depends on CONFIG_PM, so I'm not sure how
> dev->pme_poll can be set without it.

I only see that drivers/pci/pci.c:pci_pm_init() sets pme_poll true and
I'm not spotting a dependency on either PCIE_PME or PM to get there.  I
see a few places where pme.c, governed by PCIE_PME, can set pme_poll
false though.

It's also not clear to me that we should skip scanning a device if
pm_runtime_get_if_active() returns -EINVAL for the bridge due to
power.disable_depth.  If runtime PM isn't enabled on the bridge,
shouldn't we be able to test current_state and assume it won't change?
Thanks,

Alex
Rafael J. Wysocki Jan. 23, 2024, 10:33 p.m. UTC | #5
On Tue, Jan 23, 2024 at 9:39 PM Alex Williamson
<alex.williamson@redhat.com> wrote:
>
> On Tue, 23 Jan 2024 20:59:50 +0100
> "Rafael J. Wysocki" <rafael@kernel.org> wrote:
>
> > On Tue, Jan 23, 2024 at 8:51 PM Alex Williamson
> > <alex.williamson@redhat.com> wrote:
> > >
> > > On Tue, 23 Jan 2024 20:40:32 +0100
> > > "Rafael J. Wysocki" <rafael@kernel.org> wrote:
> > >
> > > > On Tue, Jan 23, 2024 at 7:56 PM Alex Williamson
> > > > <alex.williamson@redhat.com> wrote:
> > > > >
> > > > > The commit noted in fixes added a bogus requirement that runtime PM
> > > > > managed devices need to be in the RPM_ACTIVE state for PME polling.
> > > > > In fact, only devices in low power states should be polled.
> > > > >
> > > > > However there's still a requirement that the device config space must
> > > > > be accessible, which has implications for both the current state of
> > > > > the polled device and the parent bridge, when present.  It's not
> > > > > sufficient to assume the bridge remains in D0 and cases have been
> > > > > observed where the bridge passes the D0 test, but the PM state
> > > > > indicates RPM_SUSPENDING and config space of the polled device becomes
> > > > > inaccessible during pci_pme_wakeup().
> > > > >
> > > > > Therefore, since the bridge is already effectively required to be in
> > > > > the RPM_ACTIVE state, formalize this in the code and elevate the PM
> > > > > usage count to maintain the state while polling the subordinate
> > > > > device.
> > > > >
> > > > > Cc: Lukas Wunner <lukas@wunner.de>
> > > > > Cc: Mika Westerberg <mika.westerberg@linux.intel.com>
> > > > > Cc: Rafael J. Wysocki <rafael@kernel.org>
> > > > > Fixes: d3fcd7360338 ("PCI: Fix runtime PM race with PME polling")
> > > > > Reported-by: Sanath S <sanath.s@amd.com>
> > > > > Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218360
> > > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > > ---
> > > > >  drivers/pci/pci.c | 37 ++++++++++++++++++++++---------------
> > > > >  1 file changed, 22 insertions(+), 15 deletions(-)
> > > > >
> > > > > diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> > > > > index bdbf8a94b4d0..764d7c977ef4 100644
> > > > > --- a/drivers/pci/pci.c
> > > > > +++ b/drivers/pci/pci.c
> > > > > @@ -2433,29 +2433,36 @@ static void pci_pme_list_scan(struct work_struct *work)
> > > > >                 if (pdev->pme_poll) {
> > > > >                         struct pci_dev *bridge = pdev->bus->self;
> > > > >                         struct device *dev = &pdev->dev;
> > > > > -                       int pm_status;
> > > > > +                       struct device *bdev = bridge ? &bridge->dev : NULL;
> > > > > +                       int bref = 0;
> > > > >
> > > > >                         /*
> > > > > -                        * If bridge is in low power state, the
> > > > > -                        * configuration space of subordinate devices
> > > > > -                        * may be not accessible
> > > > > +                        * If we have a bridge, it should be in an active/D0
> > > > > +                        * state or the configuration space of subordinate
> > > > > +                        * devices may not be accessible or stable over the
> > > > > +                        * course of the call.
> > > > >                          */
> > > > > -                       if (bridge && bridge->current_state != PCI_D0)
> > > > > -                               continue;
> > > > > +                       if (bdev) {
> > > > > +                               bref = pm_runtime_get_if_active(bdev, true);
> > > > > +                               if (!bref)
> > > >
> > > > I would check bref <= 0 here.
> > > >
> > > > > +                                       continue;
> > > > > +
> > > > > +                               if (bridge->current_state != PCI_D0)
> > > >
> > > > Isn't the power state guaranteed to be PCI_D0 at this point?  If it
> > > > isn't, then why?
> > >
> > > Both of these seem necessary to support !CONFIG_PM, where bref would be
> > > -EINVAL and provides no indication of the current_state.  Is that
> > > incorrect?  Thanks,
> >
> > Well, CONFIG_PCIE_PME depends on CONFIG_PM, so I'm not sure how
> > dev->pme_poll can be set without it.
>
> I only see that drivers/pci/pci.c:pci_pm_init() sets pme_poll true and
> I'm not spotting a dependency on either PCIE_PME or PM to get there.  I
> see a few places where pme.c, governed by PCIE_PME, can set pme_poll
> false though.

All of this is a bit moot when CONFIG_PM is unset, because PME polling
was introduced as a workaround for problems with PME signaling which
is only enabled when CONFIG_PM is set.  IOW, if CONFIG_PM is not set,
there is no reason for PME polling.

> It's also not clear to me that we should skip scanning a device if
> pm_runtime_get_if_active() returns -EINVAL for the bridge due to
> power.disable_depth.

Let me recap things a bit.

PME polling is run from a freezable workqueue, so it is not carried
out during system-wide PM transitions, only in the working state
proper.

This means that when runtime PM is disabled for a bridge, there is no
reason for its power state to change and all bridges start in D0.

So you are right, the endpoint device below the bridge still needs to
be scanned then, but I'm not sure about the current_state check.  In
theory, it should not be necessary.

But I agree that this is a minor point, so please feel free to add

Reviewed-by: Rafael J. Wysocki <rafael@kernel.org>

to the patch.
Sanath S Jan. 24, 2024, 2:21 p.m. UTC | #6
On 1/24/2024 4:03 AM, Rafael J. Wysocki wrote:
> On Tue, Jan 23, 2024 at 9:39 PM Alex Williamson
> <alex.williamson@redhat.com> wrote:
>> On Tue, 23 Jan 2024 20:59:50 +0100
>> "Rafael J. Wysocki" <rafael@kernel.org> wrote:
>>
>>> On Tue, Jan 23, 2024 at 8:51 PM Alex Williamson
>>> <alex.williamson@redhat.com> wrote:
>>>> On Tue, 23 Jan 2024 20:40:32 +0100
>>>> "Rafael J. Wysocki" <rafael@kernel.org> wrote:
>>>>
>>>>> On Tue, Jan 23, 2024 at 7:56 PM Alex Williamson
>>>>> <alex.williamson@redhat.com> wrote:
>>>>>> The commit noted in fixes added a bogus requirement that runtime PM
>>>>>> managed devices need to be in the RPM_ACTIVE state for PME polling.
>>>>>> In fact, only devices in low power states should be polled.
>>>>>>
>>>>>> However there's still a requirement that the device config space must
>>>>>> be accessible, which has implications for both the current state of
>>>>>> the polled device and the parent bridge, when present.  It's not
>>>>>> sufficient to assume the bridge remains in D0 and cases have been
>>>>>> observed where the bridge passes the D0 test, but the PM state
>>>>>> indicates RPM_SUSPENDING and config space of the polled device becomes
>>>>>> inaccessible during pci_pme_wakeup().
>>>>>>
>>>>>> Therefore, since the bridge is already effectively required to be in
>>>>>> the RPM_ACTIVE state, formalize this in the code and elevate the PM
>>>>>> usage count to maintain the state while polling the subordinate
>>>>>> device.
>>>>>>
>>>>>> Cc: Lukas Wunner <lukas@wunner.de>
>>>>>> Cc: Mika Westerberg <mika.westerberg@linux.intel.com>
>>>>>> Cc: Rafael J. Wysocki <rafael@kernel.org>
>>>>>> Fixes: d3fcd7360338 ("PCI: Fix runtime PM race with PME polling")
>>>>>> Reported-by: Sanath S <sanath.s@amd.com>

Gave a try on couple of thunderbolt docks, issue is resolved with this 
patch.

Tested-by: Sanath S <sanath.s@amd.com>

>>>>>> Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218360
>>>>>> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
>>>>>> ---
>>>>>>   drivers/pci/pci.c | 37 ++++++++++++++++++++++---------------
>>>>>>   1 file changed, 22 insertions(+), 15 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>>>>>> index bdbf8a94b4d0..764d7c977ef4 100644
>>>>>> --- a/drivers/pci/pci.c
>>>>>> +++ b/drivers/pci/pci.c
>>>>>> @@ -2433,29 +2433,36 @@ static void pci_pme_list_scan(struct work_struct *work)
>>>>>>                  if (pdev->pme_poll) {
>>>>>>                          struct pci_dev *bridge = pdev->bus->self;
>>>>>>                          struct device *dev = &pdev->dev;
>>>>>> -                       int pm_status;
>>>>>> +                       struct device *bdev = bridge ? &bridge->dev : NULL;
>>>>>> +                       int bref = 0;
>>>>>>
>>>>>>                          /*
>>>>>> -                        * If bridge is in low power state, the
>>>>>> -                        * configuration space of subordinate devices
>>>>>> -                        * may be not accessible
>>>>>> +                        * If we have a bridge, it should be in an active/D0
>>>>>> +                        * state or the configuration space of subordinate
>>>>>> +                        * devices may not be accessible or stable over the
>>>>>> +                        * course of the call.
>>>>>>                           */
>>>>>> -                       if (bridge && bridge->current_state != PCI_D0)
>>>>>> -                               continue;
>>>>>> +                       if (bdev) {
>>>>>> +                               bref = pm_runtime_get_if_active(bdev, true);
>>>>>> +                               if (!bref)
>>>>> I would check bref <= 0 here.
>>>>>
>>>>>> +                                       continue;
>>>>>> +
>>>>>> +                               if (bridge->current_state != PCI_D0)
>>>>> Isn't the power state guaranteed to be PCI_D0 at this point?  If it
>>>>> isn't, then why?
>>>> Both of these seem necessary to support !CONFIG_PM, where bref would be
>>>> -EINVAL and provides no indication of the current_state.  Is that
>>>> incorrect?  Thanks,
>>> Well, CONFIG_PCIE_PME depends on CONFIG_PM, so I'm not sure how
>>> dev->pme_poll can be set without it.
>> I only see that drivers/pci/pci.c:pci_pm_init() sets pme_poll true and
>> I'm not spotting a dependency on either PCIE_PME or PM to get there.  I
>> see a few places where pme.c, governed by PCIE_PME, can set pme_poll
>> false though.
> All of this is a bit moot when CONFIG_PM is unset, because PME polling
> was introduced as a workaround for problems with PME signaling which
> is only enabled when CONFIG_PM is set.  IOW, if CONFIG_PM is not set,
> there is no reason for PME polling.
>
>> It's also not clear to me that we should skip scanning a device if
>> pm_runtime_get_if_active() returns -EINVAL for the bridge due to
>> power.disable_depth.
> Let me recap things a bit.
>
> PME polling is run from a freezable workqueue, so it is not carried
> out during system-wide PM transitions, only in the working state
> proper.
>
> This means that when runtime PM is disabled for a bridge, there is no
> reason for its power state to change and all bridges start in D0.
>
> So you are right, the endpoint device below the bridge still needs to
> be scanned then, but I'm not sure about the current_state check.  In
> theory, it should not be necessary.
>
> But I agree that this is a minor point, so please feel free to add
>
> Reviewed-by: Rafael J. Wysocki <rafael@kernel.org>
>
> to the patch.
Bjorn Helgaas Feb. 9, 2024, 4:35 p.m. UTC | #7
On Tue, Jan 23, 2024 at 11:55:31AM -0700, Alex Williamson wrote:
> The commit noted in fixes added a bogus requirement that runtime PM
> managed devices need to be in the RPM_ACTIVE state for PME polling.
> In fact, only devices in low power states should be polled.
> 
> However there's still a requirement that the device config space must
> be accessible, which has implications for both the current state of
> the polled device and the parent bridge, when present.  It's not
> sufficient to assume the bridge remains in D0 and cases have been
> observed where the bridge passes the D0 test, but the PM state
> indicates RPM_SUSPENDING and config space of the polled device becomes
> inaccessible during pci_pme_wakeup().
> 
> Therefore, since the bridge is already effectively required to be in
> the RPM_ACTIVE state, formalize this in the code and elevate the PM
> usage count to maintain the state while polling the subordinate
> device.

This apparently fixes a problem: the bugzilla says something about
disks attached to Thunderbolt/USB4 docks not working, but I doubt it's
actually specific to Thunderbolt/USB4 or to disks.

The bugzilla also indicates that d3fcd7360338 was a regression.
d3fcd7360338 appeared in v6.6, so this fix is likely a candidate for
the current release (v6.8).

I'd like to mention both the user-visible problem being fixed and 
the fact that it fixes a regression here in the commit log so we can
make the case for putting this in v6.8.

> Cc: Lukas Wunner <lukas@wunner.de>
> Cc: Mika Westerberg <mika.westerberg@linux.intel.com>
> Cc: Rafael J. Wysocki <rafael@kernel.org>
> Fixes: d3fcd7360338 ("PCI: Fix runtime PM race with PME polling")
> Reported-by: Sanath S <sanath.s@amd.com>
> Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218360
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
>  drivers/pci/pci.c | 37 ++++++++++++++++++++++---------------
>  1 file changed, 22 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index bdbf8a94b4d0..764d7c977ef4 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -2433,29 +2433,36 @@ static void pci_pme_list_scan(struct work_struct *work)
>  		if (pdev->pme_poll) {
>  			struct pci_dev *bridge = pdev->bus->self;
>  			struct device *dev = &pdev->dev;
> -			int pm_status;
> +			struct device *bdev = bridge ? &bridge->dev : NULL;
> +			int bref = 0;
>  
>  			/*
> -			 * If bridge is in low power state, the
> -			 * configuration space of subordinate devices
> -			 * may be not accessible
> +			 * If we have a bridge, it should be in an active/D0
> +			 * state or the configuration space of subordinate
> +			 * devices may not be accessible or stable over the
> +			 * course of the call.
>  			 */
> -			if (bridge && bridge->current_state != PCI_D0)
> -				continue;
> +			if (bdev) {
> +				bref = pm_runtime_get_if_active(bdev, true);
> +				if (!bref)
> +					continue;
> +
> +				if (bridge->current_state != PCI_D0)
> +					goto put_bridge;
> +			}
>  
>  			/*
> -			 * If the device is in a low power state it
> -			 * should not be polled either.
> +			 * The device itself should be suspended but config
> +			 * space must be accessible, therefore it cannot be in
> +			 * D3cold.
>  			 */
> -			pm_status = pm_runtime_get_if_active(dev, true);
> -			if (!pm_status)
> -				continue;
> -
> -			if (pdev->current_state != PCI_D3cold)
> +			if (pm_runtime_suspended(dev) &&
> +			    pdev->current_state != PCI_D3cold)
>  				pci_pme_wakeup(pdev, NULL);
>  
> -			if (pm_status > 0)
> -				pm_runtime_put(dev);
> +put_bridge:
> +			if (bref > 0)
> +				pm_runtime_put(bdev);
>  		} else {
>  			list_del(&pme_dev->list);
>  			kfree(pme_dev);
> -- 
> 2.43.0
>
Alex Williamson Feb. 9, 2024, 5:56 p.m. UTC | #8
On Fri, 9 Feb 2024 10:35:21 -0600
Bjorn Helgaas <helgaas@kernel.org> wrote:

> On Tue, Jan 23, 2024 at 11:55:31AM -0700, Alex Williamson wrote:
> > The commit noted in fixes added a bogus requirement that runtime PM
> > managed devices need to be in the RPM_ACTIVE state for PME polling.
> > In fact, only devices in low power states should be polled.
> > 
> > However there's still a requirement that the device config space must
> > be accessible, which has implications for both the current state of
> > the polled device and the parent bridge, when present.  It's not
> > sufficient to assume the bridge remains in D0 and cases have been
> > observed where the bridge passes the D0 test, but the PM state
> > indicates RPM_SUSPENDING and config space of the polled device becomes
> > inaccessible during pci_pme_wakeup().
> > 
> > Therefore, since the bridge is already effectively required to be in
> > the RPM_ACTIVE state, formalize this in the code and elevate the PM
> > usage count to maintain the state while polling the subordinate
> > device.  
> 
> This apparently fixes a problem: the bugzilla says something about
> disks attached to Thunderbolt/USB4 docks not working, but I doubt it's
> actually specific to Thunderbolt/USB4 or to disks.

Right, AIUI it's simply a PCIe hierarchy where a bridge was previously
scanned in response to a PME and no longer is because of the invalid
requirement added in d3fcd7360338 that the runtime power management
status of the device is active.

> The bugzilla also indicates that d3fcd7360338 was a regression.
> d3fcd7360338 appeared in v6.6, so this fix is likely a candidate for
> the current release (v6.8).

Agreed.

> I'd like to mention both the user-visible problem being fixed and 
> the fact that it fixes a regression here in the commit log so we can
> make the case for putting this in v6.8.

Ok, I've not experienced the regression myself, but I can add a
paragraph describing my understanding of the bugzilla.  I'd probably
just say:

	This resolves a regression reported in the bugzilla below where
	a Thunderbolt/USB4 hierarchy fails to scan for an attached NVMe
	endpoint downstream of a bridge in a D3hot power state.

If you'd like a respin including that or if you have further
phrasing/info suggestions, please let me know.  Thanks,

Alex

> > Cc: Lukas Wunner <lukas@wunner.de>
> > Cc: Mika Westerberg <mika.westerberg@linux.intel.com>
> > Cc: Rafael J. Wysocki <rafael@kernel.org>
> > Fixes: d3fcd7360338 ("PCI: Fix runtime PM race with PME polling")
> > Reported-by: Sanath S <sanath.s@amd.com>
> > Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218360
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > ---
> >  drivers/pci/pci.c | 37 ++++++++++++++++++++++---------------
> >  1 file changed, 22 insertions(+), 15 deletions(-)
> > 
> > diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> > index bdbf8a94b4d0..764d7c977ef4 100644
> > --- a/drivers/pci/pci.c
> > +++ b/drivers/pci/pci.c
> > @@ -2433,29 +2433,36 @@ static void pci_pme_list_scan(struct work_struct *work)
> >  		if (pdev->pme_poll) {
> >  			struct pci_dev *bridge = pdev->bus->self;
> >  			struct device *dev = &pdev->dev;
> > -			int pm_status;
> > +			struct device *bdev = bridge ? &bridge->dev : NULL;
> > +			int bref = 0;
> >  
> >  			/*
> > -			 * If bridge is in low power state, the
> > -			 * configuration space of subordinate devices
> > -			 * may be not accessible
> > +			 * If we have a bridge, it should be in an active/D0
> > +			 * state or the configuration space of subordinate
> > +			 * devices may not be accessible or stable over the
> > +			 * course of the call.
> >  			 */
> > -			if (bridge && bridge->current_state != PCI_D0)
> > -				continue;
> > +			if (bdev) {
> > +				bref = pm_runtime_get_if_active(bdev, true);
> > +				if (!bref)
> > +					continue;
> > +
> > +				if (bridge->current_state != PCI_D0)
> > +					goto put_bridge;
> > +			}
> >  
> >  			/*
> > -			 * If the device is in a low power state it
> > -			 * should not be polled either.
> > +			 * The device itself should be suspended but config
> > +			 * space must be accessible, therefore it cannot be in
> > +			 * D3cold.
> >  			 */
> > -			pm_status = pm_runtime_get_if_active(dev, true);
> > -			if (!pm_status)
> > -				continue;
> > -
> > -			if (pdev->current_state != PCI_D3cold)
> > +			if (pm_runtime_suspended(dev) &&
> > +			    pdev->current_state != PCI_D3cold)
> >  				pci_pme_wakeup(pdev, NULL);
> >  
> > -			if (pm_status > 0)
> > -				pm_runtime_put(dev);
> > +put_bridge:
> > +			if (bref > 0)
> > +				pm_runtime_put(bdev);
> >  		} else {
> >  			list_del(&pme_dev->list);
> >  			kfree(pme_dev);
> > -- 
> > 2.43.0
> >   
>
Bjorn Helgaas Feb. 9, 2024, 7:03 p.m. UTC | #9
On Tue, Jan 23, 2024 at 11:55:31AM -0700, Alex Williamson wrote:
> The commit noted in fixes added a bogus requirement that runtime PM
> managed devices need to be in the RPM_ACTIVE state for PME polling.
> In fact, only devices in low power states should be polled.
> 
> However there's still a requirement that the device config space must
> be accessible, which has implications for both the current state of
> the polled device and the parent bridge, when present.  It's not
> sufficient to assume the bridge remains in D0 and cases have been
> observed where the bridge passes the D0 test, but the PM state
> indicates RPM_SUSPENDING and config space of the polled device becomes
> inaccessible during pci_pme_wakeup().
> 
> Therefore, since the bridge is already effectively required to be in
> the RPM_ACTIVE state, formalize this in the code and elevate the PM
> usage count to maintain the state while polling the subordinate
> device.
> 
> Cc: Lukas Wunner <lukas@wunner.de>
> Cc: Mika Westerberg <mika.westerberg@linux.intel.com>
> Cc: Rafael J. Wysocki <rafael@kernel.org>
> Fixes: d3fcd7360338 ("PCI: Fix runtime PM race with PME polling")
> Reported-by: Sanath S <sanath.s@amd.com>
> Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218360
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>

Applied with Rafael's reviewed-by to for-linus for v6.8, thanks!

> ---
>  drivers/pci/pci.c | 37 ++++++++++++++++++++++---------------
>  1 file changed, 22 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index bdbf8a94b4d0..764d7c977ef4 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -2433,29 +2433,36 @@ static void pci_pme_list_scan(struct work_struct *work)
>  		if (pdev->pme_poll) {
>  			struct pci_dev *bridge = pdev->bus->self;
>  			struct device *dev = &pdev->dev;
> -			int pm_status;
> +			struct device *bdev = bridge ? &bridge->dev : NULL;
> +			int bref = 0;
>  
>  			/*
> -			 * If bridge is in low power state, the
> -			 * configuration space of subordinate devices
> -			 * may be not accessible
> +			 * If we have a bridge, it should be in an active/D0
> +			 * state or the configuration space of subordinate
> +			 * devices may not be accessible or stable over the
> +			 * course of the call.
>  			 */
> -			if (bridge && bridge->current_state != PCI_D0)
> -				continue;
> +			if (bdev) {
> +				bref = pm_runtime_get_if_active(bdev, true);
> +				if (!bref)
> +					continue;
> +
> +				if (bridge->current_state != PCI_D0)
> +					goto put_bridge;
> +			}
>  
>  			/*
> -			 * If the device is in a low power state it
> -			 * should not be polled either.
> +			 * The device itself should be suspended but config
> +			 * space must be accessible, therefore it cannot be in
> +			 * D3cold.
>  			 */
> -			pm_status = pm_runtime_get_if_active(dev, true);
> -			if (!pm_status)
> -				continue;
> -
> -			if (pdev->current_state != PCI_D3cold)
> +			if (pm_runtime_suspended(dev) &&
> +			    pdev->current_state != PCI_D3cold)
>  				pci_pme_wakeup(pdev, NULL);
>  
> -			if (pm_status > 0)
> -				pm_runtime_put(dev);
> +put_bridge:
> +			if (bref > 0)
> +				pm_runtime_put(bdev);
>  		} else {
>  			list_del(&pme_dev->list);
>  			kfree(pme_dev);
> -- 
> 2.43.0
>
diff mbox series

Patch

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index bdbf8a94b4d0..764d7c977ef4 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2433,29 +2433,36 @@  static void pci_pme_list_scan(struct work_struct *work)
 		if (pdev->pme_poll) {
 			struct pci_dev *bridge = pdev->bus->self;
 			struct device *dev = &pdev->dev;
-			int pm_status;
+			struct device *bdev = bridge ? &bridge->dev : NULL;
+			int bref = 0;
 
 			/*
-			 * If bridge is in low power state, the
-			 * configuration space of subordinate devices
-			 * may be not accessible
+			 * If we have a bridge, it should be in an active/D0
+			 * state or the configuration space of subordinate
+			 * devices may not be accessible or stable over the
+			 * course of the call.
 			 */
-			if (bridge && bridge->current_state != PCI_D0)
-				continue;
+			if (bdev) {
+				bref = pm_runtime_get_if_active(bdev, true);
+				if (!bref)
+					continue;
+
+				if (bridge->current_state != PCI_D0)
+					goto put_bridge;
+			}
 
 			/*
-			 * If the device is in a low power state it
-			 * should not be polled either.
+			 * The device itself should be suspended but config
+			 * space must be accessible, therefore it cannot be in
+			 * D3cold.
 			 */
-			pm_status = pm_runtime_get_if_active(dev, true);
-			if (!pm_status)
-				continue;
-
-			if (pdev->current_state != PCI_D3cold)
+			if (pm_runtime_suspended(dev) &&
+			    pdev->current_state != PCI_D3cold)
 				pci_pme_wakeup(pdev, NULL);
 
-			if (pm_status > 0)
-				pm_runtime_put(dev);
+put_bridge:
+			if (bref > 0)
+				pm_runtime_put(bdev);
 		} else {
 			list_del(&pme_dev->list);
 			kfree(pme_dev);