diff mbox series

[4/5] PCI / PM: Check for error when reading Power State

Message ID 20190805205214.194981-5-helgaas@kernel.org (mailing list archive)
State Superseded, archived
Headers show
Series PCI: Add PCI_ERROR_RESPONSE, check for errors | expand

Commit Message

Bjorn Helgaas Aug. 5, 2019, 8:52 p.m. UTC
From: Bjorn Helgaas <bhelgaas@google.com>

The Power Management Status Register is in config space, and reads while
the device is in D3cold typically return ~0 data (PCI_ERROR_RESPONSE).  If
we just look at the PCI_PM_CTRL_STATE_MASK bits, that is 0x3, which looks
like D3hot, not D3cold.

Check the entire register for PCI_ERROR_RESPONSE so we can distinguish
D3cold from D3hot.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pci.c   |  6 +++---
 include/linux/pci.h | 13 +++++++++++++
 2 files changed, 16 insertions(+), 3 deletions(-)

Comments

Rafael J. Wysocki Aug. 5, 2019, 9:09 p.m. UTC | #1
On Mon, Aug 5, 2019 at 10:52 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> From: Bjorn Helgaas <bhelgaas@google.com>
>
> The Power Management Status Register is in config space, and reads while
> the device is in D3cold typically return ~0 data (PCI_ERROR_RESPONSE).  If
> we just look at the PCI_PM_CTRL_STATE_MASK bits, that is 0x3, which looks
> like D3hot, not D3cold.
>
> Check the entire register for PCI_ERROR_RESPONSE so we can distinguish
> D3cold from D3hot.
>
> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> ---
>  drivers/pci/pci.c   |  6 +++---
>  include/linux/pci.h | 13 +++++++++++++
>  2 files changed, 16 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index af6a97d7012b..d8686e3cd5eb 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -894,7 +894,7 @@ static int pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state)
>                 udelay(PCI_PM_D2_DELAY);
>
>         pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
> -       dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
> +       dev->current_state = pci_power_state(pmcsr);

But pci_raw_set_power_state() should not even be called for devices in
D3_cold, so this at best is redundant.

>         if (dev->current_state != state && printk_ratelimit())
>                 pci_info(dev, "Refused to change power state, currently in D%d\n",
>                          dev->current_state);
> @@ -942,7 +942,7 @@ void pci_update_current_state(struct pci_dev *dev, pci_power_t state)
>                 u16 pmcsr;
>
>                 pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
> -               dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
> +               dev->current_state = pci_power_state(pmcsr);

The if () branch above should cover the D3cold case, shouldn't it?

>         } else {
>                 dev->current_state = state;
>         }
> @@ -1677,7 +1677,7 @@ static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
>         if (dev->pm_cap) {
>                 u16 pmcsr;
>                 pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
> -               dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
> +               dev->current_state = pci_power_state(pmcsr);

So this appears to be only case in which pci_power_state(pmcsr) is
useful at all.

It might be better to use the code from it directly here IMO.

>         }
>
>         if (atomic_inc_return(&dev->enable_cnt) > 1)
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index d64fd3788061..fdfe990e9661 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -152,6 +152,19 @@ static inline const char *pci_power_name(pci_power_t state)
>         return pci_power_names[1 + (__force int) state];
>  }
>
> +/*
> + * Convert a Power Management Status Register value to a pci_power_t.
> + * Note that if we read the register while the device is in D3cold, we
> + * typically get PCI_ERROR_RESPONSE, which looks like D3hot (0x3) if we
> + * only look at the PCI_PM_CTRL_STATE_MASK bits.
> + */
> +static inline pci_power_t pci_power_state(u16 pmcsr)
> +{
> +       if (pmcsr == (u16) PCI_ERROR_RESPONSE)
> +               return PCI_D3cold;
> +       return pmcsr & PCI_PM_CTRL_STATE_MASK;
> +}
> +
>  #define PCI_PM_D2_DELAY                200
>  #define PCI_PM_D3_WAIT         10
>  #define PCI_PM_D3COLD_WAIT     100
> --
> 2.22.0.770.g0f2c4a37fd-goog
>
Bjorn Helgaas Aug. 9, 2019, 10:01 p.m. UTC | #2
On Mon, Aug 05, 2019 at 11:09:19PM +0200, Rafael J. Wysocki wrote:
> On Mon, Aug 5, 2019 at 10:52 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
> >
> > From: Bjorn Helgaas <bhelgaas@google.com>
> >
> > The Power Management Status Register is in config space, and reads while
> > the device is in D3cold typically return ~0 data (PCI_ERROR_RESPONSE).  If
> > we just look at the PCI_PM_CTRL_STATE_MASK bits, that is 0x3, which looks
> > like D3hot, not D3cold.
> >
> > Check the entire register for PCI_ERROR_RESPONSE so we can distinguish
> > D3cold from D3hot.
> >
> > Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> > ---
> >  drivers/pci/pci.c   |  6 +++---
> >  include/linux/pci.h | 13 +++++++++++++
> >  2 files changed, 16 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> > index af6a97d7012b..d8686e3cd5eb 100644
> > --- a/drivers/pci/pci.c
> > +++ b/drivers/pci/pci.c
> > @@ -894,7 +894,7 @@ static int pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state)
> >                 udelay(PCI_PM_D2_DELAY);
> >
> >         pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
> > -       dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
> > +       dev->current_state = pci_power_state(pmcsr);
> 
> But pci_raw_set_power_state() should not even be called for devices in
> D3_cold, so this at best is redundant.

I tried to verify that we don't call pci_raw_set_power_state() for
devices in D3cold, but it wasn't obvious to me.  Is there an easy way
to verify that?  I'd rather have code that doesn't rely on deep
knowledge about other areas.

Even if the device was in, say D0, what if it is hot-removed just
before we read PCI_PM_CTRL?  We'll set dev->current_state to D3hot,
when I think D3cold would better correspond to the state of the
device.  Maybe that's harmless, but I don't know how to verify that.

> >         if (dev->current_state != state && printk_ratelimit())
> >                 pci_info(dev, "Refused to change power state, currently in D%d\n",
> >                          dev->current_state);
> > @@ -942,7 +942,7 @@ void pci_update_current_state(struct pci_dev *dev, pci_power_t state)
> >                 u16 pmcsr;
> >
> >                 pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
> > -               dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
> > +               dev->current_state = pci_power_state(pmcsr);
> 
> The if () branch above should cover the D3cold case, shouldn't it?

You mean the "if (platform_pci_get_power_state(dev) == PCI_D3cold)"
test?

platform_pci_get_power_state() returns PCI_UNKNOWN in some cases.
When that happens, might we not read PCI_PM_CTRL of a device in
D3cold?  I think this also has the same hotplug question as above.

> >         } else {
> >                 dev->current_state = state;
> >         }
> > @@ -1677,7 +1677,7 @@ static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
> >         if (dev->pm_cap) {
> >                 u16 pmcsr;
> >                 pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
> > -               dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
> > +               dev->current_state = pci_power_state(pmcsr);
> 
> So this appears to be only case in which pci_power_state(pmcsr) is
> useful at all.
> 
> It might be better to use the code from it directly here IMO.

If we're decoding CSR values, I think it's better to notice error
responses when we can than it is to try to figure out whether the
error response is theoretically impossible or the incorrectly decoded
value (e.g., D3hot instead of D3cold) is harmless.

> >         }
> >
> >         if (atomic_inc_return(&dev->enable_cnt) > 1)
> > diff --git a/include/linux/pci.h b/include/linux/pci.h
> > index d64fd3788061..fdfe990e9661 100644
> > --- a/include/linux/pci.h
> > +++ b/include/linux/pci.h
> > @@ -152,6 +152,19 @@ static inline const char *pci_power_name(pci_power_t state)
> >         return pci_power_names[1 + (__force int) state];
> >  }
> >
> > +/*
> > + * Convert a Power Management Status Register value to a pci_power_t.
> > + * Note that if we read the register while the device is in D3cold, we
> > + * typically get PCI_ERROR_RESPONSE, which looks like D3hot (0x3) if we
> > + * only look at the PCI_PM_CTRL_STATE_MASK bits.
> > + */
> > +static inline pci_power_t pci_power_state(u16 pmcsr)
> > +{
> > +       if (pmcsr == (u16) PCI_ERROR_RESPONSE)
> > +               return PCI_D3cold;
> > +       return pmcsr & PCI_PM_CTRL_STATE_MASK;
> > +}
> > +
> >  #define PCI_PM_D2_DELAY                200
> >  #define PCI_PM_D3_WAIT         10
> >  #define PCI_PM_D3COLD_WAIT     100
> > --
> > 2.22.0.770.g0f2c4a37fd-goog
> >
Rafael J. Wysocki Aug. 13, 2019, 10:59 p.m. UTC | #3
On Saturday, August 10, 2019 12:01:16 AM CEST Bjorn Helgaas wrote:
> On Mon, Aug 05, 2019 at 11:09:19PM +0200, Rafael J. Wysocki wrote:
> > On Mon, Aug 5, 2019 at 10:52 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > >
> > > From: Bjorn Helgaas <bhelgaas@google.com>
> > >
> > > The Power Management Status Register is in config space, and reads while
> > > the device is in D3cold typically return ~0 data (PCI_ERROR_RESPONSE).  If
> > > we just look at the PCI_PM_CTRL_STATE_MASK bits, that is 0x3, which looks
> > > like D3hot, not D3cold.
> > >
> > > Check the entire register for PCI_ERROR_RESPONSE so we can distinguish
> > > D3cold from D3hot.
> > >
> > > Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> > > ---
> > >  drivers/pci/pci.c   |  6 +++---
> > >  include/linux/pci.h | 13 +++++++++++++
> > >  2 files changed, 16 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> > > index af6a97d7012b..d8686e3cd5eb 100644
> > > --- a/drivers/pci/pci.c
> > > +++ b/drivers/pci/pci.c
> > > @@ -894,7 +894,7 @@ static int pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state)
> > >                 udelay(PCI_PM_D2_DELAY);
> > >
> > >         pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
> > > -       dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
> > > +       dev->current_state = pci_power_state(pmcsr);
> > 
> > But pci_raw_set_power_state() should not even be called for devices in
> > D3_cold, so this at best is redundant.
> 
> I tried to verify that we don't call pci_raw_set_power_state() for
> devices in D3cold, but it wasn't obvious to me.  Is there an easy way
> to verify that?  I'd rather have code that doesn't rely on deep
> knowledge about other areas.

It is called in two places, pci_power_up() and pci_set_power_state().

pci_power_up() is called on resume when the whole hierarchy is
turned on and pci_set_power_state() explicitly powers up the
device if in D3cold (with the help of the platform).

And the "device not accessible at all" case should be covered by patch [2/5]
in this series.

> Even if the device was in, say D0, what if it is hot-removed just
> before we read PCI_PM_CTRL?

I guess you mean surprise-hot-removed?

Then it may as well be hot-removed after setting current_state.

> We'll set dev->current_state to D3hot,
> when I think D3cold would better correspond to the state of the
> device.  Maybe that's harmless, but I don't know how to verify that.

Well, D3cold may just be equally misleading, because the device may
very well not be present at all any more.

> > >         if (dev->current_state != state && printk_ratelimit())
> > >                 pci_info(dev, "Refused to change power state, currently in D%d\n",
> > >                          dev->current_state);
> > > @@ -942,7 +942,7 @@ void pci_update_current_state(struct pci_dev *dev, pci_power_t state)
> > >                 u16 pmcsr;
> > >
> > >                 pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
> > > -               dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
> > > +               dev->current_state = pci_power_state(pmcsr);
> > 
> > The if () branch above should cover the D3cold case, shouldn't it?
> 
> You mean the "if (platform_pci_get_power_state(dev) == PCI_D3cold)"
> test?

Not exactly.

I mean "if (platform_pci_get_power_state(dev) == PCI_D3cold ||
!pci_device_is_present(dev))".

> platform_pci_get_power_state() returns PCI_UNKNOWN in some cases.
> When that happens, might we not read PCI_PM_CTRL of a device in
> D3cold?  I think this also has the same hotplug question as above.

Surprise hot-removal can take place at any time, in particular after setting
current_state, so adding extra checks here doesn't prevent the value of
it from becoming stale at least sometimes anyway.

> > >         } else {
> > >                 dev->current_state = state;
> > >         }
> > > @@ -1677,7 +1677,7 @@ static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
> > >         if (dev->pm_cap) {
> > >                 u16 pmcsr;
> > >                 pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
> > > -               dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
> > > +               dev->current_state = pci_power_state(pmcsr);
> > 
> > So this appears to be only case in which pci_power_state(pmcsr) is
> > useful at all.
> > 
> > It might be better to use the code from it directly here IMO.
> 
> If we're decoding CSR values, I think it's better to notice error
> responses when we can than it is to try to figure out whether the
> error response is theoretically impossible or the incorrectly decoded
> value (e.g., D3hot instead of D3cold) is harmless.

IMO this means more complex code and extra overhead for a very
little practical value, however.
Bjorn Helgaas Aug. 14, 2019, 1:08 a.m. UTC | #4
On Wed, Aug 14, 2019 at 12:59:26AM +0200, Rafael J. Wysocki wrote:
> On Saturday, August 10, 2019 12:01:16 AM CEST Bjorn Helgaas wrote:
> > On Mon, Aug 05, 2019 at 11:09:19PM +0200, Rafael J. Wysocki wrote:
> > > On Mon, Aug 5, 2019 at 10:52 PM Bjorn Helgaas <helgaas@kernel.org> wrote:

> > > > @@ -942,7 +942,7 @@ void pci_update_current_state(struct pci_dev *dev, pci_power_t state)
> > > >                 u16 pmcsr;
> > > >
> > > >                 pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
> > > > -               dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
> > > > +               dev->current_state = pci_power_state(pmcsr);
> > > 
> > > The if () branch above should cover the D3cold case, shouldn't it?
> > 
> > You mean the "if (platform_pci_get_power_state(dev) == PCI_D3cold)"
> > test?
> 
> Not exactly.
> 
> I mean "if (platform_pci_get_power_state(dev) == PCI_D3cold ||
> !pci_device_is_present(dev))".

I don't see what you mean.  The !pci_device_is_present(dev) test tells
us something about what the state of the device was at some time in
the past, but of course it doesn't say anything about whether reading
PCI_PM_CTRL will succeed, e.g.,

  # dev is present and in D0
  platform_pci_get_power_state(dev) == PCI_D3cold   # currently false
  !pci_device_is_present(dev)                       # currently false
  # dev is surprise hot-removed or put in D3cold
  pci_read_config_word(PCI_PM_CTRL, &pmcsr)
  # pmcsr == ~0 (error response)

(Maybe going to D3cold is impossible, but it's pretty hard to prove
that.  The hot-remove is definitely possible.)

> > platform_pci_get_power_state() returns PCI_UNKNOWN in some cases.
> > When that happens, might we not read PCI_PM_CTRL of a device in
> > D3cold?  I think this also has the same hotplug question as above.
> 
> Surprise hot-removal can take place at any time, in particular after setting
> current_state, so adding extra checks here doesn't prevent the value of
> it from becoming stale at least sometimes anyway.

Definitely.  The point is not to prevent current_state from becoming
stale, it's to prevent us from interpreting ~0 data (known to be
invalid) as though it were a valid register value.

Bjorn
diff mbox series

Patch

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index af6a97d7012b..d8686e3cd5eb 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -894,7 +894,7 @@  static int pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state)
 		udelay(PCI_PM_D2_DELAY);
 
 	pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
-	dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
+	dev->current_state = pci_power_state(pmcsr);
 	if (dev->current_state != state && printk_ratelimit())
 		pci_info(dev, "Refused to change power state, currently in D%d\n",
 			 dev->current_state);
@@ -942,7 +942,7 @@  void pci_update_current_state(struct pci_dev *dev, pci_power_t state)
 		u16 pmcsr;
 
 		pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
-		dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
+		dev->current_state = pci_power_state(pmcsr);
 	} else {
 		dev->current_state = state;
 	}
@@ -1677,7 +1677,7 @@  static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
 	if (dev->pm_cap) {
 		u16 pmcsr;
 		pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
-		dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
+		dev->current_state = pci_power_state(pmcsr);
 	}
 
 	if (atomic_inc_return(&dev->enable_cnt) > 1)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index d64fd3788061..fdfe990e9661 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -152,6 +152,19 @@  static inline const char *pci_power_name(pci_power_t state)
 	return pci_power_names[1 + (__force int) state];
 }
 
+/*
+ * Convert a Power Management Status Register value to a pci_power_t.
+ * Note that if we read the register while the device is in D3cold, we
+ * typically get PCI_ERROR_RESPONSE, which looks like D3hot (0x3) if we
+ * only look at the PCI_PM_CTRL_STATE_MASK bits.
+ */
+static inline pci_power_t pci_power_state(u16 pmcsr)
+{
+	if (pmcsr == (u16) PCI_ERROR_RESPONSE)
+		return PCI_D3cold;
+	return pmcsr & PCI_PM_CTRL_STATE_MASK;
+}
+
 #define PCI_PM_D2_DELAY		200
 #define PCI_PM_D3_WAIT		10
 #define PCI_PM_D3COLD_WAIT	100