diff mbox

[2/4] PCI: Query platform firmware for device power state

Message ID ee1e8bbc387594877162ef09acb5027444ac8f9a.1472554278.git.lukas@wunner.de (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Lukas Wunner Aug. 31, 2016, 6:15 a.m. UTC
Usually the most accurate way to determine a PCI device's power state is
to read its PM Control & Status Register.  There are two cases however
when this is not an option:  If the device doesn't have the PM
capability at all, or if it is in D3cold.

In D3cold, reading from config space typically results in a fabricated
"all ones" response.  But in D3hot, the two bits representing the power
state in the PMCSR are *also* set to 1.  Thus D3hot and D3cold are not
discernible by just reading the PMCSR.

A (supposedly) reliable way to detect D3cold is to query the platform
firmware for its opinion on the device's power state.  To this end,
add a ->get_power callback to struct pci_platform_pm_ops, and an
implementation to acpi_pci_platform_pm.  (The only pci_platform_pm_ops
existing so far).

Amend pci_update_current_state() to query the platform firmware before
reading the PMCSR.

Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Signed-off-by: Lukas Wunner <lukas@wunner.de>
---
 drivers/pci/pci-acpi.c | 23 +++++++++++++++++++++++
 drivers/pci/pci.c      | 21 ++++++++++++++++-----
 drivers/pci/pci.h      |  3 +++
 3 files changed, 42 insertions(+), 5 deletions(-)

Comments

Rafael J. Wysocki Sept. 14, 2016, 12:21 a.m. UTC | #1
On Wednesday, August 31, 2016 08:15:18 AM Lukas Wunner wrote:
> Usually the most accurate way to determine a PCI device's power state is
> to read its PM Control & Status Register.  There are two cases however
> when this is not an option:  If the device doesn't have the PM
> capability at all, or if it is in D3cold.
> 
> In D3cold, reading from config space typically results in a fabricated
> "all ones" response.  But in D3hot, the two bits representing the power
> state in the PMCSR are *also* set to 1.  Thus D3hot and D3cold are not
> discernible by just reading the PMCSR.
> 
> A (supposedly) reliable way to detect D3cold is to query the platform
> firmware for its opinion on the device's power state.  To this end,
> add a ->get_power callback to struct pci_platform_pm_ops, and an
> implementation to acpi_pci_platform_pm.  (The only pci_platform_pm_ops
> existing so far).
> 
> Amend pci_update_current_state() to query the platform firmware before
> reading the PMCSR.
> 
> Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
> Signed-off-by: Lukas Wunner <lukas@wunner.de>
> ---
>  drivers/pci/pci-acpi.c | 23 +++++++++++++++++++++++
>  drivers/pci/pci.c      | 21 ++++++++++++++++-----
>  drivers/pci/pci.h      |  3 +++
>  3 files changed, 42 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
> index 9a033e8..89f2707 100644
> --- a/drivers/pci/pci-acpi.c
> +++ b/drivers/pci/pci-acpi.c
> @@ -452,6 +452,28 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
>  	return error;
>  }
>  
> +static pci_power_t acpi_pci_get_power_state(struct pci_dev *dev)
> +{
> +	struct acpi_device *adev = ACPI_COMPANION(&dev->dev);
> +	static const pci_power_t state_conv[] = {
> +		[ACPI_STATE_D0]      = PCI_D0,
> +		[ACPI_STATE_D1]      = PCI_D1,
> +		[ACPI_STATE_D2]      = PCI_D2,
> +		[ACPI_STATE_D3_HOT]  = PCI_D3hot,
> +		[ACPI_STATE_D3_COLD] = PCI_D3cold,
> +	};
> +	int state;

ACPI_STATE_D3_HOT and ACPI_STATE_D3_COLD were introduced in ACPI 4.0.  For
systems predating that, ACPI_STATE_D3_HOT is the deepest state returned by
acpi_device_get_power().

> +
> +	if (!adev || !acpi_device_power_manageable(adev))
> +		return PCI_UNKNOWN;
> +
> +	if (acpi_device_get_power(adev, &state) || state < ACPI_STATE_D0
> +						|| state > ACPI_STATE_D3_COLD)

If the device is power-manageable by ACPI (you've just checked that) and
acpi_device_get_power() returns success (0), the returned state is guaranteed
to be within the boundaries (if it isn't, there is a bug that needs to be fixed).

> +		return PCI_UNKNOWN;
> +
> +	return state_conv[state];
> +}
> +
>  static bool acpi_pci_can_wakeup(struct pci_dev *dev)
>  {
>  	struct acpi_device *adev = ACPI_COMPANION(&dev->dev);
> @@ -534,6 +556,7 @@ static bool acpi_pci_need_resume(struct pci_dev *dev)
>  static const struct pci_platform_pm_ops acpi_pci_platform_pm = {
>  	.is_manageable = acpi_pci_power_manageable,
>  	.set_state = acpi_pci_set_power_state,
> +	.get_state = acpi_pci_get_power_state,
>  	.choose_state = acpi_pci_choose_state,
>  	.sleep_wake = acpi_pci_sleep_wake,
>  	.run_wake = acpi_pci_run_wake,
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index 72a9d3a..e52e3d4 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -552,8 +552,9 @@ static const struct pci_platform_pm_ops *pci_platform_pm;
>  
>  int pci_set_platform_pm(const struct pci_platform_pm_ops *ops)
>  {
> -	if (!ops->is_manageable || !ops->set_state || !ops->choose_state ||
> -	    !ops->sleep_wake || !ops->run_wake || !ops->need_resume)
> +	if (!ops->is_manageable || !ops->set_state  || !ops->get_state ||
> +	    !ops->choose_state  || !ops->sleep_wake || !ops->run_wake  ||
> +	    !ops->need_resume)
>  		return -EINVAL;
>  	pci_platform_pm = ops;
>  	return 0;
> @@ -570,6 +571,11 @@ static inline int platform_pci_set_power_state(struct pci_dev *dev,
>  	return pci_platform_pm ? pci_platform_pm->set_state(dev, t) : -ENOSYS;
>  }
>  
> +static inline pci_power_t platform_pci_get_power_state(struct pci_dev *dev)
> +{
> +	return pci_platform_pm ? pci_platform_pm->get_state(dev) : PCI_UNKNOWN;
> +}
> +
>  static inline pci_power_t platform_pci_choose_state(struct pci_dev *dev)
>  {
>  	return pci_platform_pm ?
> @@ -701,14 +707,19 @@ static int pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state)
>  }
>  
>  /**
> - * pci_update_current_state - Read PCI power state of given device from its
> - *                            PCI PM registers and cache it
> + * pci_update_current_state - Read power state of given device and cache it
>   * @dev: PCI device to handle.
>   * @state: State to cache in case the device doesn't have the PM capability
> + *
> + * The power state is read from the PMCSR register, which however is
> + * inaccessible in D3cold. The platform firmware is therefore queried first
> + * to detect accessibility of the register.
>   */
>  void pci_update_current_state(struct pci_dev *dev, pci_power_t state)
>  {
> -	if (dev->pm_cap) {
> +	if (platform_pci_get_power_state(dev) == PCI_D3cold) {
> +		dev->current_state = PCI_D3cold;
> +	} else if (dev->pm_cap) {

Why exactly do you need to change this function?

>  		u16 pmcsr;
>  
>  		/*
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 9730c47..01d5206 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -42,6 +42,8 @@ int pci_probe_reset_function(struct pci_dev *dev);
>   *
>   * @set_state: invokes the platform firmware to set the device's power state
>   *
> + * @get_state: queries the platform firmware for a device's current power state
> + *
>   * @choose_state: returns PCI power state of given device preferred by the
>   *                platform; to be used during system-wide transitions from a
>   *                sleeping state to the working state and vice versa
> @@ -62,6 +64,7 @@ int pci_probe_reset_function(struct pci_dev *dev);
>  struct pci_platform_pm_ops {
>  	bool (*is_manageable)(struct pci_dev *dev);
>  	int (*set_state)(struct pci_dev *dev, pci_power_t state);
> +	pci_power_t (*get_state)(struct pci_dev *dev);
>  	pci_power_t (*choose_state)(struct pci_dev *dev);
>  	int (*sleep_wake)(struct pci_dev *dev, bool enable);
>  	int (*run_wake)(struct pci_dev *dev, bool enable);
> 

Thanks,
Rafael

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lukas Wunner Sept. 14, 2016, 10:50 a.m. UTC | #2
On Wed, Sep 14, 2016 at 02:21:30AM +0200, Rafael J. Wysocki wrote:
> On Wednesday, August 31, 2016 08:15:18 AM Lukas Wunner wrote:
> > Usually the most accurate way to determine a PCI device's power state is
> > to read its PM Control & Status Register.  There are two cases however
> > when this is not an option:  If the device doesn't have the PM
> > capability at all, or if it is in D3cold.
> > 
> > In D3cold, reading from config space typically results in a fabricated
> > "all ones" response.  But in D3hot, the two bits representing the power
> > state in the PMCSR are *also* set to 1.  Thus D3hot and D3cold are not
> > discernible by just reading the PMCSR.
> > 
> > A (supposedly) reliable way to detect D3cold is to query the platform
> > firmware for its opinion on the device's power state.  To this end,
> > add a ->get_power callback to struct pci_platform_pm_ops, and an
> > implementation to acpi_pci_platform_pm.  (The only pci_platform_pm_ops
> > existing so far).
> > 
> > Amend pci_update_current_state() to query the platform firmware before
> > reading the PMCSR.
> > 
> > Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
> > Signed-off-by: Lukas Wunner <lukas@wunner.de>
> > ---
> >  drivers/pci/pci-acpi.c | 23 +++++++++++++++++++++++
> >  drivers/pci/pci.c      | 21 ++++++++++++++++-----
> >  drivers/pci/pci.h      |  3 +++
> >  3 files changed, 42 insertions(+), 5 deletions(-)
> > 
> > diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
> > index 9a033e8..89f2707 100644
> > --- a/drivers/pci/pci-acpi.c
> > +++ b/drivers/pci/pci-acpi.c
> > @@ -452,6 +452,28 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
> >  	return error;
> >  }
> >  
> > +static pci_power_t acpi_pci_get_power_state(struct pci_dev *dev)
> > +{
> > +	struct acpi_device *adev = ACPI_COMPANION(&dev->dev);
> > +	static const pci_power_t state_conv[] = {
> > +		[ACPI_STATE_D0]      = PCI_D0,
> > +		[ACPI_STATE_D1]      = PCI_D1,
> > +		[ACPI_STATE_D2]      = PCI_D2,
> > +		[ACPI_STATE_D3_HOT]  = PCI_D3hot,
> > +		[ACPI_STATE_D3_COLD] = PCI_D3cold,
> > +	};
> > +	int state;
> 
> ACPI_STATE_D3_HOT and ACPI_STATE_D3_COLD were introduced in ACPI 4.0.  For
> systems predating that, ACPI_STATE_D3_HOT is the deepest state returned by
> acpi_device_get_power().

Would it be possible to detect the ACPI spec version the platform
firmware conforms to, and amend acpi_device_get_power() to return
ACPI_STATE_D3_COLD if the device is in D3?

Then we could avoid the unnecessary runtime resume after direct_complete
also for these older machines.

Can the revision in the FADT (offset 8) be used as a proxy?
=> E.g. the old Clevo B7130 has revision 3 in the FADT and uses
   a _DSM and _PS3 to put the discrete GPU in D3cold:
   https://github.com/Lekensteyn/acpi-stuff/tree/master/dsl/Clevo_B7130
=> Whereas the newer Clevo P651RA has revision 5 in the FADT and uses
   _PR3 to put the discrete GPU in D3cold:
   https://github.com/Lekensteyn/acpi-stuff/tree/master/dsl/Clevo_P651RA

However the FADT revision was already 4 in the ACPI 3.0 spec,
so we can only use it to discern ACPI 2.0 vs 3.0, not 3.0 vs 4.0,
which is what we'd actually want.  And there's a comment in
acpica/tbfadt.c that "The FADT revision value is unreliable."

Do you know of a better way to discern ACPI 3.0 vs 4.0?

> 
> > +
> > +	if (!adev || !acpi_device_power_manageable(adev))
> > +		return PCI_UNKNOWN;
> > +
> > +	if (acpi_device_get_power(adev, &state) || state < ACPI_STATE_D0
> > +						|| state > ACPI_STATE_D3_COLD)
> 
> If the device is power-manageable by ACPI (you've just checked that) and
> acpi_device_get_power() returns success (0), the returned state is guaranteed
> to be within the boundaries (if it isn't, there is a bug that needs to be
> fixed).

No, acpi_device_get_power() can also return ACPI_STATE_UNKNOWN, which has
the value 0xff.  I could add that to state_conv[] above but then I'd have
an array with 256 integers on the stack, most of them 0, which I don't want.
I could check for != ACPI_STATE_UNKNOWN but checking the boundaries seemed
safer. So I maintain that the code is correct.

> 
> > +		return PCI_UNKNOWN;
> > +
> > +	return state_conv[state];
> > +}
> > +
> >  static bool acpi_pci_can_wakeup(struct pci_dev *dev)
> >  {
> >  	struct acpi_device *adev = ACPI_COMPANION(&dev->dev);
> > @@ -534,6 +556,7 @@ static bool acpi_pci_need_resume(struct pci_dev *dev)
> >  static const struct pci_platform_pm_ops acpi_pci_platform_pm = {
> >  	.is_manageable = acpi_pci_power_manageable,
> >  	.set_state = acpi_pci_set_power_state,
> > +	.get_state = acpi_pci_get_power_state,
> >  	.choose_state = acpi_pci_choose_state,
> >  	.sleep_wake = acpi_pci_sleep_wake,
> >  	.run_wake = acpi_pci_run_wake,
> > diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> > index 72a9d3a..e52e3d4 100644
> > --- a/drivers/pci/pci.c
> > +++ b/drivers/pci/pci.c
> > @@ -552,8 +552,9 @@ static const struct pci_platform_pm_ops *pci_platform_pm;
> >  
> >  int pci_set_platform_pm(const struct pci_platform_pm_ops *ops)
> >  {
> > -	if (!ops->is_manageable || !ops->set_state || !ops->choose_state ||
> > -	    !ops->sleep_wake || !ops->run_wake || !ops->need_resume)
> > +	if (!ops->is_manageable || !ops->set_state  || !ops->get_state ||
> > +	    !ops->choose_state  || !ops->sleep_wake || !ops->run_wake  ||
> > +	    !ops->need_resume)
> >  		return -EINVAL;
> >  	pci_platform_pm = ops;
> >  	return 0;
> > @@ -570,6 +571,11 @@ static inline int platform_pci_set_power_state(struct pci_dev *dev,
> >  	return pci_platform_pm ? pci_platform_pm->set_state(dev, t) : -ENOSYS;
> >  }
> >  
> > +static inline pci_power_t platform_pci_get_power_state(struct pci_dev *dev)
> > +{
> > +	return pci_platform_pm ? pci_platform_pm->get_state(dev) : PCI_UNKNOWN;
> > +}
> > +
> >  static inline pci_power_t platform_pci_choose_state(struct pci_dev *dev)
> >  {
> >  	return pci_platform_pm ?
> > @@ -701,14 +707,19 @@ static int pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state)
> >  }
> >  
> >  /**
> > - * pci_update_current_state - Read PCI power state of given device from its
> > - *                            PCI PM registers and cache it
> > + * pci_update_current_state - Read power state of given device and cache it
> >   * @dev: PCI device to handle.
> >   * @state: State to cache in case the device doesn't have the PM capability
> > + *
> > + * The power state is read from the PMCSR register, which however is
> > + * inaccessible in D3cold. The platform firmware is therefore queried first
> > + * to detect accessibility of the register.
> >   */
> >  void pci_update_current_state(struct pci_dev *dev, pci_power_t state)
> >  {
> > -	if (dev->pm_cap) {
> > +	if (platform_pci_get_power_state(dev) == PCI_D3cold) {
> > +		dev->current_state = PCI_D3cold;
> > +	} else if (dev->pm_cap) {
> 
> Why exactly do you need to change this function?

It would be pointless to add the ->platform_pci_get_power_state
hook without using it anywhere, wouldn't it?

I am adding this here so that I can call pci_update_current_state()
in patch [3/4] to compare the device's state after system sleep with
the one before, and be able to discern D3hot and D3cold properly
(as explained in the commit message above).

That said, I need to amend the patch to remove this portion in
pci_update_current_state():

                if (dev->current_state == PCI_D3cold)
                        return;

because otherwise we'd never try to read the PMCSR if the firmware
says the device is in <= D3hot.

Thanks,

Lukas

> 
> >  		u16 pmcsr;
> >  
> >  		/*
> > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> > index 9730c47..01d5206 100644
> > --- a/drivers/pci/pci.h
> > +++ b/drivers/pci/pci.h
> > @@ -42,6 +42,8 @@ int pci_probe_reset_function(struct pci_dev *dev);
> >   *
> >   * @set_state: invokes the platform firmware to set the device's power state
> >   *
> > + * @get_state: queries the platform firmware for a device's current power state
> > + *
> >   * @choose_state: returns PCI power state of given device preferred by the
> >   *                platform; to be used during system-wide transitions from a
> >   *                sleeping state to the working state and vice versa
> > @@ -62,6 +64,7 @@ int pci_probe_reset_function(struct pci_dev *dev);
> >  struct pci_platform_pm_ops {
> >  	bool (*is_manageable)(struct pci_dev *dev);
> >  	int (*set_state)(struct pci_dev *dev, pci_power_t state);
> > +	pci_power_t (*get_state)(struct pci_dev *dev);
> >  	pci_power_t (*choose_state)(struct pci_dev *dev);
> >  	int (*sleep_wake)(struct pci_dev *dev, bool enable);
> >  	int (*run_wake)(struct pci_dev *dev, bool enable);
> > 
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki Sept. 14, 2016, 1:01 p.m. UTC | #3
On Wednesday, September 14, 2016 12:50:24 PM Lukas Wunner wrote:
> On Wed, Sep 14, 2016 at 02:21:30AM +0200, Rafael J. Wysocki wrote:
> > On Wednesday, August 31, 2016 08:15:18 AM Lukas Wunner wrote:
> > > Usually the most accurate way to determine a PCI device's power state is
> > > to read its PM Control & Status Register.  There are two cases however
> > > when this is not an option:  If the device doesn't have the PM
> > > capability at all, or if it is in D3cold.
> > > 
> > > In D3cold, reading from config space typically results in a fabricated
> > > "all ones" response.  But in D3hot, the two bits representing the power
> > > state in the PMCSR are *also* set to 1.  Thus D3hot and D3cold are not
> > > discernible by just reading the PMCSR.
> > > 
> > > A (supposedly) reliable way to detect D3cold is to query the platform
> > > firmware for its opinion on the device's power state.  To this end,
> > > add a ->get_power callback to struct pci_platform_pm_ops, and an
> > > implementation to acpi_pci_platform_pm.  (The only pci_platform_pm_ops
> > > existing so far).
> > > 
> > > Amend pci_update_current_state() to query the platform firmware before
> > > reading the PMCSR.
> > > 
> > > Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
> > > Signed-off-by: Lukas Wunner <lukas@wunner.de>
> > > ---
> > >  drivers/pci/pci-acpi.c | 23 +++++++++++++++++++++++
> > >  drivers/pci/pci.c      | 21 ++++++++++++++++-----
> > >  drivers/pci/pci.h      |  3 +++
> > >  3 files changed, 42 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
> > > index 9a033e8..89f2707 100644
> > > --- a/drivers/pci/pci-acpi.c
> > > +++ b/drivers/pci/pci-acpi.c
> > > @@ -452,6 +452,28 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
> > >  	return error;
> > >  }
> > >  
> > > +static pci_power_t acpi_pci_get_power_state(struct pci_dev *dev)
> > > +{
> > > +	struct acpi_device *adev = ACPI_COMPANION(&dev->dev);
> > > +	static const pci_power_t state_conv[] = {
> > > +		[ACPI_STATE_D0]      = PCI_D0,
> > > +		[ACPI_STATE_D1]      = PCI_D1,
> > > +		[ACPI_STATE_D2]      = PCI_D2,
> > > +		[ACPI_STATE_D3_HOT]  = PCI_D3hot,
> > > +		[ACPI_STATE_D3_COLD] = PCI_D3cold,
> > > +	};
> > > +	int state;
> > 
> > ACPI_STATE_D3_HOT and ACPI_STATE_D3_COLD were introduced in ACPI 4.0.  For
> > systems predating that, ACPI_STATE_D3_HOT is the deepest state returned by
> > acpi_device_get_power().
> 
> Would it be possible to detect the ACPI spec version the platform
> firmware conforms to, and amend acpi_device_get_power() to return
> ACPI_STATE_D3_COLD if the device is in D3?
> 
> Then we could avoid the unnecessary runtime resume after direct_complete
> also for these older machines.

Well, please see below.

> Can the revision in the FADT (offset 8) be used as a proxy?
> => E.g. the old Clevo B7130 has revision 3 in the FADT and uses
>    a _DSM and _PS3 to put the discrete GPU in D3cold:
>    https://github.com/Lekensteyn/acpi-stuff/tree/master/dsl/Clevo_B7130
> => Whereas the newer Clevo P651RA has revision 5 in the FADT and uses
>    _PR3 to put the discrete GPU in D3cold:
>    https://github.com/Lekensteyn/acpi-stuff/tree/master/dsl/Clevo_P651RA
> 
> However the FADT revision was already 4 in the ACPI 3.0 spec,
> so we can only use it to discern ACPI 2.0 vs 3.0, not 3.0 vs 4.0,
> which is what we'd actually want.  And there's a comment in
> acpica/tbfadt.c that "The FADT revision value is unreliable."
> 
> Do you know of a better way to discern ACPI 3.0 vs 4.0?

I'm not sure if there is a reliable way to be honest.

Also even if the FADT etc tells you something, there's no guarantee that AML
actually follows that.

In fact, whether or not acpi_device_get_power() will ever return D3cold
for a device depends on whether or not _PR3 is there.  If it's not there,
the deepest state returned will be D3hot in any case.

So I'm not sure how useful that is in the context of D3cold detection?

> > 
> > > +
> > > +	if (!adev || !acpi_device_power_manageable(adev))
> > > +		return PCI_UNKNOWN;
> > > +
> > > +	if (acpi_device_get_power(adev, &state) || state < ACPI_STATE_D0
> > > +						|| state > ACPI_STATE_D3_COLD)
> > 
> > If the device is power-manageable by ACPI (you've just checked that) and
> > acpi_device_get_power() returns success (0), the returned state is guaranteed
> > to be within the boundaries (if it isn't, there is a bug that needs to be
> > fixed).
> 
> No, acpi_device_get_power() can also return ACPI_STATE_UNKNOWN, which has
> the value 0xff.

That would be the case without power resources or _PSC, right?

> I could add that to state_conv[] above but then I'd have
> an array with 256 integers on the stack, most of them 0, which I don't want.
> I could check for != ACPI_STATE_UNKNOWN but checking the boundaries seemed
> safer.

Well, I'm not sure in what way it is safer and you get one check instead of
two. :-)

> So I maintain that the code is correct.

It simply contains an redundant check.

> > 
> > > +		return PCI_UNKNOWN;
> > > +
> > > +	return state_conv[state];
> > > +}
> > > +
> > >  static bool acpi_pci_can_wakeup(struct pci_dev *dev)
> > >  {
> > >  	struct acpi_device *adev = ACPI_COMPANION(&dev->dev);
> > > @@ -534,6 +556,7 @@ static bool acpi_pci_need_resume(struct pci_dev *dev)
> > >  static const struct pci_platform_pm_ops acpi_pci_platform_pm = {
> > >  	.is_manageable = acpi_pci_power_manageable,
> > >  	.set_state = acpi_pci_set_power_state,
> > > +	.get_state = acpi_pci_get_power_state,
> > >  	.choose_state = acpi_pci_choose_state,
> > >  	.sleep_wake = acpi_pci_sleep_wake,
> > >  	.run_wake = acpi_pci_run_wake,
> > > diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> > > index 72a9d3a..e52e3d4 100644
> > > --- a/drivers/pci/pci.c
> > > +++ b/drivers/pci/pci.c
> > > @@ -552,8 +552,9 @@ static const struct pci_platform_pm_ops *pci_platform_pm;
> > >  
> > >  int pci_set_platform_pm(const struct pci_platform_pm_ops *ops)
> > >  {
> > > -	if (!ops->is_manageable || !ops->set_state || !ops->choose_state ||
> > > -	    !ops->sleep_wake || !ops->run_wake || !ops->need_resume)
> > > +	if (!ops->is_manageable || !ops->set_state  || !ops->get_state ||
> > > +	    !ops->choose_state  || !ops->sleep_wake || !ops->run_wake  ||
> > > +	    !ops->need_resume)
> > >  		return -EINVAL;
> > >  	pci_platform_pm = ops;
> > >  	return 0;
> > > @@ -570,6 +571,11 @@ static inline int platform_pci_set_power_state(struct pci_dev *dev,
> > >  	return pci_platform_pm ? pci_platform_pm->set_state(dev, t) : -ENOSYS;
> > >  }
> > >  
> > > +static inline pci_power_t platform_pci_get_power_state(struct pci_dev *dev)
> > > +{
> > > +	return pci_platform_pm ? pci_platform_pm->get_state(dev) : PCI_UNKNOWN;
> > > +}
> > > +
> > >  static inline pci_power_t platform_pci_choose_state(struct pci_dev *dev)
> > >  {
> > >  	return pci_platform_pm ?
> > > @@ -701,14 +707,19 @@ static int pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state)
> > >  }
> > >  
> > >  /**
> > > - * pci_update_current_state - Read PCI power state of given device from its
> > > - *                            PCI PM registers and cache it
> > > + * pci_update_current_state - Read power state of given device and cache it
> > >   * @dev: PCI device to handle.
> > >   * @state: State to cache in case the device doesn't have the PM capability
> > > + *
> > > + * The power state is read from the PMCSR register, which however is
> > > + * inaccessible in D3cold. The platform firmware is therefore queried first
> > > + * to detect accessibility of the register.
> > >   */
> > >  void pci_update_current_state(struct pci_dev *dev, pci_power_t state)
> > >  {
> > > -	if (dev->pm_cap) {
> > > +	if (platform_pci_get_power_state(dev) == PCI_D3cold) {
> > > +		dev->current_state = PCI_D3cold;
> > > +	} else if (dev->pm_cap) {
> > 
> > Why exactly do you need to change this function?
> 
> It would be pointless to add the ->platform_pci_get_power_state
> hook without using it anywhere, wouldn't it?

That depends on what you want to do with it next.  Callers may be added in
separate patches, no problem with that.

> I am adding this here so that I can call pci_update_current_state()
> in patch [3/4] to compare the device's state after system sleep with
> the one before, and be able to discern D3hot and D3cold properly
> (as explained in the commit message above).
> 
> That said, I need to amend the patch to remove this portion in
> pci_update_current_state():
> 
>                 if (dev->current_state == PCI_D3cold)
>                         return;
> 
> because otherwise we'd never try to read the PMCSR if the firmware
> says the device is in <= D3hot.

pci_update_current_state() is called in a few places with assumptions that need
to be re-evaluated to see if they still hold after the changes.  They probably
do, but then again, these call sites need to be double checked.  If you did
that, then fine.

But I would say that (a) the next patch will add detection of the real
post-resume power state of "direct_complete" devices and it will use
pci_update_current_state() for that (to be consistent with the rest of
the code), so (b) pci_update_current_state() needs to ask ACPI about its
view on the power state of the device for this purpose.

And if you want to change that logic for PCI, it would be good to change it
in the same way for acpi_general_pm_domain which has exactly the same problem.

Thanks,
Rafael

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki Sept. 14, 2016, 4:47 p.m. UTC | #4
On Wednesday, September 14, 2016 03:01:33 PM Rafael J. Wysocki wrote:
> On Wednesday, September 14, 2016 12:50:24 PM Lukas Wunner wrote:
> > On Wed, Sep 14, 2016 at 02:21:30AM +0200, Rafael J. Wysocki wrote:
> > > On Wednesday, August 31, 2016 08:15:18 AM Lukas Wunner wrote:
> > > > Usually the most accurate way to determine a PCI device's power state is
> > > > to read its PM Control & Status Register.  There are two cases however
> > > > when this is not an option:  If the device doesn't have the PM
> > > > capability at all, or if it is in D3cold.
> > > > 
> > > > In D3cold, reading from config space typically results in a fabricated
> > > > "all ones" response.  But in D3hot, the two bits representing the power
> > > > state in the PMCSR are *also* set to 1.  Thus D3hot and D3cold are not
> > > > discernible by just reading the PMCSR.
> > > > 
> > > > A (supposedly) reliable way to detect D3cold is to query the platform
> > > > firmware for its opinion on the device's power state.  To this end,
> > > > add a ->get_power callback to struct pci_platform_pm_ops, and an
> > > > implementation to acpi_pci_platform_pm.  (The only pci_platform_pm_ops
> > > > existing so far).
> > > > 
> > > > Amend pci_update_current_state() to query the platform firmware before
> > > > reading the PMCSR.
> > > > 
> > > > Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
> > > > Signed-off-by: Lukas Wunner <lukas@wunner.de>
> > > > ---
> > > >  drivers/pci/pci-acpi.c | 23 +++++++++++++++++++++++
> > > >  drivers/pci/pci.c      | 21 ++++++++++++++++-----
> > > >  drivers/pci/pci.h      |  3 +++
> > > >  3 files changed, 42 insertions(+), 5 deletions(-)
> > > > 
> > > > diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
> > > > index 9a033e8..89f2707 100644
> > > > --- a/drivers/pci/pci-acpi.c
> > > > +++ b/drivers/pci/pci-acpi.c
> > > > @@ -452,6 +452,28 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
> > > >  	return error;
> > > >  }
> > > >  
> > > > +static pci_power_t acpi_pci_get_power_state(struct pci_dev *dev)
> > > > +{
> > > > +	struct acpi_device *adev = ACPI_COMPANION(&dev->dev);
> > > > +	static const pci_power_t state_conv[] = {
> > > > +		[ACPI_STATE_D0]      = PCI_D0,
> > > > +		[ACPI_STATE_D1]      = PCI_D1,
> > > > +		[ACPI_STATE_D2]      = PCI_D2,
> > > > +		[ACPI_STATE_D3_HOT]  = PCI_D3hot,
> > > > +		[ACPI_STATE_D3_COLD] = PCI_D3cold,
> > > > +	};
> > > > +	int state;
> > > 
> > > ACPI_STATE_D3_HOT and ACPI_STATE_D3_COLD were introduced in ACPI 4.0.  For
> > > systems predating that, ACPI_STATE_D3_HOT is the deepest state returned by
> > > acpi_device_get_power().
> > 
> > Would it be possible to detect the ACPI spec version the platform
> > firmware conforms to, and amend acpi_device_get_power() to return
> > ACPI_STATE_D3_COLD if the device is in D3?
> > 
> > Then we could avoid the unnecessary runtime resume after direct_complete
> > also for these older machines.
> 
> Well, please see below.
> 
> > Can the revision in the FADT (offset 8) be used as a proxy?
> > => E.g. the old Clevo B7130 has revision 3 in the FADT and uses
> >    a _DSM and _PS3 to put the discrete GPU in D3cold:
> >    https://github.com/Lekensteyn/acpi-stuff/tree/master/dsl/Clevo_B7130
> > => Whereas the newer Clevo P651RA has revision 5 in the FADT and uses
> >    _PR3 to put the discrete GPU in D3cold:
> >    https://github.com/Lekensteyn/acpi-stuff/tree/master/dsl/Clevo_P651RA
> > 
> > However the FADT revision was already 4 in the ACPI 3.0 spec,
> > so we can only use it to discern ACPI 2.0 vs 3.0, not 3.0 vs 4.0,
> > which is what we'd actually want.  And there's a comment in
> > acpica/tbfadt.c that "The FADT revision value is unreliable."
> > 
> > Do you know of a better way to discern ACPI 3.0 vs 4.0?
> 
> I'm not sure if there is a reliable way to be honest.
> 
> Also even if the FADT etc tells you something, there's no guarantee that AML
> actually follows that.
> 
> In fact, whether or not acpi_device_get_power() will ever return D3cold
> for a device depends on whether or not _PR3 is there.  If it's not there,
> the deepest state returned will be D3hot in any case.
> 
> So I'm not sure how useful that is in the context of D3cold detection?

There is more to this.

In fact, we can't really trust _PSC too, because in some ACPI tables it is
implemented to always return 0 (seriously).

For this reason, I think the way to go would be to use something like
acpi_device_update_power() to ensure that the device really is in the
state we think it is in.

Thanks,
Rafael

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki Sept. 14, 2016, 9:36 p.m. UTC | #5
On Wednesday, September 14, 2016 06:47:40 PM Rafael J. Wysocki wrote:
> On Wednesday, September 14, 2016 03:01:33 PM Rafael J. Wysocki wrote:
> > On Wednesday, September 14, 2016 12:50:24 PM Lukas Wunner wrote:
> > > On Wed, Sep 14, 2016 at 02:21:30AM +0200, Rafael J. Wysocki wrote:
> > > > On Wednesday, August 31, 2016 08:15:18 AM Lukas Wunner wrote:
> > > > > Usually the most accurate way to determine a PCI device's power state is
> > > > > to read its PM Control & Status Register.  There are two cases however
> > > > > when this is not an option:  If the device doesn't have the PM
> > > > > capability at all, or if it is in D3cold.
> > > > > 
> > > > > In D3cold, reading from config space typically results in a fabricated
> > > > > "all ones" response.  But in D3hot, the two bits representing the power
> > > > > state in the PMCSR are *also* set to 1.  Thus D3hot and D3cold are not
> > > > > discernible by just reading the PMCSR.
> > > > > 
> > > > > A (supposedly) reliable way to detect D3cold is to query the platform
> > > > > firmware for its opinion on the device's power state.  To this end,
> > > > > add a ->get_power callback to struct pci_platform_pm_ops, and an
> > > > > implementation to acpi_pci_platform_pm.  (The only pci_platform_pm_ops
> > > > > existing so far).
> > > > > 
> > > > > Amend pci_update_current_state() to query the platform firmware before
> > > > > reading the PMCSR.
> > > > > 
> > > > > Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
> > > > > Signed-off-by: Lukas Wunner <lukas@wunner.de>
> > > > > ---
> > > > >  drivers/pci/pci-acpi.c | 23 +++++++++++++++++++++++
> > > > >  drivers/pci/pci.c      | 21 ++++++++++++++++-----
> > > > >  drivers/pci/pci.h      |  3 +++
> > > > >  3 files changed, 42 insertions(+), 5 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
> > > > > index 9a033e8..89f2707 100644
> > > > > --- a/drivers/pci/pci-acpi.c
> > > > > +++ b/drivers/pci/pci-acpi.c
> > > > > @@ -452,6 +452,28 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
> > > > >  	return error;
> > > > >  }
> > > > >  
> > > > > +static pci_power_t acpi_pci_get_power_state(struct pci_dev *dev)
> > > > > +{
> > > > > +	struct acpi_device *adev = ACPI_COMPANION(&dev->dev);
> > > > > +	static const pci_power_t state_conv[] = {
> > > > > +		[ACPI_STATE_D0]      = PCI_D0,
> > > > > +		[ACPI_STATE_D1]      = PCI_D1,
> > > > > +		[ACPI_STATE_D2]      = PCI_D2,
> > > > > +		[ACPI_STATE_D3_HOT]  = PCI_D3hot,
> > > > > +		[ACPI_STATE_D3_COLD] = PCI_D3cold,
> > > > > +	};
> > > > > +	int state;
> > > > 
> > > > ACPI_STATE_D3_HOT and ACPI_STATE_D3_COLD were introduced in ACPI 4.0.  For
> > > > systems predating that, ACPI_STATE_D3_HOT is the deepest state returned by
> > > > acpi_device_get_power().
> > > 
> > > Would it be possible to detect the ACPI spec version the platform
> > > firmware conforms to, and amend acpi_device_get_power() to return
> > > ACPI_STATE_D3_COLD if the device is in D3?
> > > 
> > > Then we could avoid the unnecessary runtime resume after direct_complete
> > > also for these older machines.
> > 
> > Well, please see below.
> > 
> > > Can the revision in the FADT (offset 8) be used as a proxy?
> > > => E.g. the old Clevo B7130 has revision 3 in the FADT and uses
> > >    a _DSM and _PS3 to put the discrete GPU in D3cold:
> > >    https://github.com/Lekensteyn/acpi-stuff/tree/master/dsl/Clevo_B7130
> > > => Whereas the newer Clevo P651RA has revision 5 in the FADT and uses
> > >    _PR3 to put the discrete GPU in D3cold:
> > >    https://github.com/Lekensteyn/acpi-stuff/tree/master/dsl/Clevo_P651RA
> > > 
> > > However the FADT revision was already 4 in the ACPI 3.0 spec,
> > > so we can only use it to discern ACPI 2.0 vs 3.0, not 3.0 vs 4.0,
> > > which is what we'd actually want.  And there's a comment in
> > > acpica/tbfadt.c that "The FADT revision value is unreliable."
> > > 
> > > Do you know of a better way to discern ACPI 3.0 vs 4.0?
> > 
> > I'm not sure if there is a reliable way to be honest.
> > 
> > Also even if the FADT etc tells you something, there's no guarantee that AML
> > actually follows that.
> > 
> > In fact, whether or not acpi_device_get_power() will ever return D3cold
> > for a device depends on whether or not _PR3 is there.  If it's not there,
> > the deepest state returned will be D3hot in any case.
> > 
> > So I'm not sure how useful that is in the context of D3cold detection?
> 
> There is more to this.
> 
> In fact, we can't really trust _PSC too, because in some ACPI tables it is
> implemented to always return 0 (seriously).
> 
> For this reason, I think the way to go would be to use something like
> acpi_device_update_power() to ensure that the device really is in the
> state we think it is in.

Actually, it may be sufficient to simply check if the device is in any
state different from D0 and leave it alone if that's the case.

For that, acpi_device_update_power() should be good enough and the PCI
native part would be simpler too I think.

Thanks,
Rafael

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki Sept. 14, 2016, 9:47 p.m. UTC | #6
On Wednesday, September 14, 2016 11:36:40 PM Rafael J. Wysocki wrote:
> On Wednesday, September 14, 2016 06:47:40 PM Rafael J. Wysocki wrote:
> > On Wednesday, September 14, 2016 03:01:33 PM Rafael J. Wysocki wrote:
> > > On Wednesday, September 14, 2016 12:50:24 PM Lukas Wunner wrote:
> > > > On Wed, Sep 14, 2016 at 02:21:30AM +0200, Rafael J. Wysocki wrote:
> > > > > On Wednesday, August 31, 2016 08:15:18 AM Lukas Wunner wrote:
> > > > > > Usually the most accurate way to determine a PCI device's power state is
> > > > > > to read its PM Control & Status Register.  There are two cases however
> > > > > > when this is not an option:  If the device doesn't have the PM
> > > > > > capability at all, or if it is in D3cold.
> > > > > > 
> > > > > > In D3cold, reading from config space typically results in a fabricated
> > > > > > "all ones" response.  But in D3hot, the two bits representing the power
> > > > > > state in the PMCSR are *also* set to 1.  Thus D3hot and D3cold are not
> > > > > > discernible by just reading the PMCSR.
> > > > > > 
> > > > > > A (supposedly) reliable way to detect D3cold is to query the platform
> > > > > > firmware for its opinion on the device's power state.  To this end,
> > > > > > add a ->get_power callback to struct pci_platform_pm_ops, and an
> > > > > > implementation to acpi_pci_platform_pm.  (The only pci_platform_pm_ops
> > > > > > existing so far).
> > > > > > 
> > > > > > Amend pci_update_current_state() to query the platform firmware before
> > > > > > reading the PMCSR.
> > > > > > 
> > > > > > Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
> > > > > > Signed-off-by: Lukas Wunner <lukas@wunner.de>
> > > > > > ---
> > > > > >  drivers/pci/pci-acpi.c | 23 +++++++++++++++++++++++
> > > > > >  drivers/pci/pci.c      | 21 ++++++++++++++++-----
> > > > > >  drivers/pci/pci.h      |  3 +++
> > > > > >  3 files changed, 42 insertions(+), 5 deletions(-)
> > > > > > 
> > > > > > diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
> > > > > > index 9a033e8..89f2707 100644
> > > > > > --- a/drivers/pci/pci-acpi.c
> > > > > > +++ b/drivers/pci/pci-acpi.c
> > > > > > @@ -452,6 +452,28 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
> > > > > >  	return error;
> > > > > >  }
> > > > > >  
> > > > > > +static pci_power_t acpi_pci_get_power_state(struct pci_dev *dev)
> > > > > > +{
> > > > > > +	struct acpi_device *adev = ACPI_COMPANION(&dev->dev);
> > > > > > +	static const pci_power_t state_conv[] = {
> > > > > > +		[ACPI_STATE_D0]      = PCI_D0,
> > > > > > +		[ACPI_STATE_D1]      = PCI_D1,
> > > > > > +		[ACPI_STATE_D2]      = PCI_D2,
> > > > > > +		[ACPI_STATE_D3_HOT]  = PCI_D3hot,
> > > > > > +		[ACPI_STATE_D3_COLD] = PCI_D3cold,
> > > > > > +	};
> > > > > > +	int state;
> > > > > 
> > > > > ACPI_STATE_D3_HOT and ACPI_STATE_D3_COLD were introduced in ACPI 4.0.  For
> > > > > systems predating that, ACPI_STATE_D3_HOT is the deepest state returned by
> > > > > acpi_device_get_power().
> > > > 
> > > > Would it be possible to detect the ACPI spec version the platform
> > > > firmware conforms to, and amend acpi_device_get_power() to return
> > > > ACPI_STATE_D3_COLD if the device is in D3?
> > > > 
> > > > Then we could avoid the unnecessary runtime resume after direct_complete
> > > > also for these older machines.
> > > 
> > > Well, please see below.
> > > 
> > > > Can the revision in the FADT (offset 8) be used as a proxy?
> > > > => E.g. the old Clevo B7130 has revision 3 in the FADT and uses
> > > >    a _DSM and _PS3 to put the discrete GPU in D3cold:
> > > >    https://github.com/Lekensteyn/acpi-stuff/tree/master/dsl/Clevo_B7130
> > > > => Whereas the newer Clevo P651RA has revision 5 in the FADT and uses
> > > >    _PR3 to put the discrete GPU in D3cold:
> > > >    https://github.com/Lekensteyn/acpi-stuff/tree/master/dsl/Clevo_P651RA
> > > > 
> > > > However the FADT revision was already 4 in the ACPI 3.0 spec,
> > > > so we can only use it to discern ACPI 2.0 vs 3.0, not 3.0 vs 4.0,
> > > > which is what we'd actually want.  And there's a comment in
> > > > acpica/tbfadt.c that "The FADT revision value is unreliable."
> > > > 
> > > > Do you know of a better way to discern ACPI 3.0 vs 4.0?
> > > 
> > > I'm not sure if there is a reliable way to be honest.
> > > 
> > > Also even if the FADT etc tells you something, there's no guarantee that AML
> > > actually follows that.
> > > 
> > > In fact, whether or not acpi_device_get_power() will ever return D3cold
> > > for a device depends on whether or not _PR3 is there.  If it's not there,
> > > the deepest state returned will be D3hot in any case.
> > > 
> > > So I'm not sure how useful that is in the context of D3cold detection?
> > 
> > There is more to this.
> > 
> > In fact, we can't really trust _PSC too, because in some ACPI tables it is
> > implemented to always return 0 (seriously).
> > 
> > For this reason, I think the way to go would be to use something like
> > acpi_device_update_power() to ensure that the device really is in the
> > state we think it is in.
> 
> Actually, it may be sufficient to simply check if the device is in any
> state different from D0 and leave it alone if that's the case.
> 
> For that, acpi_device_update_power() should be good enough and the PCI
> native part would be simpler too I think.

I meant acpi_device_get_power(), sorry.

Thanks,
Rafael

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lukas Wunner Sept. 14, 2016, 10:58 p.m. UTC | #7
On Wed, Sep 14, 2016 at 06:47:40PM +0200, Rafael J. Wysocki wrote:
> On Wednesday, September 14, 2016 03:01:33 PM Rafael J. Wysocki wrote:
> > On Wednesday, September 14, 2016 12:50:24 PM Lukas Wunner wrote:
> > > On Wed, Sep 14, 2016 at 02:21:30AM +0200, Rafael J. Wysocki wrote:
> > > > On Wednesday, August 31, 2016 08:15:18 AM Lukas Wunner wrote:
> > > > > Usually the most accurate way to determine a PCI device's power state
> > > > > is to read its PM Control & Status Register.  There are two cases
> > > > > however when this is not an option:  If the device doesn't have the
> > > > > PM capability at all, or if it is in D3cold.
> > > > > 
> > > > > In D3cold, reading from config space typically results in a
> > > > > fabricated "all ones" response.  But in D3hot, the two bits
> > > > > representing the power state in the PMCSR are *also* set to 1.
> > > > > Thus D3hot and D3cold are not discernible by just reading the PMCSR.
> > > > > 
> > > > > A (supposedly) reliable way to detect D3cold is to query the platform
> > > > > firmware for its opinion on the device's power state.  To this end,
> > > > > add a ->get_power callback to struct pci_platform_pm_ops, and an
> > > > > implementation to acpi_pci_platform_pm.  (The only
> > > > > pci_platform_pm_ops existing so far).
> > > > > 
> > > > > Amend pci_update_current_state() to query the platform firmware before
> > > > > reading the PMCSR.
> > > > > 
> > > > > Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
> > > > > Signed-off-by: Lukas Wunner <lukas@wunner.de>
> > > > > ---
> > > > >  drivers/pci/pci-acpi.c | 23 +++++++++++++++++++++++
> > > > >  drivers/pci/pci.c      | 21 ++++++++++++++++-----
> > > > >  drivers/pci/pci.h      |  3 +++
> > > > >  3 files changed, 42 insertions(+), 5 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
> > > > > index 9a033e8..89f2707 100644
> > > > > --- a/drivers/pci/pci-acpi.c
> > > > > +++ b/drivers/pci/pci-acpi.c
> > > > > @@ -452,6 +452,28 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
> > > > >  	return error;
> > > > >  }
> > > > >  
> > > > > +static pci_power_t acpi_pci_get_power_state(struct pci_dev *dev)
> > > > > +{
> > > > > +	struct acpi_device *adev = ACPI_COMPANION(&dev->dev);
> > > > > +	static const pci_power_t state_conv[] = {
> > > > > +		[ACPI_STATE_D0]      = PCI_D0,
> > > > > +		[ACPI_STATE_D1]      = PCI_D1,
> > > > > +		[ACPI_STATE_D2]      = PCI_D2,
> > > > > +		[ACPI_STATE_D3_HOT]  = PCI_D3hot,
> > > > > +		[ACPI_STATE_D3_COLD] = PCI_D3cold,
> > > > > +	};
> > > > > +	int state;
> > > > 
> > > > ACPI_STATE_D3_HOT and ACPI_STATE_D3_COLD were introduced in ACPI 4.0.
> > > > For systems predating that, ACPI_STATE_D3_HOT is the deepest state
> > > > returned by acpi_device_get_power().
> > > 
> > > Would it be possible to detect the ACPI spec version the platform
> > > firmware conforms to, and amend acpi_device_get_power() to return
> > > ACPI_STATE_D3_COLD if the device is in D3?
> > > 
> > > Then we could avoid the unnecessary runtime resume after direct_complete
> > > also for these older machines.
> > 
> > Well, please see below.
> > 
> > > Can the revision in the FADT (offset 8) be used as a proxy?
> > > => E.g. the old Clevo B7130 has revision 3 in the FADT and uses
> > >    a _DSM and _PS3 to put the discrete GPU in D3cold:
> > >    https://github.com/Lekensteyn/acpi-stuff/tree/master/dsl/Clevo_B7130
> > > => Whereas the newer Clevo P651RA has revision 5 in the FADT and uses
> > >    _PR3 to put the discrete GPU in D3cold:
> > >    https://github.com/Lekensteyn/acpi-stuff/tree/master/dsl/Clevo_P651RA
> > > 
> > > However the FADT revision was already 4 in the ACPI 3.0 spec,
> > > so we can only use it to discern ACPI 2.0 vs 3.0, not 3.0 vs 4.0,
> > > which is what we'd actually want.  And there's a comment in
> > > acpica/tbfadt.c that "The FADT revision value is unreliable."
> > > 
> > > Do you know of a better way to discern ACPI 3.0 vs 4.0?
> > 
> > I'm not sure if there is a reliable way to be honest.
> > 
> > Also even if the FADT etc tells you something, there's no guarantee that
> > AML actually follows that.
> > 
> > In fact, whether or not acpi_device_get_power() will ever return D3cold
> > for a device depends on whether or not _PR3 is there.  If it's not there,
> > the deepest state returned will be D3hot in any case.
> > 
> > So I'm not sure how useful that is in the context of D3cold detection?
> 
> There is more to this.
> 
> In fact, we can't really trust _PSC too, because in some ACPI tables it is
> implemented to always return 0 (seriously).
> 
> For this reason, I think the way to go would be to use something like
> acpi_device_update_power() to ensure that the device really is in the
> state we think it is in.

The patch only trusts the platform firmware to report D3cold correctly.
If it reports anything else the patch reads the PMCSR.

Thus if _PSC returns a bogus D0 status, worst that can happen is that we
incorrectly assume D3hot while the device is actually in D3cold.  Which
is still kind of a bummer.

We've got this handy pci_device_is_present() function which reads the
vendor ID and returns false if it's "all ones" (which is what we'd get
in D3cold).

What if we ask the platform firmware first.  If it says D3cold, assume
that's true.  In all other cases, call pci_device_is_present().  If
that returns false, assume D3cold.  Otherwise read the PMCSR.

That would also work for devices that are suspended to D3cold in a
nonstandard way, such as Thunderbolt.  Having a function that updates
the current_state in a robust way and correctly discerns D3cold and
D3hot would be really useful I think.

Thanks a lot for sharing these ACPI intricacies, it wasn't clear to me
in how far we can trust the platform firmware.

Best regards,

Lukas
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki Sept. 15, 2016, 12:49 a.m. UTC | #8
On Thursday, September 15, 2016 12:58:57 AM Lukas Wunner wrote:
> On Wed, Sep 14, 2016 at 06:47:40PM +0200, Rafael J. Wysocki wrote:
> > On Wednesday, September 14, 2016 03:01:33 PM Rafael J. Wysocki wrote:
> > > On Wednesday, September 14, 2016 12:50:24 PM Lukas Wunner wrote:
> > > > On Wed, Sep 14, 2016 at 02:21:30AM +0200, Rafael J. Wysocki wrote:
> > > > > On Wednesday, August 31, 2016 08:15:18 AM Lukas Wunner wrote:
> > > > > > Usually the most accurate way to determine a PCI device's power state
> > > > > > is to read its PM Control & Status Register.  There are two cases
> > > > > > however when this is not an option:  If the device doesn't have the
> > > > > > PM capability at all, or if it is in D3cold.
> > > > > > 
> > > > > > In D3cold, reading from config space typically results in a
> > > > > > fabricated "all ones" response.  But in D3hot, the two bits
> > > > > > representing the power state in the PMCSR are *also* set to 1.
> > > > > > Thus D3hot and D3cold are not discernible by just reading the PMCSR.
> > > > > > 
> > > > > > A (supposedly) reliable way to detect D3cold is to query the platform
> > > > > > firmware for its opinion on the device's power state.  To this end,
> > > > > > add a ->get_power callback to struct pci_platform_pm_ops, and an
> > > > > > implementation to acpi_pci_platform_pm.  (The only
> > > > > > pci_platform_pm_ops existing so far).
> > > > > > 
> > > > > > Amend pci_update_current_state() to query the platform firmware before
> > > > > > reading the PMCSR.
> > > > > > 
> > > > > > Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
> > > > > > Signed-off-by: Lukas Wunner <lukas@wunner.de>
> > > > > > ---
> > > > > >  drivers/pci/pci-acpi.c | 23 +++++++++++++++++++++++
> > > > > >  drivers/pci/pci.c      | 21 ++++++++++++++++-----
> > > > > >  drivers/pci/pci.h      |  3 +++
> > > > > >  3 files changed, 42 insertions(+), 5 deletions(-)
> > > > > > 
> > > > > > diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
> > > > > > index 9a033e8..89f2707 100644
> > > > > > --- a/drivers/pci/pci-acpi.c
> > > > > > +++ b/drivers/pci/pci-acpi.c
> > > > > > @@ -452,6 +452,28 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
> > > > > >  	return error;
> > > > > >  }
> > > > > >  
> > > > > > +static pci_power_t acpi_pci_get_power_state(struct pci_dev *dev)
> > > > > > +{
> > > > > > +	struct acpi_device *adev = ACPI_COMPANION(&dev->dev);
> > > > > > +	static const pci_power_t state_conv[] = {
> > > > > > +		[ACPI_STATE_D0]      = PCI_D0,
> > > > > > +		[ACPI_STATE_D1]      = PCI_D1,
> > > > > > +		[ACPI_STATE_D2]      = PCI_D2,
> > > > > > +		[ACPI_STATE_D3_HOT]  = PCI_D3hot,
> > > > > > +		[ACPI_STATE_D3_COLD] = PCI_D3cold,
> > > > > > +	};
> > > > > > +	int state;
> > > > > 
> > > > > ACPI_STATE_D3_HOT and ACPI_STATE_D3_COLD were introduced in ACPI 4.0.
> > > > > For systems predating that, ACPI_STATE_D3_HOT is the deepest state
> > > > > returned by acpi_device_get_power().
> > > > 
> > > > Would it be possible to detect the ACPI spec version the platform
> > > > firmware conforms to, and amend acpi_device_get_power() to return
> > > > ACPI_STATE_D3_COLD if the device is in D3?
> > > > 
> > > > Then we could avoid the unnecessary runtime resume after direct_complete
> > > > also for these older machines.
> > > 
> > > Well, please see below.
> > > 
> > > > Can the revision in the FADT (offset 8) be used as a proxy?
> > > > => E.g. the old Clevo B7130 has revision 3 in the FADT and uses
> > > >    a _DSM and _PS3 to put the discrete GPU in D3cold:
> > > >    https://github.com/Lekensteyn/acpi-stuff/tree/master/dsl/Clevo_B7130
> > > > => Whereas the newer Clevo P651RA has revision 5 in the FADT and uses
> > > >    _PR3 to put the discrete GPU in D3cold:
> > > >    https://github.com/Lekensteyn/acpi-stuff/tree/master/dsl/Clevo_P651RA
> > > > 
> > > > However the FADT revision was already 4 in the ACPI 3.0 spec,
> > > > so we can only use it to discern ACPI 2.0 vs 3.0, not 3.0 vs 4.0,
> > > > which is what we'd actually want.  And there's a comment in
> > > > acpica/tbfadt.c that "The FADT revision value is unreliable."
> > > > 
> > > > Do you know of a better way to discern ACPI 3.0 vs 4.0?
> > > 
> > > I'm not sure if there is a reliable way to be honest.
> > > 
> > > Also even if the FADT etc tells you something, there's no guarantee that
> > > AML actually follows that.
> > > 
> > > In fact, whether or not acpi_device_get_power() will ever return D3cold
> > > for a device depends on whether or not _PR3 is there.  If it's not there,
> > > the deepest state returned will be D3hot in any case.
> > > 
> > > So I'm not sure how useful that is in the context of D3cold detection?
> > 
> > There is more to this.
> > 
> > In fact, we can't really trust _PSC too, because in some ACPI tables it is
> > implemented to always return 0 (seriously).
> > 
> > For this reason, I think the way to go would be to use something like
> > acpi_device_update_power() to ensure that the device really is in the
> > state we think it is in.
> 
> The patch only trusts the platform firmware to report D3cold correctly.
> If it reports anything else the patch reads the PMCSR.
> 
> Thus if _PSC returns a bogus D0 status, worst that can happen is that we
> incorrectly assume D3hot while the device is actually in D3cold.  Which
> is still kind of a bummer.
> 
> We've got this handy pci_device_is_present() function which reads the
> vendor ID and returns false if it's "all ones" (which is what we'd get
> in D3cold).
> 
> What if we ask the platform firmware first.  If it says D3cold, assume
> that's true.  In all other cases, call pci_device_is_present().  If
> that returns false, assume D3cold.  Otherwise read the PMCSR.

Sounds reasonable.  We already use pci_device_is_present() for a similar
purpose in at least one place IIRC.

> That would also work for devices that are suspended to D3cold in a
> nonstandard way, such as Thunderbolt.  Having a function that updates
> the current_state in a robust way and correctly discerns D3cold and
> D3hot would be really useful I think.
> 
> Thanks a lot for sharing these ACPI intricacies, it wasn't clear to me
> in how far we can trust the platform firmware.

No problem.  Unfortunately, it often takes me some time to recall those
things.

Thanks,
Rafael

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lukas Wunner Sept. 17, 2016, 1:49 p.m. UTC | #9
On Wed, Sep 14, 2016 at 03:01:33PM +0200, Rafael J. Wysocki wrote:
> On Wednesday, September 14, 2016 12:50:24 PM Lukas Wunner wrote:
> > On Wed, Sep 14, 2016 at 02:21:30AM +0200, Rafael J. Wysocki wrote:
> > > On Wednesday, August 31, 2016 08:15:18 AM Lukas Wunner wrote:
> > > > +
> > > > +	if (!adev || !acpi_device_power_manageable(adev))
> > > > +		return PCI_UNKNOWN;
> > > > +
> > > > +	if (acpi_device_get_power(adev, &state) || state < ACPI_STATE_D0
> > > > +						|| state > ACPI_STATE_D3_COLD)
> > > 
> > > If the device is power-manageable by ACPI (you've just checked that) and
> > > acpi_device_get_power() returns success (0), the returned state is
> > > guaranteed to be within the boundaries (if it isn't, there is a bug that
> > > needs to be fixed).
> > 
> > No, acpi_device_get_power() can also return ACPI_STATE_UNKNOWN, which has
> > the value 0xff.
> 
> That would be the case without power resources or _PSC, right?

There's this check in acpi_device_get_power():

                else if (result == ACPI_STATE_UNKNOWN)
                        result = psc > ACPI_STATE_D2 ? ACPI_STATE_D3_HOT : psc;

where "result" has been set further up by acpi_power_get_inferred_state().
If acpi_power_get_inferred_state() does return this value and _PSC is
not present (i.e. device->power.flags.explicit_get is not set),
then acpi_device_get_power() would return ACPI_STATE_UNKNOWN.

Also, if the device is not power manageable, has neither power resources
nor _PSC, and its parent has power state ACPI_STATE_UNKNOWN, then this
will be returned.


> > I could add that to state_conv[] above but then I'd have an array with
> > 256 integers on the stack, most of them 0, which I don't want.
> > I could check for != ACPI_STATE_UNKNOWN but checking the boundaries seemed
> > safer.
> 
> Well, I'm not sure in what way it is safer and you get one check instead of
> two. :-)
> 
> > So I maintain that the code is correct.
> 
> It simply contains an redundant check.

Fair enough, I'm only checking for ACPI_STATE_UNKNOWN now.


> > > >  void pci_update_current_state(struct pci_dev *dev, pci_power_t state)
> > > >  {
> > > > -	if (dev->pm_cap) {
> > > > +	if (platform_pci_get_power_state(dev) == PCI_D3cold) {
> > > > +		dev->current_state = PCI_D3cold;
> > > > +	} else if (dev->pm_cap) {
> > > 
> > > Why exactly do you need to change this function?
> > 
> > It would be pointless to add the ->platform_pci_get_power_state
> > hook without using it anywhere, wouldn't it?
> 
> That depends on what you want to do with it next.  Callers may be added in
> separate patches, no problem with that.

I've moved the change of pci_update_current_state() to a separate patch now,
that will also make it easier to revert it, should anything blow up.


> pci_update_current_state() is called in a few places with assumptions that
> need to be re-evaluated to see if they still hold after the changes.  They
> probably do, but then again, these call sites need to be double checked.
> If you did that, then fine.

pci_update_current_state() is called whenever a device is resumed
(both runtime and after system sleep) and after changing its power
state using the platform in pci_platform_power_transition().

With the new patch, pci_update_current_state() will be more accurate
than it is now:  Laptop hybrid graphics which are not platform-power-
manageable (older Optimus/ATPX or current MacBook Pro) will power
down the GPU but its current_state will be D3hot.  That's because
nouveau/radeon call pci_set_power_state(pdev, PCI_D3cold) and the
PCI core will only put it in D3hot due to the lack of platform-power-
manageability.  When the device is resumed, pci_update_current_state()
will now change the current_state from D3hot to D3cold.  I'm actually
seeing this on my MacBook Pro.  When the system is subsequently put
to sleep, direct_complete will still be afforded despite the changed
current_state because of the first patch in this series.  Works like
a charm, I'm curious if this causes issues for others, but I doubt it.


> And if you want to change that logic for PCI, it would be good to change it
> in the same way for acpi_general_pm_domain which has exactly the same
> problem.

Hm, with PCI I can read the PMCSR and detect D3cold by reading the
vendor ID.  For generic ACPI devices, I don't have that so I have
to rely on the accuracy of acpi_device_get_power().  However as
you've pointed out, it might misrepresent D3cold as D3hot, and it
might incorrectly report D0 even though the device is in a different
state.  Is it safe to rely on acpi_device_get_power() then?

Another idea would be to use acpi_device_get_power() for PCI devices
without PM capability.  Then we could do away with the "state"
argument to pci_update_current_state().  This too hinges on the
reliability of acpi_device_get_power() of course.  At least D3cold
can be detected by reading the vendor ID, so we're not reliant on
ACPI for that.  I've even got a commit on my development branch
to make that change, but I can't test it, my machine doesn't have
PCI devices without PM cap.

Thanks,

Lukas
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki Sept. 18, 2016, 1:09 a.m. UTC | #10
On Saturday, September 17, 2016 03:49:43 PM Lukas Wunner wrote:
> On Wed, Sep 14, 2016 at 03:01:33PM +0200, Rafael J. Wysocki wrote:
> > On Wednesday, September 14, 2016 12:50:24 PM Lukas Wunner wrote:
> > > On Wed, Sep 14, 2016 at 02:21:30AM +0200, Rafael J. Wysocki wrote:
> > > > On Wednesday, August 31, 2016 08:15:18 AM Lukas Wunner wrote:
> > > > > +
> > > > > +	if (!adev || !acpi_device_power_manageable(adev))
> > > > > +		return PCI_UNKNOWN;
> > > > > +
> > > > > +	if (acpi_device_get_power(adev, &state) || state < ACPI_STATE_D0
> > > > > +						|| state > ACPI_STATE_D3_COLD)
> > > > 
> > > > If the device is power-manageable by ACPI (you've just checked that) and
> > > > acpi_device_get_power() returns success (0), the returned state is
> > > > guaranteed to be within the boundaries (if it isn't, there is a bug that
> > > > needs to be fixed).
> > > 
> > > No, acpi_device_get_power() can also return ACPI_STATE_UNKNOWN, which has
> > > the value 0xff.
> > 
> > That would be the case without power resources or _PSC, right?
> 
> There's this check in acpi_device_get_power():
> 
>                 else if (result == ACPI_STATE_UNKNOWN)
>                         result = psc > ACPI_STATE_D2 ? ACPI_STATE_D3_HOT : psc;
> 
> where "result" has been set further up by acpi_power_get_inferred_state().
> If acpi_power_get_inferred_state() does return this value

It doesn't.

> and _PSC is not present (i.e. device->power.flags.explicit_get is not set),
> then acpi_device_get_power() would return ACPI_STATE_UNKNOWN.
> 
> Also, if the device is not power manageable,

You had a check for that.

> has neither power resources nor _PSC, and its parent has power state
> ACPI_STATE_UNKNOWN, then this will be returned.

With the power-manageable check upfront the only case I can see is when the
device has _PS0, it doesn't have _PR0 (ie. no power resources) and _PSC is not
present.

> > > I could add that to state_conv[] above but then I'd have an array with
> > > 256 integers on the stack, most of them 0, which I don't want.
> > > I could check for != ACPI_STATE_UNKNOWN but checking the boundaries seemed
> > > safer.
> > 
> > Well, I'm not sure in what way it is safer and you get one check instead of
> > two. :-)
> > 
> > > So I maintain that the code is correct.
> > 
> > It simply contains an redundant check.
> 
> Fair enough, I'm only checking for ACPI_STATE_UNKNOWN now.
> 
> 
> > > > >  void pci_update_current_state(struct pci_dev *dev, pci_power_t state)
> > > > >  {
> > > > > -	if (dev->pm_cap) {
> > > > > +	if (platform_pci_get_power_state(dev) == PCI_D3cold) {
> > > > > +		dev->current_state = PCI_D3cold;
> > > > > +	} else if (dev->pm_cap) {
> > > > 
> > > > Why exactly do you need to change this function?
> > > 
> > > It would be pointless to add the ->platform_pci_get_power_state
> > > hook without using it anywhere, wouldn't it?
> > 
> > That depends on what you want to do with it next.  Callers may be added in
> > separate patches, no problem with that.
> 
> I've moved the change of pci_update_current_state() to a separate patch now,
> that will also make it easier to revert it, should anything blow up.
> 
> 
> > pci_update_current_state() is called in a few places with assumptions that
> > need to be re-evaluated to see if they still hold after the changes.  They
> > probably do, but then again, these call sites need to be double checked.
> > If you did that, then fine.
> 
> pci_update_current_state() is called whenever a device is resumed
> (both runtime and after system sleep) and after changing its power
> state using the platform in pci_platform_power_transition().
> 
> With the new patch, pci_update_current_state() will be more accurate
> than it is now:  Laptop hybrid graphics which are not platform-power-
> manageable (older Optimus/ATPX or current MacBook Pro) will power
> down the GPU but its current_state will be D3hot.  That's because
> nouveau/radeon call pci_set_power_state(pdev, PCI_D3cold) and the
> PCI core will only put it in D3hot due to the lack of platform-power-
> manageability.  When the device is resumed, pci_update_current_state()
> will now change the current_state from D3hot to D3cold.  I'm actually
> seeing this on my MacBook Pro.  When the system is subsequently put
> to sleep, direct_complete will still be afforded despite the changed
> current_state because of the first patch in this series.  Works like
> a charm, I'm curious if this causes issues for others, but I doubt it.

Well, we'll see.

> > And if you want to change that logic for PCI, it would be good to change it
> > in the same way for acpi_general_pm_domain which has exactly the same
> > problem.
> 
> Hm, with PCI I can read the PMCSR and detect D3cold by reading the
> vendor ID.  For generic ACPI devices, I don't have that so I have
> to rely on the accuracy of acpi_device_get_power().  However as
> you've pointed out, it might misrepresent D3cold as D3hot, and it
> might incorrectly report D0 even though the device is in a different
> state.  Is it safe to rely on acpi_device_get_power() then?

This is all about optimizing the current suboptimal behavior and that can be
done for the cases when the answer is known to be most likely correct.

Like for example when acpi_device_get_power() reports D3cold (or D1, D2 for
that matter).

IOW, D0 and D3hot may be problematic for different reasons.

> Another idea would be to use acpi_device_get_power() for PCI devices
> without PM capability.  Then we could do away with the "state"
> argument to pci_update_current_state().

That I'm not sure about.

> This too hinges on the
> reliability of acpi_device_get_power() of course.  At least D3cold
> can be detected by reading the vendor ID, so we're not reliant on
> ACPI for that.  I've even got a commit on my development branch
> to make that change, but I can't test it, my machine doesn't have
> PCI devices without PM cap.

Those are rare now. All PCIe will have it and PCI 2.0 and later as well IIRC.

Thanks,
Rafael

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
index 9a033e8..89f2707 100644
--- a/drivers/pci/pci-acpi.c
+++ b/drivers/pci/pci-acpi.c
@@ -452,6 +452,28 @@  static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
 	return error;
 }
 
+static pci_power_t acpi_pci_get_power_state(struct pci_dev *dev)
+{
+	struct acpi_device *adev = ACPI_COMPANION(&dev->dev);
+	static const pci_power_t state_conv[] = {
+		[ACPI_STATE_D0]      = PCI_D0,
+		[ACPI_STATE_D1]      = PCI_D1,
+		[ACPI_STATE_D2]      = PCI_D2,
+		[ACPI_STATE_D3_HOT]  = PCI_D3hot,
+		[ACPI_STATE_D3_COLD] = PCI_D3cold,
+	};
+	int state;
+
+	if (!adev || !acpi_device_power_manageable(adev))
+		return PCI_UNKNOWN;
+
+	if (acpi_device_get_power(adev, &state) || state < ACPI_STATE_D0
+						|| state > ACPI_STATE_D3_COLD)
+		return PCI_UNKNOWN;
+
+	return state_conv[state];
+}
+
 static bool acpi_pci_can_wakeup(struct pci_dev *dev)
 {
 	struct acpi_device *adev = ACPI_COMPANION(&dev->dev);
@@ -534,6 +556,7 @@  static bool acpi_pci_need_resume(struct pci_dev *dev)
 static const struct pci_platform_pm_ops acpi_pci_platform_pm = {
 	.is_manageable = acpi_pci_power_manageable,
 	.set_state = acpi_pci_set_power_state,
+	.get_state = acpi_pci_get_power_state,
 	.choose_state = acpi_pci_choose_state,
 	.sleep_wake = acpi_pci_sleep_wake,
 	.run_wake = acpi_pci_run_wake,
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 72a9d3a..e52e3d4 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -552,8 +552,9 @@  static const struct pci_platform_pm_ops *pci_platform_pm;
 
 int pci_set_platform_pm(const struct pci_platform_pm_ops *ops)
 {
-	if (!ops->is_manageable || !ops->set_state || !ops->choose_state ||
-	    !ops->sleep_wake || !ops->run_wake || !ops->need_resume)
+	if (!ops->is_manageable || !ops->set_state  || !ops->get_state ||
+	    !ops->choose_state  || !ops->sleep_wake || !ops->run_wake  ||
+	    !ops->need_resume)
 		return -EINVAL;
 	pci_platform_pm = ops;
 	return 0;
@@ -570,6 +571,11 @@  static inline int platform_pci_set_power_state(struct pci_dev *dev,
 	return pci_platform_pm ? pci_platform_pm->set_state(dev, t) : -ENOSYS;
 }
 
+static inline pci_power_t platform_pci_get_power_state(struct pci_dev *dev)
+{
+	return pci_platform_pm ? pci_platform_pm->get_state(dev) : PCI_UNKNOWN;
+}
+
 static inline pci_power_t platform_pci_choose_state(struct pci_dev *dev)
 {
 	return pci_platform_pm ?
@@ -701,14 +707,19 @@  static int pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state)
 }
 
 /**
- * pci_update_current_state - Read PCI power state of given device from its
- *                            PCI PM registers and cache it
+ * pci_update_current_state - Read power state of given device and cache it
  * @dev: PCI device to handle.
  * @state: State to cache in case the device doesn't have the PM capability
+ *
+ * The power state is read from the PMCSR register, which however is
+ * inaccessible in D3cold. The platform firmware is therefore queried first
+ * to detect accessibility of the register.
  */
 void pci_update_current_state(struct pci_dev *dev, pci_power_t state)
 {
-	if (dev->pm_cap) {
+	if (platform_pci_get_power_state(dev) == PCI_D3cold) {
+		dev->current_state = PCI_D3cold;
+	} else if (dev->pm_cap) {
 		u16 pmcsr;
 
 		/*
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 9730c47..01d5206 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -42,6 +42,8 @@  int pci_probe_reset_function(struct pci_dev *dev);
  *
  * @set_state: invokes the platform firmware to set the device's power state
  *
+ * @get_state: queries the platform firmware for a device's current power state
+ *
  * @choose_state: returns PCI power state of given device preferred by the
  *                platform; to be used during system-wide transitions from a
  *                sleeping state to the working state and vice versa
@@ -62,6 +64,7 @@  int pci_probe_reset_function(struct pci_dev *dev);
 struct pci_platform_pm_ops {
 	bool (*is_manageable)(struct pci_dev *dev);
 	int (*set_state)(struct pci_dev *dev, pci_power_t state);
+	pci_power_t (*get_state)(struct pci_dev *dev);
 	pci_power_t (*choose_state)(struct pci_dev *dev);
 	int (*sleep_wake)(struct pci_dev *dev, bool enable);
 	int (*run_wake)(struct pci_dev *dev, bool enable);