
[v3,1/2] PCI: dwc: Implement general suspend/resume functionality for L2/L3 transitions

Message ID: 20230419164118.596300-1-Frank.Li@nxp.com (mailing list archive)
State: Changes Requested
Delegated to: Lorenzo Pieralisi
Series: [v3,1/2] PCI: dwc: Implement general suspend/resume functionality for L2/L3 transitions

Commit Message

Frank Li April 19, 2023, 4:41 p.m. UTC
Introduced helper function dw_pcie_get_ltssm() to retrieve SMLH_LTSSM_STATE.
Added APIs pme_turn_off and exit_from_l2 for managing L2/L3 state transitions.

Typical L2 entry workflow:

1. Transmit PME turn off signal to PCI devices.
2. Await link entering L2_IDLE state.
3. Transition Root complex to D3 state.

Typical L2 exit workflow:

1. Transition Root complex to D0 state.
2. Issue exit from L2 command.
3. Reinitialize PCI host.
4. Wait for link to become active.

Signed-off-by: Frank Li <Frank.Li@nxp.com>
---
Changes from v2 to v3:
- Basically rewrote the whole patch according to Rob Herring's suggestion:
  put common functions into dwc, so more SoCs can share the same logic.
  
 .../pci/controller/dwc/pcie-designware-host.c | 80 +++++++++++++++++++
 drivers/pci/controller/dwc/pcie-designware.h  | 28 +++++++
 2 files changed, 108 insertions(+)
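
For context: a platform driver is expected to provide the two new host_ops
callbacks and route its noirq PM hooks through the new DWC helpers. A minimal
sketch, assuming the usual driver context where struct ls_pcie holds a
struct dw_pcie *pci pointer (the ls_pcie_* names are illustrative
placeholders, not taken from this patch):

static const struct dw_pcie_host_ops ls_pcie_host_ops = {
	.host_init = ls_pcie_host_init,
	/* new callbacks introduced by this patch */
	.pme_turn_off = ls_pcie_send_turnoff_msg,
	.exit_from_l2 = ls_pcie_exit_from_l2,
};

static int ls_pcie_suspend_noirq(struct device *dev)
{
	struct ls_pcie *pcie = dev_get_drvdata(dev);

	/* sends PME_Turn_Off, waits for L2, then puts the RC into D3 */
	return dw_pcie_suspend_noirq(pcie->pci);
}

static int ls_pcie_resume_noirq(struct device *dev)
{
	struct ls_pcie *pcie = dev_get_drvdata(dev);

	/* puts the RC back into D0, exits L2, re-inits the host */
	return dw_pcie_resume_noirq(pcie->pci);
}

static const struct dev_pm_ops ls_pcie_pm_ops = {
	NOIRQ_SYSTEM_SLEEP_PM_OPS(ls_pcie_suspend_noirq,
				  ls_pcie_resume_noirq)
};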

Comments

Frank Li May 12, 2023, 2:47 p.m. UTC | #1
> 
> Introduced helper function dw_pcie_get_ltssm to retrieve
> SMLH_LTSS_STATE.
> Added API pme_turn_off and exit_from_l2 for managing L2/L3 state
> transitions.
> 
> Typical L2 entry workflow:
> 
> 1. Transmit PME turn off signal to PCI devices.
> 2. Await link entering L2_IDLE state.
> 3. Transition Root complex to D3 state.
> 
> Typical L2 exit workflow:
> 
> 1. Transition Root complex to D0 state.
> 2. Issue exit from L2 command.
> 3. Reinitialize PCI host.
> 4. Wait for link to become active.
> 
> Signed-off-by: Frank Li <Frank.Li@nxp.com>
> ---

Ping

> Change from v2 to v3:
> - Basic rewrite whole patch according rob herry suggestion.
>   put common function into dwc, so more soc can share the same logic.
>
Frank Li June 12, 2023, 4:16 p.m. UTC | #2
On Fri, May 12, 2023 at 02:47:46PM +0000, Frank Li wrote:
> > 
> > Introduced helper function dw_pcie_get_ltssm to retrieve
> > SMLH_LTSS_STATE.
> > Added API pme_turn_off and exit_from_l2 for managing L2/L3 state
> > transitions.
> > 
> > Typical L2 entry workflow:
> > 
> > 1. Transmit PME turn off signal to PCI devices.
> > 2. Await link entering L2_IDLE state.
> > 3. Transition Root complex to D3 state.
> > 
> > Typical L2 exit workflow:
> > 
> > 1. Transition Root complex to D0 state.
> > 2. Issue exit from L2 command.
> > 3. Reinitialize PCI host.
> > 4. Wait for link to become active.
> > 
> > Signed-off-by: Frank Li <Frank.Li@nxp.com>
> > ---
> 
> Ping

Ping

> 
> > Change from v2 to v3:
> > - Basic rewrite whole patch according rob herry suggestion.
> >   put common function into dwc, so more soc can share the same logic.
> > 
>
Frank Li July 17, 2023, 2:05 p.m. UTC | #3
On Mon, Jun 12, 2023 at 12:16:24PM -0400, Frank Li wrote:
> On Fri, May 12, 2023 at 02:47:46PM +0000, Frank Li wrote:
> > > 
> > > Introduced helper function dw_pcie_get_ltssm to retrieve
> > > SMLH_LTSS_STATE.
> > > Added API pme_turn_off and exit_from_l2 for managing L2/L3 state
> > > transitions.
> > > 
> > > Typical L2 entry workflow:
> > > 
> > > 1. Transmit PME turn off signal to PCI devices.
> > > 2. Await link entering L2_IDLE state.
> > > 3. Transition Root complex to D3 state.
> > > 
> > > Typical L2 exit workflow:
> > > 
> > > 1. Transition Root complex to D0 state.
> > > 2. Issue exit from L2 command.
> > > 3. Reinitialize PCI host.
> > > 4. Wait for link to become active.
> > > 
> > > Signed-off-by: Frank Li <Frank.Li@nxp.com>
> > > ---
> > 
> > Ping
> 
> Ping

Ping? 

> 
> > 
> > > Change from v2 to v3:
> > > - Basic rewrite whole patch according rob herry suggestion.
> > >   put common function into dwc, so more soc can share the same logic.
> > > 
> >
Manivannan Sadhasivam July 17, 2023, 4:45 p.m. UTC | #4
On Wed, Apr 19, 2023 at 12:41:17PM -0400, Frank Li wrote:
> Introduced helper function dw_pcie_get_ltssm to retrieve SMLH_LTSS_STATE.
> Added API pme_turn_off and exit_from_l2 for managing L2/L3 state transitions.
> 
> Typical L2 entry workflow:
> 
> 1. Transmit PME turn off signal to PCI devices.
> 2. Await link entering L2_IDLE state.

AFAIK, typical workflow is to wait for PME_To_Ack.

> 3. Transition Root complex to D3 state.
> 
> Typical L2 exit workflow:
> 
> 1. Transition Root complex to D0 state.
> 2. Issue exit from L2 command.
> 3. Reinitialize PCI host.
> 4. Wait for link to become active.
> 
> Signed-off-by: Frank Li <Frank.Li@nxp.com>
> ---
> Change from v2 to v3: 
> - Basic rewrite whole patch according rob herry suggestion. 
>   put common function into dwc, so more soc can share the same logic.
>   
>  .../pci/controller/dwc/pcie-designware-host.c | 80 +++++++++++++++++++
>  drivers/pci/controller/dwc/pcie-designware.h  | 28 +++++++
>  2 files changed, 108 insertions(+)
> 
> diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c
> index 9952057c8819..ef6869488bde 100644
> --- a/drivers/pci/controller/dwc/pcie-designware-host.c
> +++ b/drivers/pci/controller/dwc/pcie-designware-host.c
> @@ -8,6 +8,7 @@
>   * Author: Jingoo Han <jg1.han@samsung.com>
>   */
>  
> +#include <linux/iopoll.h>
>  #include <linux/irqchip/chained_irq.h>
>  #include <linux/irqdomain.h>
>  #include <linux/msi.h>
> @@ -807,3 +808,82 @@ int dw_pcie_setup_rc(struct dw_pcie_rp *pp)
>  	return 0;
>  }
>  EXPORT_SYMBOL_GPL(dw_pcie_setup_rc);
> +
> +/*
> + * There are for configuring host controllers, which are bridges *to* PCI devices
> + * but are not PCI devices themselves.

None of these functions are applicable to the devices, so there is no need for
this comment.

> + */
> +static void dw_pcie_set_dstate(struct dw_pcie *pci, u32 dstate)

Please use pci_power_t defines for dstates.
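
e.g. something like:

	dw_pcie_set_dstate(pci, PCI_D3hot);
	...
	dw_pcie_set_dstate(pci, PCI_D0);

instead of the raw 0x3/0x0 values used below.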

> +{
> +	u8 offset = dw_pcie_find_capability(pci, PCI_CAP_ID_PM);
> +	u32 val;
> +
> +	val = dw_pcie_readw_dbi(pci, offset + PCI_PM_CTRL);

Please use PCI accessors for accessing spec-compliant registers.

> +	val &= ~PCI_PM_CTRL_STATE_MASK;
> +	val |= dstate;
> +	dw_pcie_writew_dbi(pci, offset + PCI_PM_CTRL, val);
> +}
> +
> +int dw_pcie_suspend_noirq(struct dw_pcie *pci)
> +{
> +	u32 val;
> +	int ret;
> +
> +	if (dw_pcie_get_ltssm(pci) <= DW_PCIE_LTSSM_DETECT_ACT)
> +		return 0;
> +
> +	pci->pp.ops->pme_turn_off(&pci->pp);

You should first check for the existence of the callback before invoking it.
This applies to all callbacks in this patch.
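
i.e. something like:

	if (pci->pp.ops->pme_turn_off)
		pci->pp.ops->pme_turn_off(&pci->pp);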

> +
> +	/*
> +	 * PCI Express Base Specification Rev 4.0
> +	 * 5.3.3.2.1 PME Synchronization
> +	 * Recommand 1ms to 10ms timeout to check L2 ready
> +	 */
> +	ret = read_poll_timeout(dw_pcie_get_ltssm, val, val == DW_PCIE_LTSSM_L2_IDLE,
> +				100, 10000, false, pci);

Is there no way to wait for PME_To_Ack TLP?

> +	if (ret) {
> +		dev_err(pci->dev, "PCIe link enter L2 timeout! ltssm = 0x%x\n", val);
> +		return ret;
> +	}
> +
> +	dw_pcie_set_dstate(pci, 0x3);
> +
> +	pci->suspended = true;
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(dw_pcie_suspend_noirq);
> +
> +int dw_pcie_resume_noirq(struct dw_pcie *pci)
> +{
> +	int ret;
> +
> +	if (!pci->suspended)
> +		return 0;
> +
> +	pci->suspended = false;
> +
> +	dw_pcie_set_dstate(pci, 0x0);
> +
> +	pci->pp.ops->exit_from_l2(&pci->pp);
> +
> +	/* delay 10 ms to access EP */

Is this delay part of the DWC spec? If so, please quote the section.

> +	mdelay(10);
> +
> +	ret = pci->pp.ops->host_init(&pci->pp);
> +	if (ret) {
> +		dev_err(pci->dev, "ls_pcie_host_init failed! ret = 0x%x\n", ret);

s/ls_pcie_host_init/Host init

> +		return ret;
> +	}
> +
> +	dw_pcie_setup_rc(&pci->pp);
> +

Don't you need to configure iATU?

> +	ret = dw_pcie_wait_for_link(pci);

Don't you need to start the link beforehand?

> +	if (ret) {
> +		dev_err(pci->dev, "wait link up timeout! ret = 0x%x\n", ret);

dw_pcie_wait_for_link() itself prints an error message on failure, so there is
no need to do the same here.
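
i.e. the tail of the function could simply be:

	return dw_pcie_wait_for_link(pci);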

- Mani

> +		return ret;
> +	}
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(dw_pcie_resume_noirq);
> diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h
> index 79713ce075cc..effb07a506e4 100644
> --- a/drivers/pci/controller/dwc/pcie-designware.h
> +++ b/drivers/pci/controller/dwc/pcie-designware.h
> @@ -288,10 +288,21 @@ enum dw_pcie_core_rst {
>  	DW_PCIE_NUM_CORE_RSTS
>  };
>  
> +enum dw_pcie_ltssm {
> +	DW_PCIE_LTSSM_UNKNOWN = 0xFFFFFFFF,
> +	/* Need align PCIE_PORT_DEBUG0 bit0:5 */
> +	DW_PCIE_LTSSM_DETECT_QUIET = 0x0,
> +	DW_PCIE_LTSSM_DETECT_ACT = 0x1,
> +	DW_PCIE_LTSSM_L0 = 0x11,
> +	DW_PCIE_LTSSM_L2_IDLE = 0x15,
> +};
> +
>  struct dw_pcie_host_ops {
>  	int (*host_init)(struct dw_pcie_rp *pp);
>  	void (*host_deinit)(struct dw_pcie_rp *pp);
>  	int (*msi_host_init)(struct dw_pcie_rp *pp);
> +	void (*pme_turn_off)(struct dw_pcie_rp *pp);
> +	void (*exit_from_l2)(struct dw_pcie_rp *pp);
>  };
>  
>  struct dw_pcie_rp {
> @@ -364,6 +375,7 @@ struct dw_pcie_ops {
>  	void    (*write_dbi2)(struct dw_pcie *pcie, void __iomem *base, u32 reg,
>  			      size_t size, u32 val);
>  	int	(*link_up)(struct dw_pcie *pcie);
> +	enum dw_pcie_ltssm (*get_ltssm)(struct dw_pcie *pcie);
>  	int	(*start_link)(struct dw_pcie *pcie);
>  	void	(*stop_link)(struct dw_pcie *pcie);
>  };
> @@ -393,6 +405,7 @@ struct dw_pcie {
>  	struct reset_control_bulk_data	app_rsts[DW_PCIE_NUM_APP_RSTS];
>  	struct reset_control_bulk_data	core_rsts[DW_PCIE_NUM_CORE_RSTS];
>  	struct gpio_desc		*pe_rst;
> +	bool			suspended;
>  };
>  
>  #define to_dw_pcie_from_pp(port) container_of((port), struct dw_pcie, pp)
> @@ -430,6 +443,9 @@ void dw_pcie_iatu_detect(struct dw_pcie *pci);
>  int dw_pcie_edma_detect(struct dw_pcie *pci);
>  void dw_pcie_edma_remove(struct dw_pcie *pci);
>  
> +int dw_pcie_suspend_noirq(struct dw_pcie *pci);
> +int dw_pcie_resume_noirq(struct dw_pcie *pci);
> +
>  static inline void dw_pcie_writel_dbi(struct dw_pcie *pci, u32 reg, u32 val)
>  {
>  	dw_pcie_write_dbi(pci, reg, 0x4, val);
> @@ -501,6 +517,18 @@ static inline void dw_pcie_stop_link(struct dw_pcie *pci)
>  		pci->ops->stop_link(pci);
>  }
>  
> +static inline enum dw_pcie_ltssm dw_pcie_get_ltssm(struct dw_pcie *pci)
> +{
> +	u32 val;
> +
> +	if (pci->ops && pci->ops->get_ltssm)
> +		return pci->ops->get_ltssm(pci);
> +
> +	val = dw_pcie_readl_dbi(pci, PCIE_PORT_DEBUG0);
> +
> +	return (enum dw_pcie_ltssm)FIELD_GET(PORT_LOGIC_LTSSM_STATE_MASK, val);
> +}
> +
>  #ifdef CONFIG_PCIE_DW_HOST
>  irqreturn_t dw_handle_msi_irq(struct dw_pcie_rp *pp);
>  int dw_pcie_setup_rc(struct dw_pcie_rp *pp);
> -- 
> 2.34.1
>
Frank Li July 17, 2023, 6:36 p.m. UTC | #5
On Mon, Jul 17, 2023 at 10:15:26PM +0530, Manivannan Sadhasivam wrote:
> On Wed, Apr 19, 2023 at 12:41:17PM -0400, Frank Li wrote:
> > Introduced helper function dw_pcie_get_ltssm to retrieve SMLH_LTSS_STATE.
> > Added API pme_turn_off and exit_from_l2 for managing L2/L3 state transitions.
> > 
> > Typical L2 entry workflow:
> > 
> > 1. Transmit PME turn off signal to PCI devices.
> > 2. Await link entering L2_IDLE state.
> 
> AFAIK, typical workflow is to wait for PME_To_Ack.

1. It already waits for PME_to_ACK. 2. This just waits for the link to
actually enter L2. I think the PCI RC needs some time to put the link into L2
after getting the ACK for the PME.

> 
> > 3. Transition Root complex to D3 state.
> > 
> > Typical L2 exit workflow:
> > 
> > 1. Transition Root complex to D0 state.
> > 2. Issue exit from L2 command.
> > 3. Reinitialize PCI host.
> > 4. Wait for link to become active.
> > 
> > Signed-off-by: Frank Li <Frank.Li@nxp.com>
> > ---
> > Change from v2 to v3: 
> > - Basic rewrite whole patch according rob herry suggestion. 
> >   put common function into dwc, so more soc can share the same logic.
> >   
> >  .../pci/controller/dwc/pcie-designware-host.c | 80 +++++++++++++++++++
> >  drivers/pci/controller/dwc/pcie-designware.h  | 28 +++++++
> >  2 files changed, 108 insertions(+)
> > 
> > diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c
> > index 9952057c8819..ef6869488bde 100644
> > --- a/drivers/pci/controller/dwc/pcie-designware-host.c
> > +++ b/drivers/pci/controller/dwc/pcie-designware-host.c
> > @@ -8,6 +8,7 @@
> >   * Author: Jingoo Han <jg1.han@samsung.com>
> >   */
> >  
> > +#include <linux/iopoll.h>
> >  #include <linux/irqchip/chained_irq.h>
> >  #include <linux/irqdomain.h>
> >  #include <linux/msi.h>
> > @@ -807,3 +808,82 @@ int dw_pcie_setup_rc(struct dw_pcie_rp *pp)
> >  	return 0;
> >  }
> >  EXPORT_SYMBOL_GPL(dw_pcie_setup_rc);
> > +
> > +/*
> > + * There are for configuring host controllers, which are bridges *to* PCI devices
> > + * but are not PCI devices themselves.
> 
> None of the functions applicable to the devices. So there is no need for this
> comment.

I copied the comment from drivers/pci/controller/dwc/pcie-designware.c:

/*
 * These interfaces resemble the pci_find_*capability() interfaces, but these
 * are for configuring host controllers, which are bridges *to* PCI devices but
 * are not PCI devices themselves.
 */
static u8 __dw_pcie_find_next_cap(struct dw_pcie *pci, u8 cap_ptr,
                                  u8 cap)


I think it is reasonable because it is very similar to the standard API
pci_set_power_state().

> 
> > + */
> > +static void dw_pcie_set_dstate(struct dw_pcie *pci, u32 dstate)
> 
> Please use pci_power_t defines for dstates.

Although DWC uses the same defines, they are different things.

> 
> > +{
> > +	u8 offset = dw_pcie_find_capability(pci, PCI_CAP_ID_PM);
> > +	u32 val;
> > +
> > +	val = dw_pcie_readw_dbi(pci, offset + PCI_PM_CTRL);
> 
> Please use PCI accessors for accessing spec compliant registers.

According to the comments in pcie-designware.c, it is a different concept even
though the register definitions are the same as in the PCI spec. It is used to
control the root bridge.

> 
> > +	val &= ~PCI_PM_CTRL_STATE_MASK;
> > +	val |= dstate;
> > +	dw_pcie_writew_dbi(pci, offset + PCI_PM_CTRL, val);
> > +}
> > +
> > +int dw_pcie_suspend_noirq(struct dw_pcie *pci)
> > +{
> > +	u32 val;
> > +	int ret;
> > +
> > +	if (dw_pcie_get_ltssm(pci) <= DW_PCIE_LTSSM_DETECT_ACT)
> > +		return 0;
> > +
> > +	pci->pp.ops->pme_turn_off(&pci->pp);
> 
> You should first check for the existence of the callback before invoking. This
> applies to all callbacks in this patch.

Yes, I will update.

> 
> > +
> > +	/*
> > +	 * PCI Express Base Specification Rev 4.0
> > +	 * 5.3.3.2.1 PME Synchronization
> > +	 * Recommand 1ms to 10ms timeout to check L2 ready
> > +	 */
> > +	ret = read_poll_timeout(dw_pcie_get_ltssm, val, val == DW_PCIE_LTSSM_L2_IDLE,
> > +				100, 10000, false, pci);
> 
> Is there no way to wait for PME_To_Ack TLP?


pme_turn_off is supposed to wait for the ACK before returning. Here, we just
make sure the link has entered the L2 state. The hardware needs some time to
put the link into L2 after getting the ACK from the bus, even though it is
generally very short.

> 
> > +	if (ret) {
> > +		dev_err(pci->dev, "PCIe link enter L2 timeout! ltssm = 0x%x\n", val);
> > +		return ret;
> > +	}
> > +
> > +	dw_pcie_set_dstate(pci, 0x3);
> > +
> > +	pci->suspended = true;
> > +
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(dw_pcie_suspend_noirq);
> > +
> > +int dw_pcie_resume_noirq(struct dw_pcie *pci)
> > +{
> > +	int ret;
> > +
> > +	if (!pci->suspended)
> > +		return 0;
> > +
> > +	pci->suspended = false;
> > +
> > +	dw_pcie_set_dstate(pci, 0x0);
> > +
> > +	pci->pp.ops->exit_from_l2(&pci->pp);
> > +
> > +	/* delay 10 ms to access EP */
> 
> Is this delay as part of the DWC spec? If so, please quote the section.
> 
> > +	mdelay(10);
> > +
> > +	ret = pci->pp.ops->host_init(&pci->pp);
> > +	if (ret) {
> > +		dev_err(pci->dev, "ls_pcie_host_init failed! ret = 0x%x\n", ret);
> 
> s/ls_pcie_host_init/Host init
> 
> > +		return ret;
> > +	}
> > +
> > +	dw_pcie_setup_rc(&pci->pp);
> > +
> 
> Don't you need to configure iATU?
> 
> > +	ret = dw_pcie_wait_for_link(pci);
> 
> Don't you need to start the link beforehand?

I suppose we need to start the link; it works on the layerscape platform just
because the DWC is not fully powered off, so some state is still kept.

> 
> > +	if (ret) {
> > +		dev_err(pci->dev, "wait link up timeout! ret = 0x%x\n", ret);
> 
> dw_pcie_wait_for_link() itself prints error message on failure. So no need to do
> the same here.

Okay

> 
> - Mani
> 
> > +		return ret;
> > +	}
> > +
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(dw_pcie_resume_noirq);
> > diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h
> > index 79713ce075cc..effb07a506e4 100644
> > --- a/drivers/pci/controller/dwc/pcie-designware.h
> > +++ b/drivers/pci/controller/dwc/pcie-designware.h
> > @@ -288,10 +288,21 @@ enum dw_pcie_core_rst {
> >  	DW_PCIE_NUM_CORE_RSTS
> >  };
> >  
> > +enum dw_pcie_ltssm {
> > +	DW_PCIE_LTSSM_UNKNOWN = 0xFFFFFFFF,
> > +	/* Need align PCIE_PORT_DEBUG0 bit0:5 */
> > +	DW_PCIE_LTSSM_DETECT_QUIET = 0x0,
> > +	DW_PCIE_LTSSM_DETECT_ACT = 0x1,
> > +	DW_PCIE_LTSSM_L0 = 0x11,
> > +	DW_PCIE_LTSSM_L2_IDLE = 0x15,
> > +};
> > +
> >  struct dw_pcie_host_ops {
> >  	int (*host_init)(struct dw_pcie_rp *pp);
> >  	void (*host_deinit)(struct dw_pcie_rp *pp);
> >  	int (*msi_host_init)(struct dw_pcie_rp *pp);
> > +	void (*pme_turn_off)(struct dw_pcie_rp *pp);
> > +	void (*exit_from_l2)(struct dw_pcie_rp *pp);
> >  };
> >  
> >  struct dw_pcie_rp {
> > @@ -364,6 +375,7 @@ struct dw_pcie_ops {
> >  	void    (*write_dbi2)(struct dw_pcie *pcie, void __iomem *base, u32 reg,
> >  			      size_t size, u32 val);
> >  	int	(*link_up)(struct dw_pcie *pcie);
> > +	enum dw_pcie_ltssm (*get_ltssm)(struct dw_pcie *pcie);
> >  	int	(*start_link)(struct dw_pcie *pcie);
> >  	void	(*stop_link)(struct dw_pcie *pcie);
> >  };
> > @@ -393,6 +405,7 @@ struct dw_pcie {
> >  	struct reset_control_bulk_data	app_rsts[DW_PCIE_NUM_APP_RSTS];
> >  	struct reset_control_bulk_data	core_rsts[DW_PCIE_NUM_CORE_RSTS];
> >  	struct gpio_desc		*pe_rst;
> > +	bool			suspended;
> >  };
> >  
> >  #define to_dw_pcie_from_pp(port) container_of((port), struct dw_pcie, pp)
> > @@ -430,6 +443,9 @@ void dw_pcie_iatu_detect(struct dw_pcie *pci);
> >  int dw_pcie_edma_detect(struct dw_pcie *pci);
> >  void dw_pcie_edma_remove(struct dw_pcie *pci);
> >  
> > +int dw_pcie_suspend_noirq(struct dw_pcie *pci);
> > +int dw_pcie_resume_noirq(struct dw_pcie *pci);
> > +
> >  static inline void dw_pcie_writel_dbi(struct dw_pcie *pci, u32 reg, u32 val)
> >  {
> >  	dw_pcie_write_dbi(pci, reg, 0x4, val);
> > @@ -501,6 +517,18 @@ static inline void dw_pcie_stop_link(struct dw_pcie *pci)
> >  		pci->ops->stop_link(pci);
> >  }
> >  
> > +static inline enum dw_pcie_ltssm dw_pcie_get_ltssm(struct dw_pcie *pci)
> > +{
> > +	u32 val;
> > +
> > +	if (pci->ops && pci->ops->get_ltssm)
> > +		return pci->ops->get_ltssm(pci);
> > +
> > +	val = dw_pcie_readl_dbi(pci, PCIE_PORT_DEBUG0);
> > +
> > +	return (enum dw_pcie_ltssm)FIELD_GET(PORT_LOGIC_LTSSM_STATE_MASK, val);
> > +}
> > +
> >  #ifdef CONFIG_PCIE_DW_HOST
> >  irqreturn_t dw_handle_msi_irq(struct dw_pcie_rp *pp);
> >  int dw_pcie_setup_rc(struct dw_pcie_rp *pp);
> > -- 
> > 2.34.1
> > 
> 
> -- 
> மணிவண்ணன் சதாசிவம்
Manivannan Sadhasivam July 18, 2023, 10:04 a.m. UTC | #6
On Mon, Jul 17, 2023 at 02:36:19PM -0400, Frank Li wrote:
> On Mon, Jul 17, 2023 at 10:15:26PM +0530, Manivannan Sadhasivam wrote:
> > On Wed, Apr 19, 2023 at 12:41:17PM -0400, Frank Li wrote:
> > > Introduced helper function dw_pcie_get_ltssm to retrieve SMLH_LTSS_STATE.
> > > Added API pme_turn_off and exit_from_l2 for managing L2/L3 state transitions.
> > > 
> > > Typical L2 entry workflow:
> > > 
> > > 1. Transmit PME turn off signal to PCI devices.
> > > 2. Await link entering L2_IDLE state.
> > 
> > AFAIK, typical workflow is to wait for PME_To_Ack.
> 
> 1 Already wait for PME_to_ACK,  2, just wait for link actual enter L2.
> I think PCI RC needs some time to set link enter L2 after get ACK from
> PME.
> 
> > 
> > > 3. Transition Root complex to D3 state.
> > > 
> > > Typical L2 exit workflow:
> > > 
> > > 1. Transition Root complex to D0 state.
> > > 2. Issue exit from L2 command.
> > > 3. Reinitialize PCI host.
> > > 4. Wait for link to become active.
> > > 
> > > Signed-off-by: Frank Li <Frank.Li@nxp.com>
> > > ---
> > > Change from v2 to v3: 
> > > - Basic rewrite whole patch according rob herry suggestion. 
> > >   put common function into dwc, so more soc can share the same logic.
> > >   
> > >  .../pci/controller/dwc/pcie-designware-host.c | 80 +++++++++++++++++++
> > >  drivers/pci/controller/dwc/pcie-designware.h  | 28 +++++++
> > >  2 files changed, 108 insertions(+)
> > > 
> > > diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c
> > > index 9952057c8819..ef6869488bde 100644
> > > --- a/drivers/pci/controller/dwc/pcie-designware-host.c
> > > +++ b/drivers/pci/controller/dwc/pcie-designware-host.c
> > > @@ -8,6 +8,7 @@
> > >   * Author: Jingoo Han <jg1.han@samsung.com>
> > >   */
> > >  
> > > +#include <linux/iopoll.h>
> > >  #include <linux/irqchip/chained_irq.h>
> > >  #include <linux/irqdomain.h>
> > >  #include <linux/msi.h>
> > > @@ -807,3 +808,82 @@ int dw_pcie_setup_rc(struct dw_pcie_rp *pp)
> > >  	return 0;
> > >  }
> > >  EXPORT_SYMBOL_GPL(dw_pcie_setup_rc);
> > > +
> > > +/*
> > > + * There are for configuring host controllers, which are bridges *to* PCI devices
> > > + * but are not PCI devices themselves.
> > 
> > None of the functions applicable to the devices. So there is no need for this
> > comment.
> 
> I copy comments in drivers/pci/controller/dwc/pcie-designware.c.
> 
> /*
>  * These interfaces resemble the pci_find_*capability() interfaces, but these
>  * are for configuring host controllers, which are bridges *to* PCI devices but
>  * are not PCI devices themselves.
>  */
> static u8 __dw_pcie_find_next_cap(struct dw_pcie *pci, u8 cap_ptr,
>                                   u8 cap)
> 
> 
> I think it is reasonalble because it is too similar with standard API
> pci_set_power_state();
> 

Ok, then please mention this API similarity in the comment, as
__dw_pcie_find_next_cap() does. Also change "There" to "These".
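
Something along these lines would work (a sketch, mirroring the existing
comment):

/*
 * These functions resemble pci_set_power_state(), but they are for
 * configuring host controllers, which are bridges *to* PCI devices
 * but are not PCI devices themselves.
 */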

> > 
> > > + */
> > > +static void dw_pcie_set_dstate(struct dw_pcie *pci, u32 dstate)
> > 
> > Please use pci_power_t defines for dstates.
> 
> Although dwc use the same define, it is difference things. 
> 

Sorry, what is the difference? Could you please clarify?

> > 
> > > +{
> > > +	u8 offset = dw_pcie_find_capability(pci, PCI_CAP_ID_PM);
> > > +	u32 val;
> > > +
> > > +	val = dw_pcie_readw_dbi(pci, offset + PCI_PM_CTRL);
> > 
> > Please use PCI accessors for accessing spec compliant registers.
> 
> According to comments in pcie-designware.c, it is difference concept
> even though register define is the same as PCI spec. It was used to
> control root bridges.
> 

Ah, I got slightly confused. This is fine.

> > 
> > > +	val &= ~PCI_PM_CTRL_STATE_MASK;
> > > +	val |= dstate;
> > > +	dw_pcie_writew_dbi(pci, offset + PCI_PM_CTRL, val);
> > > +}
> > > +
> > > +int dw_pcie_suspend_noirq(struct dw_pcie *pci)
> > > +{
> > > +	u32 val;
> > > +	int ret;
> > > +
> > > +	if (dw_pcie_get_ltssm(pci) <= DW_PCIE_LTSSM_DETECT_ACT)
> > > +		return 0;
> > > +
> > > +	pci->pp.ops->pme_turn_off(&pci->pp);
> > 
> > You should first check for the existence of the callback before invoking. This
> > applies to all callbacks in this patch.
> 
> Yes, I will update.
> 
> > 
> > > +
> > > +	/*
> > > +	 * PCI Express Base Specification Rev 4.0
> > > +	 * 5.3.3.2.1 PME Synchronization
> > > +	 * Recommand 1ms to 10ms timeout to check L2 ready
> > > +	 */
> > > +	ret = read_poll_timeout(dw_pcie_get_ltssm, val, val == DW_PCIE_LTSSM_L2_IDLE,
> > > +				100, 10000, false, pci);
> > 
> > Is there no way to wait for PME_To_Ack TLP?
> 
> 
> Suppose PME_turn_off should wait for ACK before return. 

Ok. I didn't see this behavior in the spec, hence curious.

> Here, just make sure Link enter L2 status. Hardware need some time to put
> link to L2 after get ACK from bus, even it is very short generally.
> 

Fine then. But can we check for PM_LINKST_IN_L2 from the System Information
Interface (SII) instead of the LTSSM state?

> > 
> > > +	if (ret) {
> > > +		dev_err(pci->dev, "PCIe link enter L2 timeout! ltssm = 0x%x\n", val);
> > > +		return ret;
> > > +	}
> > > +
> > > +	dw_pcie_set_dstate(pci, 0x3);
> > > +
> > > +	pci->suspended = true;
> > > +
> > > +	return ret;
> > > +}
> > > +EXPORT_SYMBOL_GPL(dw_pcie_suspend_noirq);
> > > +
> > > +int dw_pcie_resume_noirq(struct dw_pcie *pci)
> > > +{
> > > +	int ret;
> > > +
> > > +	if (!pci->suspended)
> > > +		return 0;
> > > +
> > > +	pci->suspended = false;
> > > +
> > > +	dw_pcie_set_dstate(pci, 0x0);
> > > +
> > > +	pci->pp.ops->exit_from_l2(&pci->pp);
> > > +
> > > +	/* delay 10 ms to access EP */
> > 
> > Is this delay as part of the DWC spec? If so, please quote the section.
> > 
> > > +	mdelay(10);
> > > +
> > > +	ret = pci->pp.ops->host_init(&pci->pp);
> > > +	if (ret) {
> > > +		dev_err(pci->dev, "ls_pcie_host_init failed! ret = 0x%x\n", ret);
> > 
> > s/ls_pcie_host_init/Host init
> > 
> > > +		return ret;
> > > +	}
> > > +
> > > +	dw_pcie_setup_rc(&pci->pp);
> > > +
> > 
> > Don't you need to configure iATU?
> > 
> > > +	ret = dw_pcie_wait_for_link(pci);
> > 
> > Don't you need to start the link beforehand?
> 
> Suppose need start link, it works at layerscape platform just because dwc
> have not full power off. some state still kept.
> 

It may work for your platform but not for all if the power gets removed. So
please start the link manually.
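
i.e. something like (untested sketch):

	ret = dw_pcie_start_link(pci);
	if (ret)
		return ret;

	ret = dw_pcie_wait_for_link(pci);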

- Mani

> > 
> > > +	if (ret) {
> > > +		dev_err(pci->dev, "wait link up timeout! ret = 0x%x\n", ret);
> > 
> > dw_pcie_wait_for_link() itself prints error message on failure. So no need to do
> > the same here.
> 
> Okay
> 
> > 
> > - Mani
> > 
> > > +		return ret;
> > > +	}
> > > +
> > > +	return ret;
> > > +}
> > > +EXPORT_SYMBOL_GPL(dw_pcie_resume_noirq);
> > > diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h
> > > index 79713ce075cc..effb07a506e4 100644
> > > --- a/drivers/pci/controller/dwc/pcie-designware.h
> > > +++ b/drivers/pci/controller/dwc/pcie-designware.h
> > > @@ -288,10 +288,21 @@ enum dw_pcie_core_rst {
> > >  	DW_PCIE_NUM_CORE_RSTS
> > >  };
> > >  
> > > +enum dw_pcie_ltssm {
> > > +	DW_PCIE_LTSSM_UNKNOWN = 0xFFFFFFFF,
> > > +	/* Need align PCIE_PORT_DEBUG0 bit0:5 */
> > > +	DW_PCIE_LTSSM_DETECT_QUIET = 0x0,
> > > +	DW_PCIE_LTSSM_DETECT_ACT = 0x1,
> > > +	DW_PCIE_LTSSM_L0 = 0x11,
> > > +	DW_PCIE_LTSSM_L2_IDLE = 0x15,
> > > +};
> > > +
> > >  struct dw_pcie_host_ops {
> > >  	int (*host_init)(struct dw_pcie_rp *pp);
> > >  	void (*host_deinit)(struct dw_pcie_rp *pp);
> > >  	int (*msi_host_init)(struct dw_pcie_rp *pp);
> > > +	void (*pme_turn_off)(struct dw_pcie_rp *pp);
> > > +	void (*exit_from_l2)(struct dw_pcie_rp *pp);
> > >  };
> > >  
> > >  struct dw_pcie_rp {
> > > @@ -364,6 +375,7 @@ struct dw_pcie_ops {
> > >  	void    (*write_dbi2)(struct dw_pcie *pcie, void __iomem *base, u32 reg,
> > >  			      size_t size, u32 val);
> > >  	int	(*link_up)(struct dw_pcie *pcie);
> > > +	enum dw_pcie_ltssm (*get_ltssm)(struct dw_pcie *pcie);
> > >  	int	(*start_link)(struct dw_pcie *pcie);
> > >  	void	(*stop_link)(struct dw_pcie *pcie);
> > >  };
> > > @@ -393,6 +405,7 @@ struct dw_pcie {
> > >  	struct reset_control_bulk_data	app_rsts[DW_PCIE_NUM_APP_RSTS];
> > >  	struct reset_control_bulk_data	core_rsts[DW_PCIE_NUM_CORE_RSTS];
> > >  	struct gpio_desc		*pe_rst;
> > > +	bool			suspended;
> > >  };
> > >  
> > >  #define to_dw_pcie_from_pp(port) container_of((port), struct dw_pcie, pp)
> > > @@ -430,6 +443,9 @@ void dw_pcie_iatu_detect(struct dw_pcie *pci);
> > >  int dw_pcie_edma_detect(struct dw_pcie *pci);
> > >  void dw_pcie_edma_remove(struct dw_pcie *pci);
> > >  
> > > +int dw_pcie_suspend_noirq(struct dw_pcie *pci);
> > > +int dw_pcie_resume_noirq(struct dw_pcie *pci);
> > > +
> > >  static inline void dw_pcie_writel_dbi(struct dw_pcie *pci, u32 reg, u32 val)
> > >  {
> > >  	dw_pcie_write_dbi(pci, reg, 0x4, val);
> > > @@ -501,6 +517,18 @@ static inline void dw_pcie_stop_link(struct dw_pcie *pci)
> > >  		pci->ops->stop_link(pci);
> > >  }
> > >  
> > > +static inline enum dw_pcie_ltssm dw_pcie_get_ltssm(struct dw_pcie *pci)
> > > +{
> > > +	u32 val;
> > > +
> > > +	if (pci->ops && pci->ops->get_ltssm)
> > > +		return pci->ops->get_ltssm(pci);
> > > +
> > > +	val = dw_pcie_readl_dbi(pci, PCIE_PORT_DEBUG0);
> > > +
> > > +	return (enum dw_pcie_ltssm)FIELD_GET(PORT_LOGIC_LTSSM_STATE_MASK, val);
> > > +}
> > > +
> > >  #ifdef CONFIG_PCIE_DW_HOST
> > >  irqreturn_t dw_handle_msi_irq(struct dw_pcie_rp *pp);
> > >  int dw_pcie_setup_rc(struct dw_pcie_rp *pp);
> > > -- 
> > > 2.34.1
> > > 
> > 
> > -- 
> > மணிவண்ணன் சதாசிவம்
Frank Li July 19, 2023, 7:16 p.m. UTC | #7
On Tue, Jul 18, 2023 at 03:34:00PM +0530, Manivannan Sadhasivam wrote:
> On Mon, Jul 17, 2023 at 02:36:19PM -0400, Frank Li wrote:
> 
> Fine then. But can we check for PM_LINKST_IN_L2 SII System Information Interface
> (SII) instead of LTSSM state?
>
Where is PM_LINKST_IN_L2 defined? I only found one at

drivers/pci/controller/dwc/pcie-qcom.c:#define PARF_DEBUG_CNT_PM_LINKST_IN_L2           0xc04

Frank
Manivannan Sadhasivam July 20, 2023, 2:20 p.m. UTC | #8
On Wed, Jul 19, 2023 at 03:16:48PM -0400, Frank Li wrote:
> On Tue, Jul 18, 2023 at 03:34:00PM +0530, Manivannan Sadhasivam wrote:
> > On Mon, Jul 17, 2023 at 02:36:19PM -0400, Frank Li wrote:
> > 
> > Fine then. But can we check for PM_LINKST_IN_L2 SII System Information Interface
> > (SII) instead of LTSSM state?
> >
> where define PM_LINKST_IN_L2? I just find one at
> 
> drivers/pci/controller/dwc/pcie-qcom.c:#define PARF_DEBUG_CNT_PM_LINKST_IN_L2           0xc04
> 

Yes, this register is not exposed by the DWC core itself, but vendors such as
Qcom may expose it, as you pointed out. If it is not available on all
platforms, then feel free to ignore my advice.

- Mani

> Frank
>
Manivannan Sadhasivam July 20, 2023, 2:25 p.m. UTC | #9
On Tue, Jul 18, 2023 at 03:34:26PM +0530, Manivannan Sadhasivam wrote:
> On Mon, Jul 17, 2023 at 02:36:19PM -0400, Frank Li wrote:
> > On Mon, Jul 17, 2023 at 10:15:26PM +0530, Manivannan Sadhasivam wrote:
> > > On Wed, Apr 19, 2023 at 12:41:17PM -0400, Frank Li wrote:
> > > > Introduced helper function dw_pcie_get_ltssm to retrieve SMLH_LTSS_STATE.
> > > > Added API pme_turn_off and exit_from_l2 for managing L2/L3 state transitions.
> > > > 
> > > > Typical L2 entry workflow:
> > > > 
> > > > 1. Transmit PME turn off signal to PCI devices.
> > > > 2. Await link entering L2_IDLE state.
> > > 
> > > AFAIK, typical workflow is to wait for PME_To_Ack.
> > 
> > 1 Already wait for PME_to_ACK,  2, just wait for link actual enter L2.
> > I think PCI RC needs some time to set link enter L2 after get ACK from
> > PME.
> > 

One more comment. If you transition the device to L2/L3, then it can lose power
if Vaux was not provided. In that case, can all the devices work after resume?
Most notably NVMe?

- Mani

> > > 
> > > > 3. Transition Root complex to D3 state.
> > > > 
> > > > Typical L2 exit workflow:
> > > > 
> > > > 1. Transition Root complex to D0 state.
> > > > 2. Issue exit from L2 command.
> > > > 3. Reinitialize PCI host.
> > > > 4. Wait for link to become active.
> > > > 
> > > > Signed-off-by: Frank Li <Frank.Li@nxp.com>
> > > > ---
> > > > Change from v2 to v3: 
> > > > - Basic rewrite whole patch according rob herry suggestion. 
> > > >   put common function into dwc, so more soc can share the same logic.
> > > >   
> > > >  .../pci/controller/dwc/pcie-designware-host.c | 80 +++++++++++++++++++
> > > >  drivers/pci/controller/dwc/pcie-designware.h  | 28 +++++++
> > > >  2 files changed, 108 insertions(+)
> > > > 
> > > > diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c
> > > > index 9952057c8819..ef6869488bde 100644
> > > > --- a/drivers/pci/controller/dwc/pcie-designware-host.c
> > > > +++ b/drivers/pci/controller/dwc/pcie-designware-host.c
> > > > @@ -8,6 +8,7 @@
> > > >   * Author: Jingoo Han <jg1.han@samsung.com>
> > > >   */
> > > >  
> > > > +#include <linux/iopoll.h>
> > > >  #include <linux/irqchip/chained_irq.h>
> > > >  #include <linux/irqdomain.h>
> > > >  #include <linux/msi.h>
> > > > @@ -807,3 +808,82 @@ int dw_pcie_setup_rc(struct dw_pcie_rp *pp)
> > > >  	return 0;
> > > >  }
> > > >  EXPORT_SYMBOL_GPL(dw_pcie_setup_rc);
> > > > +
> > > > +/*
> > > > + * There are for configuring host controllers, which are bridges *to* PCI devices
> > > > + * but are not PCI devices themselves.
> > > 
> > > None of the functions applicable to the devices. So there is no need for this
> > > comment.
> > 
> > I copy comments in drivers/pci/controller/dwc/pcie-designware.c.
> > 
> > /*
> >  * These interfaces resemble the pci_find_*capability() interfaces, but these
> >  * are for configuring host controllers, which are bridges *to* PCI devices but
> >  * are not PCI devices themselves.
> >  */
> > static u8 __dw_pcie_find_next_cap(struct dw_pcie *pci, u8 cap_ptr,
> >                                   u8 cap)
> > 
> > 
> > I think it is reasonalble because it is too similar with standard API
> > pci_set_power_state();
> > 
> 
> Ok, then please add this API similarity in the comment as like
> __dw_pcie_find_next_cap(). Also change "There" to "These".
> 
> > > 
> > > > + */
> > > > +static void dw_pcie_set_dstate(struct dw_pcie *pci, u32 dstate)
> > > 
> > > Please use pci_power_t defines for dstates.
> > 
> > Although dwc use the same define, it is difference things. 
> > 
> 
> Sorry, what difference? Could you please clarify?
> 
> > > 
> > > > +{
> > > > +	u8 offset = dw_pcie_find_capability(pci, PCI_CAP_ID_PM);
> > > > +	u32 val;
> > > > +
> > > > +	val = dw_pcie_readw_dbi(pci, offset + PCI_PM_CTRL);
> > > 
> > > Please use PCI accessors for accessing spec compliant registers.
> > 
> > According to comments in pcie-designware.c, it is difference concept
> > even though register define is the same as PCI spec. It was used to
> > control root bridges.
> > 
> 
> Ah, I got slightly confused. This is fine.
> 
> > > 
> > > > +	val &= ~PCI_PM_CTRL_STATE_MASK;
> > > > +	val |= dstate;
> > > > +	dw_pcie_writew_dbi(pci, offset + PCI_PM_CTRL, val);
> > > > +}
> > > > +
> > > > +int dw_pcie_suspend_noirq(struct dw_pcie *pci)
> > > > +{
> > > > +	u32 val;
> > > > +	int ret;
> > > > +
> > > > +	if (dw_pcie_get_ltssm(pci) <= DW_PCIE_LTSSM_DETECT_ACT)
> > > > +		return 0;
> > > > +
> > > > +	pci->pp.ops->pme_turn_off(&pci->pp);
> > > 
> > > You should first check for the existence of the callback before invoking. This
> > > applies to all callbacks in this patch.
> > 
> > Yes, I will update.
> > 
> > > 
> > > > +
> > > > +	/*
> > > > +	 * PCI Express Base Specification Rev 4.0
> > > > +	 * 5.3.3.2.1 PME Synchronization
> > > > +	 * Recommand 1ms to 10ms timeout to check L2 ready
> > > > +	 */
> > > > +	ret = read_poll_timeout(dw_pcie_get_ltssm, val, val == DW_PCIE_LTSSM_L2_IDLE,
> > > > +				100, 10000, false, pci);
> > > 
> > > Is there no way to wait for PME_To_Ack TLP?
> > 
> > 
> > Suppose PME_turn_off should wait for ACK before return. 
> 
> Ok. I didn't see this behavior in the spec, hence curious.
> 
> > Here, just make sure Link enter L2 status. Hardware need some time to put
> > link to L2 after get ACK from bus, even it is very short generally.
> > 
> 
> Fine then. But can we check for PM_LINKST_IN_L2 SII System Information Interface
> (SII) instead of LTSSM state?
> 
> > > 
> > > > +	if (ret) {
> > > > +		dev_err(pci->dev, "PCIe link enter L2 timeout! ltssm = 0x%x\n", val);
> > > > +		return ret;
> > > > +	}
> > > > +
> > > > +	dw_pcie_set_dstate(pci, 0x3);
> > > > +
> > > > +	pci->suspended = true;
> > > > +
> > > > +	return ret;
> > > > +}
> > > > +EXPORT_SYMBOL_GPL(dw_pcie_suspend_noirq);
> > > > +
> > > > +int dw_pcie_resume_noirq(struct dw_pcie *pci)
> > > > +{
> > > > +	int ret;
> > > > +
> > > > +	if (!pci->suspended)
> > > > +		return 0;
> > > > +
> > > > +	pci->suspended = false;
> > > > +
> > > > +	dw_pcie_set_dstate(pci, 0x0);
> > > > +
> > > > +	pci->pp.ops->exit_from_l2(&pci->pp);
> > > > +
> > > > +	/* delay 10 ms to access EP */
> > > 
> > > Is this delay as part of the DWC spec? If so, please quote the section.
> > > 
> > > > +	mdelay(10);
> > > > +
> > > > +	ret = pci->pp.ops->host_init(&pci->pp);
> > > > +	if (ret) {
> > > > +		dev_err(pci->dev, "ls_pcie_host_init failed! ret = 0x%x\n", ret);
> > > 
> > > s/ls_pcie_host_init/Host init
> > > 
> > > > +		return ret;
> > > > +	}
> > > > +
> > > > +	dw_pcie_setup_rc(&pci->pp);
> > > > +
> > > 
> > > Don't you need to configure iATU?
> > > 
> > > > +	ret = dw_pcie_wait_for_link(pci);
> > > 
> > > Don't you need to start the link beforehand?
> > 
> > Suppose need start link, it works at layerscape platform just because dwc
> > have not full power off. some state still kept.
> > 
> 
> It may work for your platform but not for all if the power gets removed. So
> please start the link manually.
> 
> - Mani
> 
> > > 
> > > > +	if (ret) {
> > > > +		dev_err(pci->dev, "wait link up timeout! ret = 0x%x\n", ret);
> > > 
> > > dw_pcie_wait_for_link() itself prints error message on failure. So no need to do
> > > the same here.
> > 
> > Okay
> > 
> > > 
> > > - Mani
> > > 
> > > > +		return ret;
> > > > +	}
> > > > +
> > > > +	return ret;
> > > > +}
> > > > +EXPORT_SYMBOL_GPL(dw_pcie_resume_noirq);
> > > > diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h
> > > > index 79713ce075cc..effb07a506e4 100644
> > > > --- a/drivers/pci/controller/dwc/pcie-designware.h
> > > > +++ b/drivers/pci/controller/dwc/pcie-designware.h
> > > > @@ -288,10 +288,21 @@ enum dw_pcie_core_rst {
> > > >  	DW_PCIE_NUM_CORE_RSTS
> > > >  };
> > > >  
> > > > +enum dw_pcie_ltssm {
> > > > +	DW_PCIE_LTSSM_UNKNOWN = 0xFFFFFFFF,
> > > > +	/* Need align PCIE_PORT_DEBUG0 bit0:5 */
> > > > +	DW_PCIE_LTSSM_DETECT_QUIET = 0x0,
> > > > +	DW_PCIE_LTSSM_DETECT_ACT = 0x1,
> > > > +	DW_PCIE_LTSSM_L0 = 0x11,
> > > > +	DW_PCIE_LTSSM_L2_IDLE = 0x15,
> > > > +};
> > > > +
> > > >  struct dw_pcie_host_ops {
> > > >  	int (*host_init)(struct dw_pcie_rp *pp);
> > > >  	void (*host_deinit)(struct dw_pcie_rp *pp);
> > > >  	int (*msi_host_init)(struct dw_pcie_rp *pp);
> > > > +	void (*pme_turn_off)(struct dw_pcie_rp *pp);
> > > > +	void (*exit_from_l2)(struct dw_pcie_rp *pp);
> > > >  };
> > > >  
> > > >  struct dw_pcie_rp {
> > > > @@ -364,6 +375,7 @@ struct dw_pcie_ops {
> > > >  	void    (*write_dbi2)(struct dw_pcie *pcie, void __iomem *base, u32 reg,
> > > >  			      size_t size, u32 val);
> > > >  	int	(*link_up)(struct dw_pcie *pcie);
> > > > +	enum dw_pcie_ltssm (*get_ltssm)(struct dw_pcie *pcie);
> > > >  	int	(*start_link)(struct dw_pcie *pcie);
> > > >  	void	(*stop_link)(struct dw_pcie *pcie);
> > > >  };
> > > > @@ -393,6 +405,7 @@ struct dw_pcie {
> > > >  	struct reset_control_bulk_data	app_rsts[DW_PCIE_NUM_APP_RSTS];
> > > >  	struct reset_control_bulk_data	core_rsts[DW_PCIE_NUM_CORE_RSTS];
> > > >  	struct gpio_desc		*pe_rst;
> > > > +	bool			suspended;
> > > >  };
> > > >  
> > > >  #define to_dw_pcie_from_pp(port) container_of((port), struct dw_pcie, pp)
> > > > @@ -430,6 +443,9 @@ void dw_pcie_iatu_detect(struct dw_pcie *pci);
> > > >  int dw_pcie_edma_detect(struct dw_pcie *pci);
> > > >  void dw_pcie_edma_remove(struct dw_pcie *pci);
> > > >  
> > > > +int dw_pcie_suspend_noirq(struct dw_pcie *pci);
> > > > +int dw_pcie_resume_noirq(struct dw_pcie *pci);
> > > > +
> > > >  static inline void dw_pcie_writel_dbi(struct dw_pcie *pci, u32 reg, u32 val)
> > > >  {
> > > >  	dw_pcie_write_dbi(pci, reg, 0x4, val);
> > > > @@ -501,6 +517,18 @@ static inline void dw_pcie_stop_link(struct dw_pcie *pci)
> > > >  		pci->ops->stop_link(pci);
> > > >  }
> > > >  
> > > > +static inline enum dw_pcie_ltssm dw_pcie_get_ltssm(struct dw_pcie *pci)
> > > > +{
> > > > +	u32 val;
> > > > +
> > > > +	if (pci->ops && pci->ops->get_ltssm)
> > > > +		return pci->ops->get_ltssm(pci);
> > > > +
> > > > +	val = dw_pcie_readl_dbi(pci, PCIE_PORT_DEBUG0);
> > > > +
> > > > +	return (enum dw_pcie_ltssm)FIELD_GET(PORT_LOGIC_LTSSM_STATE_MASK, val);
> > > > +}
> > > > +
> > > >  #ifdef CONFIG_PCIE_DW_HOST
> > > >  irqreturn_t dw_handle_msi_irq(struct dw_pcie_rp *pp);
> > > >  int dw_pcie_setup_rc(struct dw_pcie_rp *pp);
> > > > -- 
> > > > 2.34.1
> > > > 
> > > 
> > > -- 
> > > மணிவண்ணன் சதாசிவம்
> 
> -- 
> மணிவண்ணன் சதாசிவம்
Frank Li July 20, 2023, 2:37 p.m. UTC | #10
On Thu, Jul 20, 2023 at 07:55:09PM +0530, Manivannan Sadhasivam wrote:
> On Tue, Jul 18, 2023 at 03:34:26PM +0530, Manivannan Sadhasivam wrote:
> > On Mon, Jul 17, 2023 at 02:36:19PM -0400, Frank Li wrote:
> > > On Mon, Jul 17, 2023 at 10:15:26PM +0530, Manivannan Sadhasivam wrote:
> > > > On Wed, Apr 19, 2023 at 12:41:17PM -0400, Frank Li wrote:
> > > > > Introduced helper function dw_pcie_get_ltssm to retrieve SMLH_LTSS_STATE.
> > > > > Added API pme_turn_off and exit_from_l2 for managing L2/L3 state transitions.
> > > > > 
> > > > > Typical L2 entry workflow:
> > > > > 
> > > > > 1. Transmit PME turn off signal to PCI devices.
> > > > > 2. Await link entering L2_IDLE state.
> > > > 
> > > > AFAIK, typical workflow is to wait for PME_To_Ack.
> > > 
> > > 1 Already wait for PME_to_ACK,  2, just wait for link actual enter L2.
> > > I think PCI RC needs some time to set link enter L2 after get ACK from
> > > PME.
> > > 
> 
> One more comment. If you transition the device to L2/L3, then it can loose power
> if Vaux was not provided. In that case, can all the devices work after resume?
> Most notably NVMe?

I don't have hardware to do such a test. The NVMe driver will reinit everything
after resume if there is no L1.1/L1.2 support. If L1.1/L1.2 is available, NVMe
expects the link to be left in L1.2 during suspend to get better resume
latency.

This API helps remove duplicate code, and it can be improved gradually.


> 
> - Mani
> 
> 
> -- 
> மணிவண்ணன் சதாசிவம்
Manivannan Sadhasivam July 20, 2023, 4:07 p.m. UTC | #11
On Thu, Jul 20, 2023 at 10:37:36AM -0400, Frank Li wrote:
> On Thu, Jul 20, 2023 at 07:55:09PM +0530, Manivannan Sadhasivam wrote:
> > On Tue, Jul 18, 2023 at 03:34:26PM +0530, Manivannan Sadhasivam wrote:
> > > On Mon, Jul 17, 2023 at 02:36:19PM -0400, Frank Li wrote:
> > > > On Mon, Jul 17, 2023 at 10:15:26PM +0530, Manivannan Sadhasivam wrote:
> > > > > On Wed, Apr 19, 2023 at 12:41:17PM -0400, Frank Li wrote:
> > > > > > Introduced helper function dw_pcie_get_ltssm to retrieve SMLH_LTSS_STATE.
> > > > > > Added API pme_turn_off and exit_from_l2 for managing L2/L3 state transitions.
> > > > > > 
> > > > > > Typical L2 entry workflow:
> > > > > > 
> > > > > > 1. Transmit PME turn off signal to PCI devices.
> > > > > > 2. Await link entering L2_IDLE state.
> > > > > 
> > > > > AFAIK, typical workflow is to wait for PME_To_Ack.
> > > > 
> > > > 1 Already wait for PME_to_ACK,  2, just wait for link actual enter L2.
> > > > I think PCI RC needs some time to set link enter L2 after get ACK from
> > > > PME.
> > > > 
> > 
> > One more comment. If you transition the device to L2/L3, then it can loose power
> > if Vaux was not provided. In that case, can all the devices work after resume?
> > Most notably NVMe?
> 
> I have not hardware to do such test, NVMe driver will reinit everything after
> resume if no L1.1\L1.2 support. If there are L1.1\L1.2, NVME expect it leave
> at L1.2 at suspend to get better resume latency.
> 

To be precise, the NVMe driver will shut down the device if there is no ASPM
support and keep it in low power mode otherwise (there are other cases as well,
but we do not need to worry about them).

But here you are not checking for the ASPM state in the suspend path, and just
forcing the link to be in L2/L3 (thereby D3cold) even though the NVMe driver
may expect it to be in a low power state like ASPM/APST.

So you should only put the link into L2/L3 if there is no ASPM support.
Otherwise, you'll end up with bug reports when users connect NVMe devices
to it.

- Mani

> This API help remove duplicate codes and it can be improved gradually.
> 
> 
> > 
> > - Mani
> > 
> > 
> > -- 
> > மணிவண்ணன் சதாசிவம்
Bjorn Helgaas July 20, 2023, 4:20 p.m. UTC | #12
On Thu, Jul 20, 2023 at 09:37:38PM +0530, Manivannan Sadhasivam wrote:
> On Thu, Jul 20, 2023 at 10:37:36AM -0400, Frank Li wrote:
> > On Thu, Jul 20, 2023 at 07:55:09PM +0530, Manivannan Sadhasivam wrote:
> > > On Tue, Jul 18, 2023 at 03:34:26PM +0530, Manivannan Sadhasivam wrote:
> > > > On Mon, Jul 17, 2023 at 02:36:19PM -0400, Frank Li wrote:
> > > > > On Mon, Jul 17, 2023 at 10:15:26PM +0530, Manivannan Sadhasivam wrote:
> > > > > > On Wed, Apr 19, 2023 at 12:41:17PM -0400, Frank Li wrote:
> > > > > > > Introduced helper function dw_pcie_get_ltssm to retrieve SMLH_LTSS_STATE.
> > > > > > > Added API pme_turn_off and exit_from_l2 for managing L2/L3 state transitions.
> > > > > > > 
> > > > > > > Typical L2 entry workflow:
> > > > > > > 
> > > > > > > 1. Transmit PME turn off signal to PCI devices.
> > > > > > > 2. Await link entering L2_IDLE state.
> > > > > > 
> > > > > > AFAIK, typical workflow is to wait for PME_To_Ack.
> > > > > 
> > > > > 1 Already wait for PME_to_ACK,  2, just wait for link actual enter L2.
> > > > > I think PCI RC needs some time to set link enter L2 after get ACK from
> > > > > PME.
> > > 
> > > One more comment. If you transition the device to L2/L3, then it
> > > can lose power if Vaux was not provided. In that case, can all
> > > the devices work after resume?  Most notably NVMe?
> > 
> > I have not hardware to do such test, NVMe driver will reinit
> > everything after resume if no L1.1\L1.2 support. If there are
> > L1.1\L1.2, NVME expect it leave at L1.2 at suspend to get better
> > resume latency.
> 
> To be precise, NVMe driver will shutdown the device if there is no
> ASPM support and keep it in low power mode otherwise (there are
> other cases as well but we do not need to worry).
> 
> But here you are not checking for ASPM state in the suspend path,
> and just forcing the link to be in L2/L3 (thereby D3Cold) even
> though NVMe driver may expect it to be in low power state like
> ASPM/APST.
> 
> So you should only put the link to L2/L3 if there is no ASPM
> support. Otherwise, you'll ending up with bug reports when users
> connect NVMe to it.

Can you point me to the NVMe code that shuts down the device if
there's no ASPM support?  That sounds interesting and of interest to
other drivers that want to do suspend.

Bjorn
Frank Li July 20, 2023, 4:26 p.m. UTC | #13
On Thu, Jul 20, 2023 at 09:37:38PM +0530, Manivannan Sadhasivam wrote:
> On Thu, Jul 20, 2023 at 10:37:36AM -0400, Frank Li wrote:
> > On Thu, Jul 20, 2023 at 07:55:09PM +0530, Manivannan Sadhasivam wrote:
> > > On Tue, Jul 18, 2023 at 03:34:26PM +0530, Manivannan Sadhasivam wrote:
> > > > On Mon, Jul 17, 2023 at 02:36:19PM -0400, Frank Li wrote:
> > > > > On Mon, Jul 17, 2023 at 10:15:26PM +0530, Manivannan Sadhasivam wrote:
> > > > > > On Wed, Apr 19, 2023 at 12:41:17PM -0400, Frank Li wrote:
> > > > > > > Introduced helper function dw_pcie_get_ltssm to retrieve SMLH_LTSS_STATE.
> > > > > > > Added API pme_turn_off and exit_from_l2 for managing L2/L3 state transitions.
> > > > > > > 
> > > > > > > Typical L2 entry workflow:
> > > > > > > 
> > > > > > > 1. Transmit PME turn off signal to PCI devices.
> > > > > > > 2. Await link entering L2_IDLE state.
> > > > > > 
> > > > > > AFAIK, typical workflow is to wait for PME_To_Ack.
> > > > > 
> > > > > 1 Already wait for PME_to_ACK,  2, just wait for link actual enter L2.
> > > > > I think PCI RC needs some time to set link enter L2 after get ACK from
> > > > > PME.
> > > > > 
> > > 
> > > One more comment. If you transition the device to L2/L3, then it can loose power
> > > if Vaux was not provided. In that case, can all the devices work after resume?
> > > Most notably NVMe?
> > 
> > I have not hardware to do such test, NVMe driver will reinit everything after
> > resume if no L1.1\L1.2 support. If there are L1.1\L1.2, NVME expect it leave
> > at L1.2 at suspend to get better resume latency.
> > 
> 
> To be precise, NVMe driver will shutdown the device if there is no ASPM support
> and keep it in low power mode otherwise (there are other cases as well but we do
> not need to worry).

I suppose this should work, but I don't have hardware to test it now. NVMe
already sends a shutdown command to the SSD, which can then safely turn off.
After resume, that is most likely a cold reset.

> 
> But here you are not checking for ASPM state in the suspend path, and just
> forcing the link to be in L2/L3 (thereby D3Cold) even though NVMe driver may
> expect it to be in low power state like ASPM/APST.

This function is not called by default; the platform driver needs to call it as
needed. Actually, I think the PCI framework should handle the L1.2 and L2
cases: some devices or use cases want L1.2 to get better resume latency, and
some want L2 to get better power savings, but that is out of scope for these
patches.

This patch just handles the L2 case; as I remember, L1.2 is not supposed to
involve sending a PME at all.

> 
> So you should only put the link to L2/L3 if there is no ASPM support. Otherwise,
> you'll ending up with bug reports when users connect NVMe to it.

Platforms should choose whether or not to call this function according to their
use case. So far, I have not found a good method to let ASPM affect
suspend/resume.

> 
> - Mani
> 
> > This API help remove duplicate codes and it can be improved gradually.
> > 
> > 
> > > 
> > > - Mani
> > > 
> > > 
> > > -- 
> > > மணிவண்ணன் சதாசிவம்
> 
> -- 
> மணிவண்ணன் சதாசிவம்
Manivannan Sadhasivam July 20, 2023, 4:27 p.m. UTC | #14
On Thu, Jul 20, 2023 at 11:20:27AM -0500, Bjorn Helgaas wrote:
> On Thu, Jul 20, 2023 at 09:37:38PM +0530, Manivannan Sadhasivam wrote:
> > On Thu, Jul 20, 2023 at 10:37:36AM -0400, Frank Li wrote:
> > > On Thu, Jul 20, 2023 at 07:55:09PM +0530, Manivannan Sadhasivam wrote:
> > > > On Tue, Jul 18, 2023 at 03:34:26PM +0530, Manivannan Sadhasivam wrote:
> > > > > On Mon, Jul 17, 2023 at 02:36:19PM -0400, Frank Li wrote:
> > > > > > On Mon, Jul 17, 2023 at 10:15:26PM +0530, Manivannan Sadhasivam wrote:
> > > > > > > On Wed, Apr 19, 2023 at 12:41:17PM -0400, Frank Li wrote:
> > > > > > > > Introduced helper function dw_pcie_get_ltssm to retrieve SMLH_LTSS_STATE.
> > > > > > > > Added API pme_turn_off and exit_from_l2 for managing L2/L3 state transitions.
> > > > > > > > 
> > > > > > > > Typical L2 entry workflow:
> > > > > > > > 
> > > > > > > > 1. Transmit PME turn off signal to PCI devices.
> > > > > > > > 2. Await link entering L2_IDLE state.
> > > > > > > 
> > > > > > > AFAIK, typical workflow is to wait for PME_To_Ack.
> > > > > > 
> > > > > > 1 Already wait for PME_to_ACK,  2, just wait for link actual enter L2.
> > > > > > I think PCI RC needs some time to set link enter L2 after get ACK from
> > > > > > PME.
> > > > 
> > > > One more comment. If you transition the device to L2/L3, then it
> > > > can lose power if Vaux was not provided. In that case, can all
> > > > the devices work after resume?  Most notably NVMe?
> > > 
> > > I have not hardware to do such test, NVMe driver will reinit
> > > everything after resume if no L1.1\L1.2 support. If there are
> > > L1.1\L1.2, NVME expect it leave at L1.2 at suspend to get better
> > > resume latency.
> > 
> > To be precise, NVMe driver will shutdown the device if there is no
> > ASPM support and keep it in low power mode otherwise (there are
> > other cases as well but we do not need to worry).
> > 
> > But here you are not checking for ASPM state in the suspend path,
> > and just forcing the link to be in L2/L3 (thereby D3Cold) even
> > though NVMe driver may expect it to be in low power state like
> > ASPM/APST.
> > 
> > So you should only put the link to L2/L3 if there is no ASPM
> > support. Otherwise, you'll ending up with bug reports when users
> > connect NVMe to it.
> 
> Can you point me to the NVMe code that shuts down the device if
> there's no ASPM support?  That sounds interesting and of interest to
> other drivers that want to do suspend.
> 

drivers/nvme/host/pci.c #3185

Note that with ACPI-based systems and for a few SSDs the behavior may change
(check the NVME_QUIRK_SIMPLE_SUSPEND flag).
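
Paraphrasing the check in nvme_suspend() from memory (please check the source
for the exact condition), it bails out to a full controller shutdown roughly
like this:

	if (pm_suspend_via_firmware() || !ndev->ctrl.npss ||
	    !pcie_aspm_enabled(pdev) ||
	    (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
		return nvme_disable_prepare_reset(ndev, true);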

- Mani

> Bjorn
Manivannan Sadhasivam July 20, 2023, 4:43 p.m. UTC | #15
On Thu, Jul 20, 2023 at 12:26:38PM -0400, Frank Li wrote:
> On Thu, Jul 20, 2023 at 09:37:38PM +0530, Manivannan Sadhasivam wrote:
> > On Thu, Jul 20, 2023 at 10:37:36AM -0400, Frank Li wrote:
> > > On Thu, Jul 20, 2023 at 07:55:09PM +0530, Manivannan Sadhasivam wrote:
> > > > On Tue, Jul 18, 2023 at 03:34:26PM +0530, Manivannan Sadhasivam wrote:
> > > > > On Mon, Jul 17, 2023 at 02:36:19PM -0400, Frank Li wrote:
> > > > > > On Mon, Jul 17, 2023 at 10:15:26PM +0530, Manivannan Sadhasivam wrote:
> > > > > > > On Wed, Apr 19, 2023 at 12:41:17PM -0400, Frank Li wrote:
> > > > > > > > Introduced helper function dw_pcie_get_ltssm to retrieve SMLH_LTSS_STATE.
> > > > > > > > Added API pme_turn_off and exit_from_l2 for managing L2/L3 state transitions.
> > > > > > > > 
> > > > > > > > Typical L2 entry workflow:
> > > > > > > > 
> > > > > > > > 1. Transmit PME turn off signal to PCI devices.
> > > > > > > > 2. Await link entering L2_IDLE state.
> > > > > > > 
> > > > > > > AFAIK, typical workflow is to wait for PME_To_Ack.
> > > > > > 
> > > > > > 1 Already wait for PME_to_ACK,  2, just wait for link actual enter L2.
> > > > > > I think PCI RC needs some time to set link enter L2 after get ACK from
> > > > > > PME.
> > > > > > 
> > > > 
> > > > One more comment. If you transition the device to L2/L3, then it can loose power
> > > > if Vaux was not provided. In that case, can all the devices work after resume?
> > > > Most notably NVMe?
> > > 
> > > I have not hardware to do such test, NVMe driver will reinit everything after
> > > resume if no L1.1\L1.2 support. If there are L1.1\L1.2, NVME expect it leave
> > > at L1.2 at suspend to get better resume latency.
> > > 
> > 
> > To be precise, NVMe driver will shutdown the device if there is no ASPM support
> > and keep it in low power mode otherwise (there are other cases as well but we do
> > not need to worry).
> 
> I supposed this should work. but I have not hardware to test it now. NMVE already
> sent shut down command to SSD, which can safely turn off. after resume, that most
> likely a cold reset.
> 

No, it won't work, and that's the reason the Qcom platforms do not transition
the link to the L2/L3 state during suspend. This applies to other platforms,
including layerscape, as well.

> > 
> > But here you are not checking for ASPM state in the suspend path, and just
> > forcing the link to be in L2/L3 (thereby D3Cold) even though NVMe driver may
> > expect it to be in low power state like ASPM/APST.
> 
> This function is not called defaultly and need platform driver to call it as need.
> Actually, I think PCI framework should handle L1.2 and L2 case, some devices
> or user case want to L1.2 to get better resume latency, some devices want to
> L2 to get better power saving, which out of scope of this patches.
> 

I'm referring to the platform where these helper functions are being used,
which is layerscape. It doesn't matter whether you test this series with NVMe
or not; it will not work unless you disable ASPM.

> This patch just handle L2 case, I remember L1.2 don't expect send PME at all.
> 
> > 
> > So you should only put the link to L2/L3 if there is no ASPM support. Otherwise,
> > you'll ending up with bug reports when users connect NVMe to it.
> 
> Platform should choose call or no call this function according to their
> user case. So far, I have not found a good mathod to let ASPM to affect
> suspend/resume. 
> 

You are missing my point here. If any platform decides to use these helper
functions, they will face problems with NVMe. So please add a check for ASPM
state before doing any L2/L3 handling.

I agree that it may not be optimal w.r.t power savings, but the PCIe controller
driver should work for all devices.

- Mani

> > 
> > - Mani
> > 
> > > This API help remove duplicate codes and it can be improved gradually.
> > > 
> > > 
> > > > 
> > > > - Mani
> > > > 
> > > > 
> > > > -- 
> > > > மணிவண்ணன் சதாசிவம்
> > 
> > -- 
> > மணிவண்ணன் சதாசிவம்
Frank Li July 20, 2023, 4:59 p.m. UTC | #16
On Thu, Jul 20, 2023 at 10:13:18PM +0530, Manivannan Sadhasivam wrote:
> On Thu, Jul 20, 2023 at 12:26:38PM -0400, Frank Li wrote:
> > On Thu, Jul 20, 2023 at 09:37:38PM +0530, Manivannan Sadhasivam wrote:
> > > On Thu, Jul 20, 2023 at 10:37:36AM -0400, Frank Li wrote:
> > > > On Thu, Jul 20, 2023 at 07:55:09PM +0530, Manivannan Sadhasivam wrote:
> > > > > On Tue, Jul 18, 2023 at 03:34:26PM +0530, Manivannan Sadhasivam wrote:
> > > > > > On Mon, Jul 17, 2023 at 02:36:19PM -0400, Frank Li wrote:
> > > > > > > On Mon, Jul 17, 2023 at 10:15:26PM +0530, Manivannan Sadhasivam wrote:
> > > > > > > > On Wed, Apr 19, 2023 at 12:41:17PM -0400, Frank Li wrote:
> > > > > > > > > Introduced helper function dw_pcie_get_ltssm to retrieve SMLH_LTSS_STATE.
> > > > > > > > > Added API pme_turn_off and exit_from_l2 for managing L2/L3 state transitions.
> > > > > > > > > 
> > > > > > > > > Typical L2 entry workflow:
> > > > > > > > > 
> > > > > > > > > 1. Transmit PME turn off signal to PCI devices.
> > > > > > > > > 2. Await link entering L2_IDLE state.
> > > > > > > > 
> > > > > > > > AFAIK, typical workflow is to wait for PME_To_Ack.
> > > > > > > 
> > > > > > > 1 already waits for PME_to_ACK; 2 just waits for the link to actually enter L2.
> > > > > > > I think the PCI RC needs some time to bring the link into L2 after getting the
> > > > > > > ACK from PME.
> > > > > > > 
> > > > > 
> > > > > One more comment. If you transition the device to L2/L3, then it can lose power
> > > > > if Vaux was not provided. In that case, can all the devices work after resume?
> > > > > Most notably NVMe?
> > > > 
> > > > I don't have hardware to do such a test. The NVMe driver will reinit everything after
> > > > resume if there is no L1.1/L1.2 support. If there is L1.1/L1.2, NVMe expects to be
> > > > left in L1.2 at suspend to get better resume latency.
> > > > 
> > > 
> > > To be precise, the NVMe driver will shut down the device if there is no ASPM support
> > > and keep it in low power mode otherwise (there are other cases as well but we do
> > > not need to worry).
> > 
> > I supposed this should work, but I don't have hardware to test it now. NVMe already
> > sent a shutdown command to the SSD, which can safely turn off. After resume, that is
> > most likely a cold reset.
> > 
> 
> NO, it won't work and that's the reason the Qcom platforms are not transitioning
> the link to L2/L3 state during suspend. This applies to other platforms
> including layerscape as well.
> 
> > > 
> > > But here you are not checking for ASPM state in the suspend path, and just
> > > forcing the link to be in L2/L3 (thereby D3Cold) even though NVMe driver may
> > > expect it to be in low power state like ASPM/APST.
> > 
> > This function is not called by default and needs the platform driver to call it as needed.
> > Actually, I think the PCI framework should handle the L1.2 and L2 cases: some devices
> > or use cases want L1.2 to get better resume latency, some devices want
> > L2 to get better power saving, which is out of scope of these patches.
> > 
> 
> I'm referring to the platform where these helper functions are being used which
> is layerscape. It doesn't matter whether you test this series with NVMe or not,
> it will not work unless you disable ASPM.

I tested with NVMe. You are right, ASPM is disabled on the layerscape platform.

> 
> > This patch just handles the L2 case; I remember L1.2 doesn't expect a PME to be sent at all.
> > 
> > > 
> > > So you should only put the link to L2/L3 if there is no ASPM support. Otherwise,
> > > you'll end up with bug reports when users connect NVMe to it.
> > 
> > Platforms should choose to call or not call this function according to their
> > use case. So far, I have not found a good method to let ASPM affect
> > suspend/resume.
> > 
> 
> You are missing my point here. If any platform decides to use these helper
> functions, they will face problems with NVMe. So please add a check for ASPM
> state before doing any L2/L3 handling.
> 
> I agree that it may not be optimal w.r.t power savings, but the PCIe controller
> driver should work for all devices.

I can add an ASPM check here, but I hope there will be a flag in the future which
can show whether the PCI controller is expected to enter L2.
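
Something like this minimal sketch is what I have in mind (untested; it assumes
the root port sits at 00.0 of the root bus reachable via pp->bridge, and uses
pcie_aspm_enabled() from <linux/pci.h>; the helper name is made up):

static bool dw_pcie_link_aspm_enabled(struct dw_pcie *pci)
{
	struct pci_dev *root, *child;
	bool enabled = false;

	root = pci_get_slot(pci->pp.bridge->bus, PCI_DEVFN(0, 0));
	if (!root)
		return false;

	/* ASPM state lives on the link, so query the devices below the root port */
	if (root->subordinate) {
		list_for_each_entry(child, &root->subordinate->devices, bus_list) {
			if (pcie_aspm_enabled(child)) {
				enabled = true;
				break;
			}
		}
	}

	pci_dev_put(root);

	return enabled;
}

Then dw_pcie_suspend_noirq() could bail out early and leave the link in the
ASPM managed state instead of forcing L2/L3:

	if (dw_pcie_link_aspm_enabled(pci))
		return 0;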

Frank

> 
> - Mani
> 
> > > 
> > > - Mani
> > > 
> > > > This API helps remove duplicated code, and it can be improved gradually.
> > > > 
> > > > 
> > > > > 
> > > > > - Mani
> > > > > 
> > > > > 
> > > > > -- 
> > > > > மணிவண்ணன் சதாசிவம்
> > > 
> > > -- 
> > > மணிவண்ணன் சதாசிவம்
> 
> -- 
> மணிவண்ணன் சதாசிவம்
Bjorn Helgaas July 20, 2023, 6:35 p.m. UTC | #17
On Thu, Jul 20, 2023 at 09:57:58PM +0530, Manivannan Sadhasivam wrote:
> On Thu, Jul 20, 2023 at 11:20:27AM -0500, Bjorn Helgaas wrote:
> > On Thu, Jul 20, 2023 at 09:37:38PM +0530, Manivannan Sadhasivam wrote:
> > ...

> > > To be precise, the NVMe driver will shut down the device if there is
> > > no ASPM support and keep it in low power mode otherwise (there
> > > are other cases as well but we do not need to worry).
> > > 
> > > But here you are not checking for ASPM state in the suspend
> > > path, and just forcing the link to be in L2/L3 (thereby D3Cold)
> > > even though NVMe driver may expect it to be in low power state
> > > like ASPM/APST.
> > > 
> > > So you should only put the link to L2/L3 if there is no ASPM
> > > support. Otherwise, you'll end up with bug reports when users
> > > connect NVMe to it.
> > 
> > Can you point me to the NVMe code that shuts down the device if
> > there's no ASPM support?  That sounds interesting and of interest
> > to other drivers that want to do suspend.
> 
> drivers/nvme/host/pci.c #3185
> 
> Note that, with ACPI based systems and for a few SSDs the behavior
> may change (check NVME_QUIRK_SIMPLE_SUSPEND flag).

For posterity, since the filename and line number may change:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/nvme/host/pci.c?id=v6.4#n3185

  static int nvme_suspend(struct device *dev)
  {
    struct pci_dev *pdev = to_pci_dev(dev);
    struct nvme_dev *ndev = pci_get_drvdata(pdev);
    struct nvme_ctrl *ctrl = &ndev->ctrl;
    int ret = -EBUSY;

    ndev->last_ps = U32_MAX;

    /*
     * The platform does not remove power for a kernel managed suspend so
     * use host managed nvme power settings for lowest idle power if
     * possible. This should have quicker resume latency than a full device
     * shutdown.  But if the firmware is involved after the suspend or the
     * device does not support any non-default power states, shut down the
     * device fully.
     *
     * If ASPM is not enabled for the device, shut down the device and allow
     * the PCI bus layer to put it into D3 in order to take the PCIe link
     * down, so as to allow the platform to achieve its minimum low-power
     * state (which may not be possible if the link is up).
     */
    if (pm_suspend_via_firmware() || !ctrl->npss ||
        !pcie_aspm_enabled(pdev) ||
        (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
            return nvme_disable_prepare_reset(ndev, true);

    nvme_start_freeze(ctrl);
    nvme_wait_freeze(ctrl);
    nvme_sync_queues(ctrl);
    ...

Added by 4eaefe8c621c ("nvme-pci: Allow PCI bus-level PM to be used if
ASPM is disabled"): https://git.kernel.org/linus/4eaefe8c621c
Shawn Lin July 21, 2023, 2:09 a.m. UTC | #18
On 2023/7/21 0:07, Manivannan Sadhasivam wrote:
> On Thu, Jul 20, 2023 at 10:37:36AM -0400, Frank Li wrote:
>> On Thu, Jul 20, 2023 at 07:55:09PM +0530, Manivannan Sadhasivam wrote:
>>> On Tue, Jul 18, 2023 at 03:34:26PM +0530, Manivannan Sadhasivam wrote:
>>>> On Mon, Jul 17, 2023 at 02:36:19PM -0400, Frank Li wrote:
>>>>> On Mon, Jul 17, 2023 at 10:15:26PM +0530, Manivannan Sadhasivam wrote:
>>>>>> On Wed, Apr 19, 2023 at 12:41:17PM -0400, Frank Li wrote:
>>>>>>> Introduced helper function dw_pcie_get_ltssm to retrieve SMLH_LTSS_STATE.
>>>>>>> Added API pme_turn_off and exit_from_l2 for managing L2/L3 state transitions.
>>>>>>>
>>>>>>> Typical L2 entry workflow:
>>>>>>>
>>>>>>> 1. Transmit PME turn off signal to PCI devices.
>>>>>>> 2. Await link entering L2_IDLE state.
>>>>>>
>>>>>> AFAIK, typical workflow is to wait for PME_To_Ack.
>>>>>
>>>>> 1 already waits for PME_to_ACK; 2 just waits for the link to actually enter L2.
>>>>> I think the PCI RC needs some time to bring the link into L2 after getting the
>>>>> ACK from PME.
>>>>>
>>>
>>> One more comment. If you transition the device to L2/L3, then it can lose power
>>> if Vaux was not provided. In that case, can all the devices work after resume?
>>> Most notably NVMe?
>>
>> I don't have hardware to do such a test. The NVMe driver will reinit everything after
>> resume if there is no L1.1/L1.2 support. If there is L1.1/L1.2, NVMe expects to be
>> left in L1.2 at suspend to get better resume latency.
>>
> 
> To be precise, the NVMe driver will shut down the device if there is no ASPM support
> and keep it in low power mode otherwise (there are other cases as well but we do
> not need to worry).
> 
> But here you are not checking for ASPM state in the suspend path, and just
> forcing the link to be in L2/L3 (thereby D3Cold) even though NVMe driver may
> expect it to be in low power state like ASPM/APST.
> 
> So you should only put the link to L2/L3 if there is no ASPM support. Otherwise,
> you'll end up with bug reports when users connect NVMe to it.
> 


On this topic, it's very interesting to look at

drivers/pci/controller/dwc/pcie-tegra194.c


static int tegra_pcie_dw_suspend_noirq(struct device *dev)
{
         struct tegra_pcie_dw *pcie = dev_get_drvdata(dev);

         if (!pcie->link_state)
                 return 0;

         tegra_pcie_downstream_dev_to_D0(pcie);
         tegra_pcie_dw_pme_turnoff(pcie);
         tegra_pcie_unconfig_controller(pcie);

         return 0;
}

It brings all the downstream components back to D0 (so I assume the link was
indeed in L0) before sending PME, aiming to enter L2.

> - Mani
> 
>> This API helps remove duplicated code, and it can be improved gradually.
>>
>>
>>>
>>> - Mani
>>>
>>>
>>> -- 
>>> மணிவண்ணன் சதாசிவம்
>
Frank Li July 21, 2023, 2:10 p.m. UTC | #19
On Fri, Jul 21, 2023 at 10:09:18AM +0800, Shawn Lin wrote:
> 
> On 2023/7/21 0:07, Manivannan Sadhasivam wrote:
> > On Thu, Jul 20, 2023 at 10:37:36AM -0400, Frank Li wrote:
> > > On Thu, Jul 20, 2023 at 07:55:09PM +0530, Manivannan Sadhasivam wrote:
> > > > On Tue, Jul 18, 2023 at 03:34:26PM +0530, Manivannan Sadhasivam wrote:
> > > > > On Mon, Jul 17, 2023 at 02:36:19PM -0400, Frank Li wrote:
> > > > > > On Mon, Jul 17, 2023 at 10:15:26PM +0530, Manivannan Sadhasivam wrote:
> > > > > > > On Wed, Apr 19, 2023 at 12:41:17PM -0400, Frank Li wrote:
> > > > > > > > Introduced helper function dw_pcie_get_ltssm to retrieve SMLH_LTSS_STATE.
> > > > > > > > Added API pme_turn_off and exit_from_l2 for managing L2/L3 state transitions.
> > > > > > > > 
> > > > > > > > Typical L2 entry workflow:
> > > > > > > > 
> > > > > > > > 1. Transmit PME turn off signal to PCI devices.
> > > > > > > > 2. Await link entering L2_IDLE state.
> > > > > > > 
> > > > > > > AFAIK, typical workflow is to wait for PME_To_Ack.
> > > > > > 
> > > > > > 1 already waits for PME_to_ACK; 2 just waits for the link to actually enter L2.
> > > > > > I think the PCI RC needs some time to bring the link into L2 after getting the
> > > > > > ACK from PME.
> > > > > > 
> > > > 
> > > > One more comment. If you transition the device to L2/L3, then it can lose power
> > > > if Vaux was not provided. In that case, can all the devices work after resume?
> > > > Most notably NVMe?
> > > 
> > > I don't have hardware to do such a test. The NVMe driver will reinit everything after
> > > resume if there is no L1.1/L1.2 support. If there is L1.1/L1.2, NVMe expects to be
> > > left in L1.2 at suspend to get better resume latency.
> > > 
> > 
> > To be precise, the NVMe driver will shut down the device if there is no ASPM support
> > and keep it in low power mode otherwise (there are other cases as well but we do
> > not need to worry).
> > 
> > But here you are not checking for ASPM state in the suspend path, and just
> > forcing the link to be in L2/L3 (thereby D3Cold) even though NVMe driver may
> > expect it to be in low power state like ASPM/APST.
> > 
> > So you should only put the link to L2/L3 if there is no ASPM support. Otherwise,
> > you'll end up with bug reports when users connect NVMe to it.
> > 
> 
> 
> On this topic, it's very interesting to look at
> 
> drivers/pci/controller/dwc/pcie-tegra194.c
> 
> 
> static int tegra_pcie_dw_suspend_noirq(struct device *dev)
> {
>         struct tegra_pcie_dw *pcie = dev_get_drvdata(dev);
> 
>         if (!pcie->link_state)
>                 return 0;
> 
>         tegra_pcie_downstream_dev_to_D0(pcie);
>         tegra_pcie_dw_pme_turnoff(pcie);
>         tegra_pcie_unconfig_controller(pcie);
> 
>         return 0;
> }
> 
> It brings all the downstream components back to D0 (so I assume the link was
> indeed in L0) before sending PME, aiming to enter L2.

If the current state is L1.1 or L1.2, the hardware can automatically re-enter
D0/L0 when there is any PCI bus activity, including PME. I suppose
tegra_pcie_downstream_dev_to_D0() just makes sure to come back from L2/L3,
which may have been entered previously via runtime PM, or for some other reason.

The NVMe ASPM problem is (at least when I debugged it on another platform about
1 year ago):

    1. NVMe will not release the MSI interrupt during suspend.
    2. The PCI controller enters L2 at suspend_noirq();
    3. CPU hotplug tries to bring down the secondary cores (CORE1, CORE2, ...)
    4. The GIC tries to disable the MSI irq by writing config space.
    5. Panic here, because config space can't be accessed in L2.

I suppose Tegra should have this problem when ASPM is enabled with NVMe devices.

Frank
> 
> > - Mani
> > 
> > > This API helps remove duplicated code, and it can be improved gradually.
> > > 
> > > 
> > > > 
> > > > - Mani
> > > > 
> > > > 
> > > > -- 
> > > > மணிவண்ணன் சதாசிவம்
> >
Manivannan Sadhasivam July 21, 2023, 2:54 p.m. UTC | #20
On Fri, Jul 21, 2023 at 10:09:18AM +0800, Shawn Lin wrote:
> 
> On 2023/7/21 0:07, Manivannan Sadhasivam wrote:
> > On Thu, Jul 20, 2023 at 10:37:36AM -0400, Frank Li wrote:
> > > On Thu, Jul 20, 2023 at 07:55:09PM +0530, Manivannan Sadhasivam wrote:
> > > > On Tue, Jul 18, 2023 at 03:34:26PM +0530, Manivannan Sadhasivam wrote:
> > > > > On Mon, Jul 17, 2023 at 02:36:19PM -0400, Frank Li wrote:
> > > > > > On Mon, Jul 17, 2023 at 10:15:26PM +0530, Manivannan Sadhasivam wrote:
> > > > > > > On Wed, Apr 19, 2023 at 12:41:17PM -0400, Frank Li wrote:
> > > > > > > > Introduced helper function dw_pcie_get_ltssm to retrieve SMLH_LTSS_STATE.
> > > > > > > > Added API pme_turn_off and exit_from_l2 for managing L2/L3 state transitions.
> > > > > > > > 
> > > > > > > > Typical L2 entry workflow:
> > > > > > > > 
> > > > > > > > 1. Transmit PME turn off signal to PCI devices.
> > > > > > > > 2. Await link entering L2_IDLE state.
> > > > > > > 
> > > > > > > AFAIK, typical workflow is to wait for PME_To_Ack.
> > > > > > 
> > > > > > 1 already waits for PME_to_ACK; 2 just waits for the link to actually enter L2.
> > > > > > I think the PCI RC needs some time to bring the link into L2 after getting the
> > > > > > ACK from PME.
> > > > > > 
> > > > 
> > > > One more comment. If you transition the device to L2/L3, then it can lose power
> > > > if Vaux was not provided. In that case, can all the devices work after resume?
> > > > Most notably NVMe?
> > > 
> > > I don't have hardware to do such a test. The NVMe driver will reinit everything after
> > > resume if there is no L1.1/L1.2 support. If there is L1.1/L1.2, NVMe expects to be
> > > left in L1.2 at suspend to get better resume latency.
> > > 
> > 
> > To be precise, the NVMe driver will shut down the device if there is no ASPM support
> > and keep it in low power mode otherwise (there are other cases as well but we do
> > not need to worry).
> > 
> > But here you are not checking for ASPM state in the suspend path, and just
> > forcing the link to be in L2/L3 (thereby D3Cold) even though NVMe driver may
> > expect it to be in low power state like ASPM/APST.
> > 
> > So you should only put the link to L2/L3 if there is no ASPM support. Otherwise,
> > you'll end up with bug reports when users connect NVMe to it.
> > 
> 
> 
> On this topic, it's very interesting to look at
> 
> drivers/pci/controller/dwc/pcie-tegra194.c
> 
> 
> static int tegra_pcie_dw_suspend_noirq(struct device *dev)
> {
>         struct tegra_pcie_dw *pcie = dev_get_drvdata(dev);
> 
>         if (!pcie->link_state)
>                 return 0;
> 
>         tegra_pcie_downstream_dev_to_D0(pcie);
>         tegra_pcie_dw_pme_turnoff(pcie);
>         tegra_pcie_unconfig_controller(pcie);
> 
>         return 0;
> }
> 
> It brings all the downstream components back to D0 (so I assume the link was
> indeed in L0) before sending PME, aiming to enter L2.
> 

The behavior is Tegra specific as mentioned in the comment in
tegra_pcie_downstream_dev_to_D0():

        /*
         * link doesn't go into L2 state with some of the endpoints with Tegra
         * if they are not in D0 state. So, need to make sure that immediate
         * downstream devices are in D0 state before sending PME_TurnOff to put
         * link into L2 state.
         * This is as per PCI Express Base r4.0 v1.0 September 27-2017,
         * 5.2 Link State Power Management (Page #428).
         */

But I couldn't find the behavior documented in the spec as per the comment. Not
sure if I'm reading it wrong!

Also, I can confirm from previous interactions with the linux-nvme list that
Tegra also faces the suspend issue with NVMe devices.

- Mani

> > - Mani
> > 
> > > This API helps remove duplicated code, and it can be improved gradually.
> > > 
> > > 
> > > > 
> > > > - Mani
> > > > 
> > > > 
> > > > -- 
> > > > மணிவண்ணன் சதாசிவம்
> >
Manivannan Sadhasivam July 21, 2023, 4:07 p.m. UTC | #21
On Fri, Jul 21, 2023 at 10:10:11AM -0400, Frank Li wrote:
> On Fri, Jul 21, 2023 at 10:09:18AM +0800, Shawn Lin wrote:
> > 
> > On 2023/7/21 0:07, Manivannan Sadhasivam wrote:
> > > On Thu, Jul 20, 2023 at 10:37:36AM -0400, Frank Li wrote:
> > > > On Thu, Jul 20, 2023 at 07:55:09PM +0530, Manivannan Sadhasivam wrote:
> > > > > On Tue, Jul 18, 2023 at 03:34:26PM +0530, Manivannan Sadhasivam wrote:
> > > > > > On Mon, Jul 17, 2023 at 02:36:19PM -0400, Frank Li wrote:
> > > > > > > On Mon, Jul 17, 2023 at 10:15:26PM +0530, Manivannan Sadhasivam wrote:
> > > > > > > > On Wed, Apr 19, 2023 at 12:41:17PM -0400, Frank Li wrote:
> > > > > > > > > Introduced helper function dw_pcie_get_ltssm to retrieve SMLH_LTSS_STATE.
> > > > > > > > > Added API pme_turn_off and exit_from_l2 for managing L2/L3 state transitions.
> > > > > > > > > 
> > > > > > > > > Typical L2 entry workflow:
> > > > > > > > > 
> > > > > > > > > 1. Transmit PME turn off signal to PCI devices.
> > > > > > > > > 2. Await link entering L2_IDLE state.
> > > > > > > > 
> > > > > > > > AFAIK, typical workflow is to wait for PME_To_Ack.
> > > > > > > 
> > > > > > > 1 already waits for PME_to_ACK; 2 just waits for the link to actually enter L2.
> > > > > > > I think the PCI RC needs some time to bring the link into L2 after getting the
> > > > > > > ACK from PME.
> > > > > > > 
> > > > > 
> > > > > One more comment. If you transition the device to L2/L3, then it can lose power
> > > > > if Vaux was not provided. In that case, can all the devices work after resume?
> > > > > Most notably NVMe?
> > > > 
> > > > I don't have hardware to do such a test. The NVMe driver will reinit everything after
> > > > resume if there is no L1.1/L1.2 support. If there is L1.1/L1.2, NVMe expects to be
> > > > left in L1.2 at suspend to get better resume latency.
> > > > 
> > > 
> > > To be precise, the NVMe driver will shut down the device if there is no ASPM support
> > > and keep it in low power mode otherwise (there are other cases as well but we do
> > > not need to worry).
> > > 
> > > But here you are not checking for ASPM state in the suspend path, and just
> > > forcing the link to be in L2/L3 (thereby D3Cold) even though NVMe driver may
> > > expect it to be in low power state like ASPM/APST.
> > > 
> > > So you should only put the link to L2/L3 if there is no ASPM support. Otherwise,
> > > you'll end up with bug reports when users connect NVMe to it.
> > > 
> > 
> > 
> > On this topic, it's very interesting to look at
> > 
> > drivers/pci/controller/dwc/pcie-tegra194.c
> > 
> > 
> > static int tegra_pcie_dw_suspend_noirq(struct device *dev)
> > {
> >         struct tegra_pcie_dw *pcie = dev_get_drvdata(dev);
> > 
> >         if (!pcie->link_state)
> >                 return 0;
> > 
> >         tegra_pcie_downstream_dev_to_D0(pcie);
> >         tegra_pcie_dw_pme_turnoff(pcie);
> >         tegra_pcie_unconfig_controller(pcie);
> > 
> >         return 0;
> > }
> > 
> > It brings all the downstream components back to D0 (so I assume the link was
> > indeed in L0) before sending PME, aiming to enter L2.
> 
> If the current state is L1.1 or L1.2, the hardware can automatically re-enter
> D0/L0 when there is any PCI bus activity, including PME. I suppose
> tegra_pcie_downstream_dev_to_D0() just makes sure to come back from L2/L3,
> which may have been entered previously via runtime PM, or for some other reason.
> 
> The NVMe ASPM problem is (at least when I debugged it on another platform about
> 1 year ago):
> 
>     1. NVMe will not release the MSI interrupt during suspend.
>     2. The PCI controller enters L2 at suspend_noirq();
>     3. CPU hotplug tries to bring down the secondary cores (CORE1, CORE2, ...)
>     4. The GIC tries to disable the MSI irq by writing config space.

Just for the record, this will only happen during deep sleep (s2ram) where the
CPUs are powered down (including the boot CPU).

>     5. Panic here, because config space can't be accessed in L2.
> 
> I suppose Tegra should have this problem when ASPM is enabled with NVMe devices.
> 

The NVMe suspend issue has several facets:

If NVMe is powered down during suspend, it will result in considerable power
savings. But at the same time, the suspend should not happen too frequently as
it may deteriorate the lifetime of the device. Most of the recent NVMe devices
have 2M power cycles (only).

We can work around the above lifetime issue by powering down the device only
during s2ram. It will work great for Laptop use cases if s2ram is supported.
Unfortunately, not all Laptops support s2ram though. And if the device is
powered down during s2idle, it will hit the above life time issue when it is
used in Android platforms such as Tablets (even future mobile phones?) which
doesn't support s2ram.

So I'm thinking of the following options to address the issue(s):

1. Modify the NVMe driver to power down the device during s2ram (the driver can
use the global pm_suspend_target_state flag to detect the suspend state) and use
the same logic to put the link into L2/L3 state in the PCIe controller drivers.
For s2idle, maintain both the device and link in low power states.

2. Get the power management decision from the userspace and use that to decide
the power down logic in s2idle for both NVMe and PCIe drivers. This will ensure
that the NVMe device will be powered down in suitable usecases like Laptop
without s2ram and kept in low power states for Android platforms. But this is
quite an involved task and I don't know if it is possible at all.
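
For option 1, the controller-side gate could be as small as the sketch below
(untested; pm_suspend_target_state comes from <linux/suspend.h>, while the
ls_pcie_* name and the pcie->pci member are made up for illustration):

static int ls_pcie_suspend_noirq(struct device *dev)
{
	struct ls_pcie *pcie = dev_get_drvdata(dev);

	/*
	 * Force the link into L2/L3 only for s2ram, where power will be
	 * removed anyway. For s2idle, leave the device and the link in
	 * the ASPM managed low power states.
	 */
	if (pm_suspend_target_state != PM_SUSPEND_MEM)
		return 0;

	return dw_pcie_suspend_noirq(pcie->pci);
}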

I'm just dumping my thoughts here. And I plan to initiate a discussion with NVMe/
power folks soon. Maybe during Plumbers?

- Mani

> Frank
> > 
> > > - Mani
> > > 
> > > > This API help remove duplicate codes and it can be improved gradually.
> > > > 
> > > > 
> > > > > 
> > > > > - Mani
> > > > > 
> > > > > 
> > > > > -- 
> > > > > மணிவண்ணன் சதாசிவம்
> > >
Frank Li July 21, 2023, 4:21 p.m. UTC | #22
On Fri, Jul 21, 2023 at 09:37:55PM +0530, Manivannan Sadhasivam wrote:
> On Fri, Jul 21, 2023 at 10:10:11AM -0400, Frank Li wrote:
> > On Fri, Jul 21, 2023 at 10:09:18AM +0800, Shawn Lin wrote:
> > > 
> > > On 2023/7/21 0:07, Manivannan Sadhasivam wrote:
> > > > On Thu, Jul 20, 2023 at 10:37:36AM -0400, Frank Li wrote:
> > > > > On Thu, Jul 20, 2023 at 07:55:09PM +0530, Manivannan Sadhasivam wrote:
> > > > > > On Tue, Jul 18, 2023 at 03:34:26PM +0530, Manivannan Sadhasivam wrote:
> > > > > > > On Mon, Jul 17, 2023 at 02:36:19PM -0400, Frank Li wrote:
> > > > > > > > On Mon, Jul 17, 2023 at 10:15:26PM +0530, Manivannan Sadhasivam wrote:
> > > > > > > > > On Wed, Apr 19, 2023 at 12:41:17PM -0400, Frank Li wrote:
> > > > > > > > > > Introduced helper function dw_pcie_get_ltssm to retrieve SMLH_LTSS_STATE.
> > > > > > > > > > Added API pme_turn_off and exit_from_l2 for managing L2/L3 state transitions.
> > > > > > > > > > 
> > > > > > > > > > Typical L2 entry workflow:
> > > > > > > > > > 
> > > > > > > > > > 1. Transmit PME turn off signal to PCI devices.
> > > > > > > > > > 2. Await link entering L2_IDLE state.
> > > > > > > > > 
> > > > > > > > > AFAIK, typical workflow is to wait for PME_To_Ack.
> > > > > > > > 
> > > > > > > > 1 already waits for PME_to_ACK; 2 just waits for the link to actually enter L2.
> > > > > > > > I think the PCI RC needs some time to bring the link into L2 after getting the
> > > > > > > > ACK from PME.
> > > > > > > > 
> > > > > > 
> > > > > > One more comment. If you transition the device to L2/L3, then it can lose power
> > > > > > if Vaux was not provided. In that case, can all the devices work after resume?
> > > > > > Most notably NVMe?
> > > > > 
> > > > > I don't have hardware to do such a test. The NVMe driver will reinit everything after
> > > > > resume if there is no L1.1/L1.2 support. If there is L1.1/L1.2, NVMe expects to be
> > > > > left in L1.2 at suspend to get better resume latency.
> > > > > 
> > > > 
> > > > To be precise, the NVMe driver will shut down the device if there is no ASPM support
> > > > and keep it in low power mode otherwise (there are other cases as well but we do
> > > > not need to worry).
> > > > 
> > > > But here you are not checking for ASPM state in the suspend path, and just
> > > > forcing the link to be in L2/L3 (thereby D3Cold) even though NVMe driver may
> > > > expect it to be in low power state like ASPM/APST.
> > > > 
> > > > So you should only put the link to L2/L3 if there is no ASPM support. Otherwise,
> > > > you'll end up with bug reports when users connect NVMe to it.
> > > > 
> > > 
> > > 
> > > On this topic, it's very interesting to look at
> > > 
> > > drivers/pci/controller/dwc/pcie-tegra194.c
> > > 
> > > 
> > > static int tegra_pcie_dw_suspend_noirq(struct device *dev)
> > > {
> > >         struct tegra_pcie_dw *pcie = dev_get_drvdata(dev);
> > > 
> > >         if (!pcie->link_state)
> > >                 return 0;
> > > 
> > >         tegra_pcie_downstream_dev_to_D0(pcie);
> > >         tegra_pcie_dw_pme_turnoff(pcie);
> > >         tegra_pcie_unconfig_controller(pcie);
> > > 
> > >         return 0;
> > > }
> > > 
> > > It brings all the downstream components back to D0 (so I assume the link was
> > > indeed in L0) before sending PME, aiming to enter L2.
> > 
> > If the current state is L1.1 or L1.2, the hardware can automatically re-enter
> > D0/L0 when there is any PCI bus activity, including PME. I suppose
> > tegra_pcie_downstream_dev_to_D0() just makes sure to come back from L2/L3,
> > which may have been entered previously via runtime PM, or for some other reason.
> > 
> > The NVMe ASPM problem is (at least when I debugged it on another platform about
> > 1 year ago):
> > 
> >     1. NVMe will not release the MSI interrupt during suspend.
> >     2. The PCI controller enters L2 at suspend_noirq();
> >     3. CPU hotplug tries to bring down the secondary cores (CORE1, CORE2, ...)
> >     4. The GIC tries to disable the MSI irq by writing config space.
> 
> Just for the record, this will only happen during deep sleep (s2ram) where the
> CPUs are powered down (including the boot CPU).
> 
> >     5. Panic here, because config space can't be accessed in L2.
> > 
> > I suppose Tegra should have this problem when ASPM is enabled with NVMe devices.
> > 
> 
> The NVMe suspend issue has several facets:
> 
> If NVMe is powered down during suspend, it will result in considerable power
> savings. But at the same time, the suspend should not happen too frequently as
> it may deteriorate the lifetime of the device. Most of the recent NVMe devices
> have 2M power cycles (only).
> 
> We can work around the above lifetime issue by powering down the device only
> during s2ram. It will work great for Laptop use cases if s2ram is supported.
> Unfortunately, not all Laptops support s2ram though. And if the device is
> powered down during s2idle, it will hit the above lifetime issue when it is
> used in Android platforms such as Tablets (even future mobile phones?) which
> don't support s2ram.
> 
> So I'm thinking of the following options to address the issue(s):
> 
> 1. Modify the NVMe driver to power down the device during s2ram (the driver can
> use the global pm_suspend_target_state flag to detect the suspend state) and use
> the same logic to put the link into L2/L3 state in the PCIe controller drivers.
> For s2idle, maintain both the device and link in low power states.
> 
> 2. Get the power management decision from the userspace and use that to decide
> the power down logic in s2idle for both NVMe and PCIe drivers. This will ensure
> that the NVMe device will be powered down in suitable usecases like Laptop
> without s2ram and kept in low power states for Android platforms. But this is
> quite an involved task and I don't know if it is possible at all.
> 
> I'm just dumping my thoughts here. And I plan to initiate a discussion with NVMe/
> power folks soon. Maybe during Plumbers?

Yes, it is a big piece of work. Anyway, if DWC can use common suspend/resume
functions, they will be easy to update when a better policy is applied in the future.
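
For reference, the platform hookup I have in mind looks roughly like the sketch
below (untested; the ls_pcie_* names and the pcie->pci member are illustrative
and not part of this patch):

static const struct dw_pcie_host_ops ls_pcie_host_ops = {
	.host_init = ls_pcie_host_init,
	/* Platform specific: trigger PME_Turn_Off via the app registers */
	.pme_turn_off = ls_pcie_send_turnoff_msg,
	/* Platform specific: toggle the "exit from L2" app register */
	.exit_from_l2 = ls_pcie_exit_from_l2,
};

static int ls_pcie_suspend_noirq(struct device *dev)
{
	struct ls_pcie *pcie = dev_get_drvdata(dev);

	return dw_pcie_suspend_noirq(pcie->pci);
}

static int ls_pcie_resume_noirq(struct device *dev)
{
	struct ls_pcie *pcie = dev_get_drvdata(dev);

	return dw_pcie_resume_noirq(pcie->pci);
}

static const struct dev_pm_ops ls_pcie_pm_ops = {
	NOIRQ_SYSTEM_SLEEP_PM_OPS(ls_pcie_suspend_noirq, ls_pcie_resume_noirq)
};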

Frank Li

> 
> - Mani
> 
> > Frank
> > > 
> > > > - Mani
> > > > 
> > > > > This API helps remove duplicated code, and it can be improved gradually.
> > > > > 
> > > > > 
> > > > > > 
> > > > > > - Mani
> > > > > > 
> > > > > > 
> > > > > > -- 
> > > > > > மணிவண்ணன் சதாசிவம்
> > > > 
> 
> -- 
> மணிவண்ணன் சதாசிவம்
diff mbox series

Patch

diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c
index 9952057c8819..ef6869488bde 100644
--- a/drivers/pci/controller/dwc/pcie-designware-host.c
+++ b/drivers/pci/controller/dwc/pcie-designware-host.c
@@ -8,6 +8,7 @@ 
  * Author: Jingoo Han <jg1.han@samsung.com>
  */
 
+#include <linux/iopoll.h>
 #include <linux/irqchip/chained_irq.h>
 #include <linux/irqdomain.h>
 #include <linux/msi.h>
@@ -807,3 +808,82 @@  int dw_pcie_setup_rc(struct dw_pcie_rp *pp)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(dw_pcie_setup_rc);
+
+/*
+ * These are for configuring host controllers, which are bridges *to* PCI devices
+ * but are not PCI devices themselves.
+ */
+static void dw_pcie_set_dstate(struct dw_pcie *pci, u32 dstate)
+{
+	u8 offset = dw_pcie_find_capability(pci, PCI_CAP_ID_PM);
+	u32 val;
+
+	val = dw_pcie_readw_dbi(pci, offset + PCI_PM_CTRL);
+	val &= ~PCI_PM_CTRL_STATE_MASK;
+	val |= dstate;
+	dw_pcie_writew_dbi(pci, offset + PCI_PM_CTRL, val);
+}
+
+int dw_pcie_suspend_noirq(struct dw_pcie *pci)
+{
+	u32 val;
+	int ret;
+
+	if (dw_pcie_get_ltssm(pci) <= DW_PCIE_LTSSM_DETECT_ACT)
+		return 0;
+
+	pci->pp.ops->pme_turn_off(&pci->pp);
+
+	/*
+	 * PCI Express Base Specification Rev 4.0
+	 * 5.3.3.2.1 PME Synchronization
+	 * Recommends a 1ms to 10ms timeout to check L2 ready
+	 */
+	ret = read_poll_timeout(dw_pcie_get_ltssm, val, val == DW_PCIE_LTSSM_L2_IDLE,
+				100, 10000, false, pci);
+	if (ret) {
+		dev_err(pci->dev, "PCIe link enter L2 timeout! ltssm = 0x%x\n", val);
+		return ret;
+	}
+
+	dw_pcie_set_dstate(pci, 0x3);
+
+	pci->suspended = true;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dw_pcie_suspend_noirq);
+
+int dw_pcie_resume_noirq(struct dw_pcie *pci)
+{
+	int ret;
+
+	if (!pci->suspended)
+		return 0;
+
+	pci->suspended = false;
+
+	dw_pcie_set_dstate(pci, 0x0);
+
+	pci->pp.ops->exit_from_l2(&pci->pp);
+
+	/* Wait 10 ms before accessing the EP */
+	mdelay(10);
+
+	ret = pci->pp.ops->host_init(&pci->pp);
+	if (ret) {
+		dev_err(pci->dev, "ls_pcie_host_init failed! ret = 0x%x\n", ret);
+		return ret;
+	}
+
+	dw_pcie_setup_rc(&pci->pp);
+
+	ret = dw_pcie_wait_for_link(pci);
+	if (ret) {
+		dev_err(pci->dev, "wait link up timeout! ret = 0x%x\n", ret);
+		return ret;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dw_pcie_resume_noirq);
diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h
index 79713ce075cc..effb07a506e4 100644
--- a/drivers/pci/controller/dwc/pcie-designware.h
+++ b/drivers/pci/controller/dwc/pcie-designware.h
@@ -288,10 +288,21 @@  enum dw_pcie_core_rst {
 	DW_PCIE_NUM_CORE_RSTS
 };
 
+enum dw_pcie_ltssm {
+	DW_PCIE_LTSSM_UNKNOWN = 0xFFFFFFFF,
+	/* Must align with PCIE_PORT_DEBUG0 bits [5:0] */
+	DW_PCIE_LTSSM_DETECT_QUIET = 0x0,
+	DW_PCIE_LTSSM_DETECT_ACT = 0x1,
+	DW_PCIE_LTSSM_L0 = 0x11,
+	DW_PCIE_LTSSM_L2_IDLE = 0x15,
+};
+
 struct dw_pcie_host_ops {
 	int (*host_init)(struct dw_pcie_rp *pp);
 	void (*host_deinit)(struct dw_pcie_rp *pp);
 	int (*msi_host_init)(struct dw_pcie_rp *pp);
+	void (*pme_turn_off)(struct dw_pcie_rp *pp);
+	void (*exit_from_l2)(struct dw_pcie_rp *pp);
 };
 
 struct dw_pcie_rp {
@@ -364,6 +375,7 @@  struct dw_pcie_ops {
 	void    (*write_dbi2)(struct dw_pcie *pcie, void __iomem *base, u32 reg,
 			      size_t size, u32 val);
 	int	(*link_up)(struct dw_pcie *pcie);
+	enum dw_pcie_ltssm (*get_ltssm)(struct dw_pcie *pcie);
 	int	(*start_link)(struct dw_pcie *pcie);
 	void	(*stop_link)(struct dw_pcie *pcie);
 };
@@ -393,6 +405,7 @@  struct dw_pcie {
 	struct reset_control_bulk_data	app_rsts[DW_PCIE_NUM_APP_RSTS];
 	struct reset_control_bulk_data	core_rsts[DW_PCIE_NUM_CORE_RSTS];
 	struct gpio_desc		*pe_rst;
+	bool			suspended;
 };
 
 #define to_dw_pcie_from_pp(port) container_of((port), struct dw_pcie, pp)
@@ -430,6 +443,9 @@  void dw_pcie_iatu_detect(struct dw_pcie *pci);
 int dw_pcie_edma_detect(struct dw_pcie *pci);
 void dw_pcie_edma_remove(struct dw_pcie *pci);
 
+int dw_pcie_suspend_noirq(struct dw_pcie *pci);
+int dw_pcie_resume_noirq(struct dw_pcie *pci);
+
 static inline void dw_pcie_writel_dbi(struct dw_pcie *pci, u32 reg, u32 val)
 {
 	dw_pcie_write_dbi(pci, reg, 0x4, val);
@@ -501,6 +517,18 @@  static inline void dw_pcie_stop_link(struct dw_pcie *pci)
 		pci->ops->stop_link(pci);
 }
 
+static inline enum dw_pcie_ltssm dw_pcie_get_ltssm(struct dw_pcie *pci)
+{
+	u32 val;
+
+	if (pci->ops && pci->ops->get_ltssm)
+		return pci->ops->get_ltssm(pci);
+
+	val = dw_pcie_readl_dbi(pci, PCIE_PORT_DEBUG0);
+
+	return (enum dw_pcie_ltssm)FIELD_GET(PORT_LOGIC_LTSSM_STATE_MASK, val);
+}
+
 #ifdef CONFIG_PCIE_DW_HOST
 irqreturn_t dw_handle_msi_irq(struct dw_pcie_rp *pp);
 int dw_pcie_setup_rc(struct dw_pcie_rp *pp);