diff mbox series

PCI: qcom-ep: Move controller cleanups to qcom_pcie_perst_deassert()

Message ID 20240729122245.33410-1-manivannan.sadhasivam@linaro.org (mailing list archive)
State Accepted
Delegated to: Krzysztof Wilczyński
Headers show
Series PCI: qcom-ep: Move controller cleanups to qcom_pcie_perst_deassert() | expand

Commit Message

Manivannan Sadhasivam July 29, 2024, 12:22 p.m. UTC
Currently, the endpoint cleanup function dw_pcie_ep_cleanup() and EPF
deinit notify function pci_epc_deinit_notify() are called during the
execution of qcom_pcie_perst_assert() i.e., when the host has asserted
PERST#. But quickly after this step, refclk will also be disabled by the
host.

All of the Qcom endpoint SoCs supported as of now depend on the refclk from
the host for keeping the controller operational. Due to this limitation,
any access to the hardware registers in the absence of refclk will result
in a whole endpoint crash. Unfortunately, most of the controller cleanups
require accessing the hardware registers (like eDMA cleanup performed in
dw_pcie_ep_cleanup(), powering down MHI EPF etc...). So these cleanup
functions are currently causing the crash in the endpoint SoC once host
asserts PERST#.

One way to address this issue is by generating the refclk in the endpoint
itself and not depending on the host. But that is not always possible as
some of the endpoint designs do require the endpoint to consume refclk from
the host (as I was told by the Qcom engineers).

So let's fix this crash by moving the controller cleanups to the start of
the qcom_pcie_perst_deassert() function. qcom_pcie_perst_deassert() is
called whenever the host has deasserted PERST# and it is guaranteed that
the refclk would be active at this point. So at the start of this function,
the controller cleanup can be performed. Once finished, rest of the code
execution for PERST# deassert can continue as usual.

Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 drivers/pci/controller/dwc/pcie-qcom-ep.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

Comments

Krishna Chaitanya Chundru July 29, 2024, 12:28 p.m. UTC | #1
On 7/29/2024 5:52 PM, Manivannan Sadhasivam wrote:
> Currently, the endpoint cleanup function dw_pcie_ep_cleanup() and EPF
> deinit notify function pci_epc_deinit_notify() are called during the
> execution of qcom_pcie_perst_assert() i.e., when the host has asserted
> PERST#. But quickly after this step, refclk will also be disabled by the
> host.
> 
> All of the Qcom endpoint SoCs supported as of now depend on the refclk from
> the host for keeping the controller operational. Due to this limitation,
> any access to the hardware registers in the absence of refclk will result
> in a whole endpoint crash. Unfortunately, most of the controller cleanups
> require accessing the hardware registers (like eDMA cleanup performed in
> dw_pcie_ep_cleanup(), powering down MHI EPF etc...). So these cleanup
> functions are currently causing the crash in the endpoint SoC once host
> asserts PERST#.
> 
> One way to address this issue is by generating the refclk in the endpoint
> itself and not depending on the host. But that is not always possible as
> some of the endpoint designs do require the endpoint to consume refclk from
> the host (as I was told by the Qcom engineers).
> 
> So let's fix this crash by moving the controller cleanups to the start of
> the qcom_pcie_perst_deassert() function. qcom_pcie_perst_deassert() is
> called whenever the host has deasserted PERST# and it is guaranteed that
> the refclk would be active at this point. So at the start of this function,
> the controller cleanup can be performed. Once finished, rest of the code
> execution for PERST# deassert can continue as usual.
> 
How about doing the cleanup as part of pme turnoff message.
As host waits for L23 ready from the device side. we can use that time
to cleanup the host before sending L23 ready.

- Krishna Chaitanya.
> Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
> ---
>   drivers/pci/controller/dwc/pcie-qcom-ep.c | 12 ++++++++++--
>   1 file changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c
> index 2319ff2ae9f6..e024b4dcd76d 100644
> --- a/drivers/pci/controller/dwc/pcie-qcom-ep.c
> +++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c
> @@ -186,6 +186,8 @@ struct qcom_pcie_ep_cfg {
>    * @link_status: PCIe Link status
>    * @global_irq: Qualcomm PCIe specific Global IRQ
>    * @perst_irq: PERST# IRQ
> + * @cleanup_pending: Cleanup is pending for the controller (because refclk is
> + *                   needed for cleanup)
>    */
>   struct qcom_pcie_ep {
>   	struct dw_pcie pci;
> @@ -214,6 +216,7 @@ struct qcom_pcie_ep {
>   	enum qcom_pcie_ep_link_status link_status;
>   	int global_irq;
>   	int perst_irq;
> +	bool cleanup_pending;
>   };
>   
>   static int qcom_pcie_ep_core_reset(struct qcom_pcie_ep *pcie_ep)
> @@ -389,6 +392,12 @@ static int qcom_pcie_perst_deassert(struct dw_pcie *pci)
>   		return ret;
>   	}
>   
> +	if (pcie_ep->cleanup_pending) {
> +		pci_epc_deinit_notify(pci->ep.epc);
> +		dw_pcie_ep_cleanup(&pci->ep);
> +		pcie_ep->cleanup_pending = false;
> +	}
> +
>   	/* Assert WAKE# to RC to indicate device is ready */
>   	gpiod_set_value_cansleep(pcie_ep->wake, 1);
>   	usleep_range(WAKE_DELAY_US, WAKE_DELAY_US + 500);
> @@ -522,10 +531,9 @@ static void qcom_pcie_perst_assert(struct dw_pcie *pci)
>   {
>   	struct qcom_pcie_ep *pcie_ep = to_pcie_ep(pci);
>   
> -	pci_epc_deinit_notify(pci->ep.epc);
> -	dw_pcie_ep_cleanup(&pci->ep);
>   	qcom_pcie_disable_resources(pcie_ep);
>   	pcie_ep->link_status = QCOM_PCIE_EP_LINK_DISABLED;
> +	pcie_ep->cleanup_pending = true;
>   }
>   
>   /* Common DWC controller ops */
Manivannan Sadhasivam July 29, 2024, 1:55 p.m. UTC | #2
On Mon, Jul 29, 2024 at 05:58:31PM +0530, Krishna Chaitanya Chundru wrote:
> 
> 
> On 7/29/2024 5:52 PM, Manivannan Sadhasivam wrote:
> > Currently, the endpoint cleanup function dw_pcie_ep_cleanup() and EPF
> > deinit notify function pci_epc_deinit_notify() are called during the
> > execution of qcom_pcie_perst_assert() i.e., when the host has asserted
> > PERST#. But quickly after this step, refclk will also be disabled by the
> > host.
> > 
> > All of the Qcom endpoint SoCs supported as of now depend on the refclk from
> > the host for keeping the controller operational. Due to this limitation,
> > any access to the hardware registers in the absence of refclk will result
> > in a whole endpoint crash. Unfortunately, most of the controller cleanups
> > require accessing the hardware registers (like eDMA cleanup performed in
> > dw_pcie_ep_cleanup(), powering down MHI EPF etc...). So these cleanup
> > functions are currently causing the crash in the endpoint SoC once host
> > asserts PERST#.
> > 
> > One way to address this issue is by generating the refclk in the endpoint
> > itself and not depending on the host. But that is not always possible as
> > some of the endpoint designs do require the endpoint to consume refclk from
> > the host (as I was told by the Qcom engineers).
> > 
> > So let's fix this crash by moving the controller cleanups to the start of
> > the qcom_pcie_perst_deassert() function. qcom_pcie_perst_deassert() is
> > called whenever the host has deasserted PERST# and it is guaranteed that
> > the refclk would be active at this point. So at the start of this function,
> > the controller cleanup can be performed. Once finished, rest of the code
> > execution for PERST# deassert can continue as usual.
> > 
> How about doing the cleanup as part of pme turnoff message.
> As host waits for L23 ready from the device side. we can use that time
> to cleanup the host before sending L23 ready.
> 

Yes, but that's only applicable if the host properly powers down the device. But
it won't work in the case of host crash or host abrupt poweroff.

- Mani

> - Krishna Chaitanya.
> > Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
> > ---
> >   drivers/pci/controller/dwc/pcie-qcom-ep.c | 12 ++++++++++--
> >   1 file changed, 10 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c
> > index 2319ff2ae9f6..e024b4dcd76d 100644
> > --- a/drivers/pci/controller/dwc/pcie-qcom-ep.c
> > +++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c
> > @@ -186,6 +186,8 @@ struct qcom_pcie_ep_cfg {
> >    * @link_status: PCIe Link status
> >    * @global_irq: Qualcomm PCIe specific Global IRQ
> >    * @perst_irq: PERST# IRQ
> > + * @cleanup_pending: Cleanup is pending for the controller (because refclk is
> > + *                   needed for cleanup)
> >    */
> >   struct qcom_pcie_ep {
> >   	struct dw_pcie pci;
> > @@ -214,6 +216,7 @@ struct qcom_pcie_ep {
> >   	enum qcom_pcie_ep_link_status link_status;
> >   	int global_irq;
> >   	int perst_irq;
> > +	bool cleanup_pending;
> >   };
> >   static int qcom_pcie_ep_core_reset(struct qcom_pcie_ep *pcie_ep)
> > @@ -389,6 +392,12 @@ static int qcom_pcie_perst_deassert(struct dw_pcie *pci)
> >   		return ret;
> >   	}
> > +	if (pcie_ep->cleanup_pending) {
> > +		pci_epc_deinit_notify(pci->ep.epc);
> > +		dw_pcie_ep_cleanup(&pci->ep);
> > +		pcie_ep->cleanup_pending = false;
> > +	}
> > +
> >   	/* Assert WAKE# to RC to indicate device is ready */
> >   	gpiod_set_value_cansleep(pcie_ep->wake, 1);
> >   	usleep_range(WAKE_DELAY_US, WAKE_DELAY_US + 500);
> > @@ -522,10 +531,9 @@ static void qcom_pcie_perst_assert(struct dw_pcie *pci)
> >   {
> >   	struct qcom_pcie_ep *pcie_ep = to_pcie_ep(pci);
> > -	pci_epc_deinit_notify(pci->ep.epc);
> > -	dw_pcie_ep_cleanup(&pci->ep);
> >   	qcom_pcie_disable_resources(pcie_ep);
> >   	pcie_ep->link_status = QCOM_PCIE_EP_LINK_DISABLED;
> > +	pcie_ep->cleanup_pending = true;
> >   }
> >   /* Common DWC controller ops */
Krishna Chaitanya Chundru July 31, 2024, 4:31 a.m. UTC | #3
On 7/29/2024 7:25 PM, Manivannan Sadhasivam wrote:
> On Mon, Jul 29, 2024 at 05:58:31PM +0530, Krishna Chaitanya Chundru wrote:
>>
>>
>> On 7/29/2024 5:52 PM, Manivannan Sadhasivam wrote:
>>> Currently, the endpoint cleanup function dw_pcie_ep_cleanup() and EPF
>>> deinit notify function pci_epc_deinit_notify() are called during the
>>> execution of qcom_pcie_perst_assert() i.e., when the host has asserted
>>> PERST#. But quickly after this step, refclk will also be disabled by the
>>> host.
>>>
>>> All of the Qcom endpoint SoCs supported as of now depend on the refclk from
>>> the host for keeping the controller operational. Due to this limitation,
>>> any access to the hardware registers in the absence of refclk will result
>>> in a whole endpoint crash. Unfortunately, most of the controller cleanups
>>> require accessing the hardware registers (like eDMA cleanup performed in
>>> dw_pcie_ep_cleanup(), powering down MHI EPF etc...). So these cleanup
>>> functions are currently causing the crash in the endpoint SoC once host
>>> asserts PERST#.
>>>
>>> One way to address this issue is by generating the refclk in the endpoint
>>> itself and not depending on the host. But that is not always possible as
>>> some of the endpoint designs do require the endpoint to consume refclk from
>>> the host (as I was told by the Qcom engineers).
>>>
>>> So let's fix this crash by moving the controller cleanups to the start of
>>> the qcom_pcie_perst_deassert() function. qcom_pcie_perst_deassert() is
>>> called whenever the host has deasserted PERST# and it is guaranteed that
>>> the refclk would be active at this point. So at the start of this function,
>>> the controller cleanup can be performed. Once finished, rest of the code
>>> execution for PERST# deassert can continue as usual.
>>>
>> How about doing the cleanup as part of pme turnoff message.
>> As host waits for L23 ready from the device side. we can use that time
>> to cleanup the host before sending L23 ready.
>>
> 
> Yes, but that's only applicable if the host properly powers down the device. But
> it won't work in the case of host crash or host abrupt poweroff.
> 
> - Mani
> 
Ack.

- Krishna Chaitanya.
>> - Krishna Chaitanya.
>>> Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
>>> ---
>>>    drivers/pci/controller/dwc/pcie-qcom-ep.c | 12 ++++++++++--
>>>    1 file changed, 10 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c
>>> index 2319ff2ae9f6..e024b4dcd76d 100644
>>> --- a/drivers/pci/controller/dwc/pcie-qcom-ep.c
>>> +++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c
>>> @@ -186,6 +186,8 @@ struct qcom_pcie_ep_cfg {
>>>     * @link_status: PCIe Link status
>>>     * @global_irq: Qualcomm PCIe specific Global IRQ
>>>     * @perst_irq: PERST# IRQ
>>> + * @cleanup_pending: Cleanup is pending for the controller (because refclk is
>>> + *                   needed for cleanup)
>>>     */
>>>    struct qcom_pcie_ep {
>>>    	struct dw_pcie pci;
>>> @@ -214,6 +216,7 @@ struct qcom_pcie_ep {
>>>    	enum qcom_pcie_ep_link_status link_status;
>>>    	int global_irq;
>>>    	int perst_irq;
>>> +	bool cleanup_pending;
>>>    };
>>>    static int qcom_pcie_ep_core_reset(struct qcom_pcie_ep *pcie_ep)
>>> @@ -389,6 +392,12 @@ static int qcom_pcie_perst_deassert(struct dw_pcie *pci)
>>>    		return ret;
>>>    	}
>>> +	if (pcie_ep->cleanup_pending) {
>>> +		pci_epc_deinit_notify(pci->ep.epc);
>>> +		dw_pcie_ep_cleanup(&pci->ep);
>>> +		pcie_ep->cleanup_pending = false;
>>> +	}
>>> +
>>>    	/* Assert WAKE# to RC to indicate device is ready */
>>>    	gpiod_set_value_cansleep(pcie_ep->wake, 1);
>>>    	usleep_range(WAKE_DELAY_US, WAKE_DELAY_US + 500);
>>> @@ -522,10 +531,9 @@ static void qcom_pcie_perst_assert(struct dw_pcie *pci)
>>>    {
>>>    	struct qcom_pcie_ep *pcie_ep = to_pcie_ep(pci);
>>> -	pci_epc_deinit_notify(pci->ep.epc);
>>> -	dw_pcie_ep_cleanup(&pci->ep);
>>>    	qcom_pcie_disable_resources(pcie_ep);
>>>    	pcie_ep->link_status = QCOM_PCIE_EP_LINK_DISABLED;
>>> +	pcie_ep->cleanup_pending = true;
>>>    }
>>>    /* Common DWC controller ops */
>
Krzysztof Wilczyński Aug. 13, 2024, 8:28 p.m. UTC | #4
Hello,

> Currently, the endpoint cleanup function dw_pcie_ep_cleanup() and EPF
> deinit notify function pci_epc_deinit_notify() are called during the
> execution of qcom_pcie_perst_assert() i.e., when the host has asserted
> PERST#. But quickly after this step, refclk will also be disabled by the
> host.
> 
> All of the Qcom endpoint SoCs supported as of now depend on the refclk from
> the host for keeping the controller operational. Due to this limitation,
> any access to the hardware registers in the absence of refclk will result
> in a whole endpoint crash. Unfortunately, most of the controller cleanups
> require accessing the hardware registers (like eDMA cleanup performed in
> dw_pcie_ep_cleanup(), powering down MHI EPF etc...). So these cleanup
> functions are currently causing the crash in the endpoint SoC once host
> asserts PERST#.
> 
> One way to address this issue is by generating the refclk in the endpoint
> itself and not depending on the host. But that is not always possible as
> some of the endpoint designs do require the endpoint to consume refclk from
> the host (as I was told by the Qcom engineers).
> 
> So let's fix this crash by moving the controller cleanups to the start of
> the qcom_pcie_perst_deassert() function. qcom_pcie_perst_deassert() is
> called whenever the host has deasserted PERST# and it is guaranteed that
> the refclk would be active at this point. So at the start of this function,
> the controller cleanup can be performed. Once finished, rest of the code
> execution for PERST# deassert can continue as usual.

Applied to controller/qcom, thank you!

[1/1] PCI: qcom-ep: Move controller cleanups to qcom_pcie_perst_deassert()
      https://git.kernel.org/pci/pci/c/6960cdc1ef97

	Krzysztof
Bjorn Helgaas Aug. 15, 2024, 10:47 p.m. UTC | #5
[+cc Vidya, Jon since tegra194 does similar things]

On Mon, Jul 29, 2024 at 05:52:45PM +0530, Manivannan Sadhasivam wrote:
> Currently, the endpoint cleanup function dw_pcie_ep_cleanup() and EPF
> deinit notify function pci_epc_deinit_notify() are called during the
> execution of qcom_pcie_perst_assert() i.e., when the host has asserted
> PERST#. But quickly after this step, refclk will also be disabled by the
> host.
> 
> All of the Qcom endpoint SoCs supported as of now depend on the refclk from
> the host for keeping the controller operational. Due to this limitation,
> any access to the hardware registers in the absence of refclk will result
> in a whole endpoint crash. Unfortunately, most of the controller cleanups
> require accessing the hardware registers (like eDMA cleanup performed in
> dw_pcie_ep_cleanup(), powering down MHI EPF etc...). So these cleanup
> functions are currently causing the crash in the endpoint SoC once host
> asserts PERST#.
> 
> One way to address this issue is by generating the refclk in the endpoint
> itself and not depending on the host. But that is not always possible as
> some of the endpoint designs do require the endpoint to consume refclk from
> the host (as I was told by the Qcom engineers).
> 
> So let's fix this crash by moving the controller cleanups to the start of
> the qcom_pcie_perst_deassert() function. qcom_pcie_perst_deassert() is
> called whenever the host has deasserted PERST# and it is guaranteed that
> the refclk would be active at this point. So at the start of this function,
> the controller cleanup can be performed. Once finished, rest of the code
> execution for PERST# deassert can continue as usual.

What makes this v6.11 material?  Does it fix a problem we added in
v6.11-rc1?

Is there a Fixes: commit?

This patch essentially does this:

  qcom_pcie_perst_assert
-   pci_epc_deinit_notify
-   dw_pcie_ep_cleanup
    qcom_pcie_disable_resources

  qcom_pcie_perst_deassert
+   if (pcie_ep->cleanup_pending)
+     pci_epc_deinit_notify(pci->ep.epc);
+     dw_pcie_ep_cleanup(&pci->ep);
    dw_pcie_ep_init_registers
    pci_epc_init_notify

Maybe it makes sense to call both pci_epc_deinit_notify() and
pci_epc_init_notify() from the PERST# deassert function, but it makes
me question whether we really need both.

pcie-tegra194.c has a similar structure:

  pex_ep_event_pex_rst_assert
    pci_epc_deinit_notify
    dw_pcie_ep_cleanup

  pex_ep_event_pex_rst_deassert
    dw_pcie_ep_init_registers
    pci_epc_init_notify

Is there a reason to make them different, or could/should a similar
change be made to tegra?

> Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
> ---
>  drivers/pci/controller/dwc/pcie-qcom-ep.c | 12 ++++++++++--
>  1 file changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c
> index 2319ff2ae9f6..e024b4dcd76d 100644
> --- a/drivers/pci/controller/dwc/pcie-qcom-ep.c
> +++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c
> @@ -186,6 +186,8 @@ struct qcom_pcie_ep_cfg {
>   * @link_status: PCIe Link status
>   * @global_irq: Qualcomm PCIe specific Global IRQ
>   * @perst_irq: PERST# IRQ
> + * @cleanup_pending: Cleanup is pending for the controller (because refclk is
> + *                   needed for cleanup)
>   */
>  struct qcom_pcie_ep {
>  	struct dw_pcie pci;
> @@ -214,6 +216,7 @@ struct qcom_pcie_ep {
>  	enum qcom_pcie_ep_link_status link_status;
>  	int global_irq;
>  	int perst_irq;
> +	bool cleanup_pending;
>  };
>  
>  static int qcom_pcie_ep_core_reset(struct qcom_pcie_ep *pcie_ep)
> @@ -389,6 +392,12 @@ static int qcom_pcie_perst_deassert(struct dw_pcie *pci)
>  		return ret;
>  	}
>  
> +	if (pcie_ep->cleanup_pending) {

Do we really need this flag?  I assume the cleanup functions could
tell whether any previous setup was done?

> +		pci_epc_deinit_notify(pci->ep.epc);
> +		dw_pcie_ep_cleanup(&pci->ep);
> +		pcie_ep->cleanup_pending = false;
> +	}
> +
>  	/* Assert WAKE# to RC to indicate device is ready */
>  	gpiod_set_value_cansleep(pcie_ep->wake, 1);
>  	usleep_range(WAKE_DELAY_US, WAKE_DELAY_US + 500);
> @@ -522,10 +531,9 @@ static void qcom_pcie_perst_assert(struct dw_pcie *pci)
>  {
>  	struct qcom_pcie_ep *pcie_ep = to_pcie_ep(pci);
>  
> -	pci_epc_deinit_notify(pci->ep.epc);
> -	dw_pcie_ep_cleanup(&pci->ep);
>  	qcom_pcie_disable_resources(pcie_ep);
>  	pcie_ep->link_status = QCOM_PCIE_EP_LINK_DISABLED;
> +	pcie_ep->cleanup_pending = true;
>  }
>  
>  /* Common DWC controller ops */
> -- 
> 2.25.1
>
Manivannan Sadhasivam Aug. 16, 2024, 5 a.m. UTC | #6
On Thu, Aug 15, 2024 at 05:47:17PM -0500, Bjorn Helgaas wrote:
> [+cc Vidya, Jon since tegra194 does similar things]
> 
> On Mon, Jul 29, 2024 at 05:52:45PM +0530, Manivannan Sadhasivam wrote:
> > Currently, the endpoint cleanup function dw_pcie_ep_cleanup() and EPF
> > deinit notify function pci_epc_deinit_notify() are called during the
> > execution of qcom_pcie_perst_assert() i.e., when the host has asserted
> > PERST#. But quickly after this step, refclk will also be disabled by the
> > host.
> > 
> > All of the Qcom endpoint SoCs supported as of now depend on the refclk from
> > the host for keeping the controller operational. Due to this limitation,
> > any access to the hardware registers in the absence of refclk will result
> > in a whole endpoint crash. Unfortunately, most of the controller cleanups
> > require accessing the hardware registers (like eDMA cleanup performed in
> > dw_pcie_ep_cleanup(), powering down MHI EPF etc...). So these cleanup
> > functions are currently causing the crash in the endpoint SoC once host
> > asserts PERST#.
> > 
> > One way to address this issue is by generating the refclk in the endpoint
> > itself and not depending on the host. But that is not always possible as
> > some of the endpoint designs do require the endpoint to consume refclk from
> > the host (as I was told by the Qcom engineers).
> > 
> > So let's fix this crash by moving the controller cleanups to the start of
> > the qcom_pcie_perst_deassert() function. qcom_pcie_perst_deassert() is
> > called whenever the host has deasserted PERST# and it is guaranteed that
> > the refclk would be active at this point. So at the start of this function,
> > the controller cleanup can be performed. Once finished, rest of the code
> > execution for PERST# deassert can continue as usual.
> 
> What makes this v6.11 material?  Does it fix a problem we added in
> v6.11-rc1?
> 

No, this is not a 6.11 material, but the rest of the patches I shared offline.

> Is there a Fixes: commit?
> 

Hmm, the controller addition commit could be the valid fixes tag.

> This patch essentially does this:
> 
>   qcom_pcie_perst_assert
> -   pci_epc_deinit_notify
> -   dw_pcie_ep_cleanup
>     qcom_pcie_disable_resources
> 
>   qcom_pcie_perst_deassert
> +   if (pcie_ep->cleanup_pending)
> +     pci_epc_deinit_notify(pci->ep.epc);
> +     dw_pcie_ep_cleanup(&pci->ep);
>     dw_pcie_ep_init_registers
>     pci_epc_init_notify
> 
> Maybe it makes sense to call both pci_epc_deinit_notify() and
> pci_epc_init_notify() from the PERST# deassert function, but it makes
> me question whether we really need both.
> 

There is really no need to call pci_epc_deinit_notify() during the first
deassert (i.e., during the ep boot) because there are no cleanups to be done.
It is only needed during a successive PERST# assert + deassert.

> pcie-tegra194.c has a similar structure:
> 
>   pex_ep_event_pex_rst_assert
>     pci_epc_deinit_notify
>     dw_pcie_ep_cleanup
> 
>   pex_ep_event_pex_rst_deassert
>     dw_pcie_ep_init_registers
>     pci_epc_init_notify
> 
> Is there a reason to make them different, or could/should a similar
> change be made to tegra?
> 

Design wise both drivers are similar, so it could apply. I didn't spin a patch
because if testing of tegra driver gets delayed (I've seen this before), then I
do not want to stall merging the whole series. For Qcom it is important to get
this merged asap to avoid the crash.

> > Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
> > ---
> >  drivers/pci/controller/dwc/pcie-qcom-ep.c | 12 ++++++++++--
> >  1 file changed, 10 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c
> > index 2319ff2ae9f6..e024b4dcd76d 100644
> > --- a/drivers/pci/controller/dwc/pcie-qcom-ep.c
> > +++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c
> > @@ -186,6 +186,8 @@ struct qcom_pcie_ep_cfg {
> >   * @link_status: PCIe Link status
> >   * @global_irq: Qualcomm PCIe specific Global IRQ
> >   * @perst_irq: PERST# IRQ
> > + * @cleanup_pending: Cleanup is pending for the controller (because refclk is
> > + *                   needed for cleanup)
> >   */
> >  struct qcom_pcie_ep {
> >  	struct dw_pcie pci;
> > @@ -214,6 +216,7 @@ struct qcom_pcie_ep {
> >  	enum qcom_pcie_ep_link_status link_status;
> >  	int global_irq;
> >  	int perst_irq;
> > +	bool cleanup_pending;
> >  };
> >  
> >  static int qcom_pcie_ep_core_reset(struct qcom_pcie_ep *pcie_ep)
> > @@ -389,6 +392,12 @@ static int qcom_pcie_perst_deassert(struct dw_pcie *pci)
> >  		return ret;
> >  	}
> >  
> > +	if (pcie_ep->cleanup_pending) {
> 
> Do we really need this flag?  I assume the cleanup functions could
> tell whether any previous setup was done?
> 

Not so. Some cleanup functions may trigger a warning if attempted to do it
before 'setup'. I think dw_edma_remove() that is part of dw_pcie_ep_cleanup()
does that IIRC.

- Mani
Bjorn Helgaas Aug. 16, 2024, 7:12 p.m. UTC | #7
On Fri, Aug 16, 2024 at 10:30:29AM +0530, Manivannan Sadhasivam wrote:
> On Thu, Aug 15, 2024 at 05:47:17PM -0500, Bjorn Helgaas wrote:
> > [+cc Vidya, Jon since tegra194 does similar things]
> > 
> > On Mon, Jul 29, 2024 at 05:52:45PM +0530, Manivannan Sadhasivam wrote:
> > > Currently, the endpoint cleanup function dw_pcie_ep_cleanup() and EPF
> > > deinit notify function pci_epc_deinit_notify() are called during the
> > > execution of qcom_pcie_perst_assert() i.e., when the host has asserted
> > > PERST#. But quickly after this step, refclk will also be disabled by the
> > > host.
> > > 
> > > All of the Qcom endpoint SoCs supported as of now depend on the refclk from
> > > the host for keeping the controller operational. Due to this limitation,
> > > any access to the hardware registers in the absence of refclk will result
> > > in a whole endpoint crash. Unfortunately, most of the controller cleanups
> > > require accessing the hardware registers (like eDMA cleanup performed in
> > > dw_pcie_ep_cleanup(), powering down MHI EPF etc...). So these cleanup
> > > functions are currently causing the crash in the endpoint SoC once host
> > > asserts PERST#.
> > > 
> > > One way to address this issue is by generating the refclk in the endpoint
> > > itself and not depending on the host. But that is not always possible as
> > > some of the endpoint designs do require the endpoint to consume refclk from
> > > the host (as I was told by the Qcom engineers).
> > > 
> > > So let's fix this crash by moving the controller cleanups to the start of
> > > the qcom_pcie_perst_deassert() function. qcom_pcie_perst_deassert() is
> > > called whenever the host has deasserted PERST# and it is guaranteed that
> > > the refclk would be active at this point. So at the start of this function,
> > > the controller cleanup can be performed. Once finished, rest of the code
> > > execution for PERST# deassert can continue as usual.
> > 
> > What makes this v6.11 material?  Does it fix a problem we added in
> > v6.11-rc1?
> 
> No, this is not a 6.11 material, but the rest of the patches I
> shared offline.

For reference, the patches you shared offline are:

  PCI: qcom: Use OPP only if the platform supports it
  PCI: qcom-ep: Do not enable resources during probe()
  PCI: qcom-ep: Disable MHI RAM data parity error interrupt for SA8775P SoC
  PCI: qcom-ep: Move controller cleanups to qcom_pcie_perst_deassert()

> > Is there a Fixes: commit?
> 
> Hmm, the controller addition commit could be the valid fixes tag.
> 
> > This patch essentially does this:
> > 
> >   qcom_pcie_perst_assert
> > -   pci_epc_deinit_notify
> > -   dw_pcie_ep_cleanup
> >     qcom_pcie_disable_resources
> > 
> >   qcom_pcie_perst_deassert
> > +   if (pcie_ep->cleanup_pending)
> > +     pci_epc_deinit_notify(pci->ep.epc);
> > +     dw_pcie_ep_cleanup(&pci->ep);
> >     dw_pcie_ep_init_registers
> >     pci_epc_init_notify
> > 
> > Maybe it makes sense to call both pci_epc_deinit_notify() and
> > pci_epc_init_notify() from the PERST# deassert function, but it makes
> > me question whether we really need both.
> 
> There is really no need to call pci_epc_deinit_notify() during the first
> deassert (i.e., during the ep boot) because there are no cleanups to be done.
> It is only needed during a successive PERST# assert + deassert.
> 
> > pcie-tegra194.c has a similar structure:
> > 
> >   pex_ep_event_pex_rst_assert
> >     pci_epc_deinit_notify
> >     dw_pcie_ep_cleanup
> > 
> >   pex_ep_event_pex_rst_deassert
> >     dw_pcie_ep_init_registers
> >     pci_epc_init_notify
> > 
> > Is there a reason to make them different, or could/should a similar
> > change be made to tegra?
> 
> Design wise both drivers are similar, so it could apply. I didn't
> spin a patch because if testing of tegra driver gets delayed (I've
> seen this before), then I do not want to stall merging the whole
> series. 

It can and should be separate patches, one per driver.  But I don't
want to end up with the drivers being needlessly different.

> For Qcom it is important to get this merged asap to avoid
> the crash.

If this is not v6.11 material, there's time to work this out.

> > > +	if (pcie_ep->cleanup_pending) {
> > 
> > Do we really need this flag?  I assume the cleanup functions could
> > tell whether any previous setup was done?
> 
> Not so. Some cleanup functions may trigger a warning if attempted to do it
> before 'setup'. I think dw_edma_remove() that is part of dw_pcie_ep_cleanup()
> does that IIRC.

It looks safe to me:

  dw_pcie_ep_cleanup
    dw_pcie_edma_remove
      dw_edma_remove(chip = &pci->edma)       # struct dw_pcie *pci
        dev = chip->dev
        dw = chip->dw
        if (!dw)
          return -ENODEV

but if not, it could probably be made safe by adding a NULL pointer
check and/or a "chip->dw = NULL" at the right spot.

We hardly have any cleanup functions affected by "cleanup_pending", so
I think we can decide that they should be safe before 'setup' and just
make it so.

Bjorn
Manivannan Sadhasivam Aug. 17, 2024, 2:01 a.m. UTC | #8
On Fri, Aug 16, 2024 at 02:12:22PM -0500, Bjorn Helgaas wrote:
> On Fri, Aug 16, 2024 at 10:30:29AM +0530, Manivannan Sadhasivam wrote:
> > On Thu, Aug 15, 2024 at 05:47:17PM -0500, Bjorn Helgaas wrote:
> > > [+cc Vidya, Jon since tegra194 does similar things]
> > > 
> > > On Mon, Jul 29, 2024 at 05:52:45PM +0530, Manivannan Sadhasivam wrote:
> > > > Currently, the endpoint cleanup function dw_pcie_ep_cleanup() and EPF
> > > > deinit notify function pci_epc_deinit_notify() are called during the
> > > > execution of qcom_pcie_perst_assert() i.e., when the host has asserted
> > > > PERST#. But quickly after this step, refclk will also be disabled by the
> > > > host.
> > > > 
> > > > All of the Qcom endpoint SoCs supported as of now depend on the refclk from
> > > > the host for keeping the controller operational. Due to this limitation,
> > > > any access to the hardware registers in the absence of refclk will result
> > > > in a whole endpoint crash. Unfortunately, most of the controller cleanups
> > > > require accessing the hardware registers (like eDMA cleanup performed in
> > > > dw_pcie_ep_cleanup(), powering down MHI EPF etc...). So these cleanup
> > > > functions are currently causing the crash in the endpoint SoC once host
> > > > asserts PERST#.
> > > > 
> > > > One way to address this issue is by generating the refclk in the endpoint
> > > > itself and not depending on the host. But that is not always possible as
> > > > some of the endpoint designs do require the endpoint to consume refclk from
> > > > the host (as I was told by the Qcom engineers).
> > > > 
> > > > So let's fix this crash by moving the controller cleanups to the start of
> > > > the qcom_pcie_perst_deassert() function. qcom_pcie_perst_deassert() is
> > > > called whenever the host has deasserted PERST# and it is guaranteed that
> > > > the refclk would be active at this point. So at the start of this function,
> > > > the controller cleanup can be performed. Once finished, rest of the code
> > > > execution for PERST# deassert can continue as usual.
> > > 
> > > What makes this v6.11 material?  Does it fix a problem we added in
> > > v6.11-rc1?
> > 
> > No, this is not a 6.11 material, but the rest of the patches I
> > shared offline.
> 
> For reference, the patches you shared offline are:
> 
>   PCI: qcom: Use OPP only if the platform supports it
>   PCI: qcom-ep: Do not enable resources during probe()
>   PCI: qcom-ep: Disable MHI RAM data parity error interrupt for SA8775P SoC
>   PCI: qcom-ep: Move controller cleanups to qcom_pcie_perst_deassert()
> 

And then the note...

"last one is not strictly a 6.11 material, but rest are"

Sorry if that confused you. I shouldn't have mentioned this patch anyway.

> > > Is there a Fixes: commit?
> > 
> > Hmm, the controller addition commit could be the valid fixes tag.
> > 
> > > This patch essentially does this:
> > > 
> > >   qcom_pcie_perst_assert
> > > -   pci_epc_deinit_notify
> > > -   dw_pcie_ep_cleanup
> > >     qcom_pcie_disable_resources
> > > 
> > >   qcom_pcie_perst_deassert
> > > +   if (pcie_ep->cleanup_pending)
> > > +     pci_epc_deinit_notify(pci->ep.epc);
> > > +     dw_pcie_ep_cleanup(&pci->ep);
> > >     dw_pcie_ep_init_registers
> > >     pci_epc_init_notify
> > > 
> > > Maybe it makes sense to call both pci_epc_deinit_notify() and
> > > pci_epc_init_notify() from the PERST# deassert function, but it makes
> > > me question whether we really need both.
> > 
> > There is really no need to call pci_epc_deinit_notify() during the first
> > deassert (i.e., during the ep boot) because there are no cleanups to be done.
> > It is only needed during a successive PERST# assert + deassert.
> > 
> > > pcie-tegra194.c has a similar structure:
> > > 
> > >   pex_ep_event_pex_rst_assert
> > >     pci_epc_deinit_notify
> > >     dw_pcie_ep_cleanup
> > > 
> > >   pex_ep_event_pex_rst_deassert
> > >     dw_pcie_ep_init_registers
> > >     pci_epc_init_notify
> > > 
> > > Is there a reason to make them different, or could/should a similar
> > > change be made to tegra?
> > 
> > Design wise both drivers are similar, so it could apply. I didn't
> > spin a patch because if testing of tegra driver gets delayed (I've
> > seen this before), then I do not want to stall merging the whole
> > series. 
> 
> It can and should be separate patches, one per driver.  But I don't
> want to end up with the drivers being needlessly different.
> 

Ok. Let me spin a patch for that driver also.

> > For Qcom it is important to get this merged asap to avoid
> > the crash.
> 
> If this is not v6.11 material, there's time to work this out.
> 
> > > > +	if (pcie_ep->cleanup_pending) {
> > > 
> > > Do we really need this flag?  I assume the cleanup functions could
> > > tell whether any previous setup was done?
> > 
> > Not so. Some cleanup functions may trigger a warning if attempted to do it
> > before 'setup'. I think dw_edma_remove() that is part of dw_pcie_ep_cleanup()
> > does that IIRC.
> 
> It looks safe to me:
> 
>   dw_pcie_ep_cleanup
>     dw_pcie_edma_remove
>       dw_edma_remove(chip = &pci->edma)       # struct dw_pcie *pci
>         dev = chip->dev
>         dw = chip->dw
>         if (!dw)
>           return -ENODEV
> 
> but if not, it could probably be made safe by adding a NULL pointer
> check and/or a "chip->dw = NULL" at the right spot.
> 
> We hardly have any cleanup functions affected by "cleanup_pending", so
> I think we can decide that they should be safe before 'setup' and just
> make it so.
> 

I just tested by removing the cleanup flag and it doesn't seem to scream. Maybe
the issue I saw previously was unrelated.

- Mani
Bjorn Helgaas Aug. 21, 2024, 9:43 p.m. UTC | #9
On Wed, Aug 14, 2024 at 05:28:37AM +0900, Krzysztof Wilczyński wrote:
> > Currently, the endpoint cleanup function dw_pcie_ep_cleanup() and EPF
> > deinit notify function pci_epc_deinit_notify() are called during the
> > execution of qcom_pcie_perst_assert() i.e., when the host has asserted
> > PERST#. But quickly after this step, refclk will also be disabled by the
> > host.
> > 
> > All of the Qcom endpoint SoCs supported as of now depend on the refclk from
> > the host for keeping the controller operational. Due to this limitation,
> > any access to the hardware registers in the absence of refclk will result
> > in a whole endpoint crash. Unfortunately, most of the controller cleanups
> > require accessing the hardware registers (like eDMA cleanup performed in
> > dw_pcie_ep_cleanup(), powering down MHI EPF etc...). So these cleanup
> > functions are currently causing the crash in the endpoint SoC once host
> > asserts PERST#.
> > 
> > One way to address this issue is by generating the refclk in the endpoint
> > itself and not depending on the host. But that is not always possible as
> > some of the endpoint designs do require the endpoint to consume refclk from
> > the host (as I was told by the Qcom engineers).
> > 
> > So let's fix this crash by moving the controller cleanups to the start of
> > the qcom_pcie_perst_deassert() function. qcom_pcie_perst_deassert() is
> > called whenever the host has deasserted PERST# and it is guaranteed that
> > the refclk would be active at this point. So at the start of this function,
> > the controller cleanup can be performed. Once finished, rest of the code
> > execution for PERST# deassert can continue as usual.
> 
> Applied to controller/qcom, thank you!
> 
> [1/1] PCI: qcom-ep: Move controller cleanups to qcom_pcie_perst_deassert()
>       https://git.kernel.org/pci/pci/c/6960cdc1ef97

I dropped this for now, looking for a new simpler version without
"cleanup_pending" and a similar change for tegra194 (separate patch).

I think it's still an open question whether both
pci_epc_deinit_notify() and pci_epc_init_notify() are needed, but that
should be separate and I don't think that would fix a crash.

You said this was not strictly v6.11 material, but it does fix a
crash, and it only touches the endpoint driver, so ... it seems like a
possible candidate, especially if we can identify a recent commit that
caused the crash.

Bjorn
Krzysztof Wilczyński Sept. 1, 2024, 4:35 p.m. UTC | #10
Hello,

[...]
> Applied to controller/qcom, thank you!

Based on the conversation here, I removed this patch from the branch.

	Krzysztof
diff mbox series

Patch

diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c
index 2319ff2ae9f6..e024b4dcd76d 100644
--- a/drivers/pci/controller/dwc/pcie-qcom-ep.c
+++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c
@@ -186,6 +186,8 @@  struct qcom_pcie_ep_cfg {
  * @link_status: PCIe Link status
  * @global_irq: Qualcomm PCIe specific Global IRQ
  * @perst_irq: PERST# IRQ
+ * @cleanup_pending: Cleanup is pending for the controller (because refclk is
+ *                   needed for cleanup)
  */
 struct qcom_pcie_ep {
 	struct dw_pcie pci;
@@ -214,6 +216,7 @@  struct qcom_pcie_ep {
 	enum qcom_pcie_ep_link_status link_status;
 	int global_irq;
 	int perst_irq;
+	bool cleanup_pending;
 };
 
 static int qcom_pcie_ep_core_reset(struct qcom_pcie_ep *pcie_ep)
@@ -389,6 +392,12 @@  static int qcom_pcie_perst_deassert(struct dw_pcie *pci)
 		return ret;
 	}
 
+	if (pcie_ep->cleanup_pending) {
+		pci_epc_deinit_notify(pci->ep.epc);
+		dw_pcie_ep_cleanup(&pci->ep);
+		pcie_ep->cleanup_pending = false;
+	}
+
 	/* Assert WAKE# to RC to indicate device is ready */
 	gpiod_set_value_cansleep(pcie_ep->wake, 1);
 	usleep_range(WAKE_DELAY_US, WAKE_DELAY_US + 500);
@@ -522,10 +531,9 @@  static void qcom_pcie_perst_assert(struct dw_pcie *pci)
 {
 	struct qcom_pcie_ep *pcie_ep = to_pcie_ep(pci);
 
-	pci_epc_deinit_notify(pci->ep.epc);
-	dw_pcie_ep_cleanup(&pci->ep);
 	qcom_pcie_disable_resources(pcie_ep);
 	pcie_ep->link_status = QCOM_PCIE_EP_LINK_DISABLED;
+	pcie_ep->cleanup_pending = true;
 }
 
 /* Common DWC controller ops */