diff mbox series

[RFC] usb: typec: ucsi: ack connector change after all tasks finish

Message ID vuh25ueep3rwcmthlkvhb2avpkqzc6lsbee3qdmerolijq7azq@rwmakgznqvmq (mailing list archive)
State New, archived
Headers show
Series [RFC] usb: typec: ucsi: ack connector change after all tasks finish | expand

Commit Message

Diogo Ivo March 27, 2024, 12:39 p.m. UTC
The UCSI specification mentions that when the OPM is notified by the
PPM of an asynchronous event it should first send all the commands it
needs to get the details of the event and only after acknowledge it by
sending ACK_CC_CI with the Connector Change bit set, while the current
code sends this ack immediately after scheduling all the partner_tasks.
Move this ACK_CC_CI command to the end of all partner_tasks.

This fixes a problem with some LG Gram laptops where the PPM sometimes
notifies the OPM with the Connector Change Indicator field set in the
CCI after an ACK_CC_CI command is sent, causing the UCSI stack to check
the connector status indefinitely since the EVENT_PENDING bit is already
cleared. This causes an interrupt storm and an artificial high load on
these platforms.

It would also be interesting to see if we could take this approach and
implement the discussion in [1] regarding sending an ACK_CC_CI command
that acks both the command completion and the connector change.

Signed-off-by: Diogo Ivo <diogo.ivo@tecnico.ulisboa.pt>

[1]: https://lore.kernel.org/all/20240320073927.1641788-1-lk@c--e.de/
---
 drivers/usb/typec/ucsi/ucsi.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

Comments

Dmitry Baryshkov March 27, 2024, 1:06 p.m. UTC | #1
On Wed, 27 Mar 2024 at 14:39, Diogo Ivo <diogo.ivo@tecnico.ulisboa.pt> wrote:
>
> The UCSI specification mentions that when the OPM is notified by the
> PPM of an asynchronous event it should first send all the commands it
> needs to get the details of the event and only after acknowledge it by
> sending ACK_CC_CI with the Connector Change bit set, while the current
> code sends this ack immediately after scheduling all the partner_tasks.
> Move this ACK_CC_CI command to the end of all partner_tasks.
>
> This fixes a problem with some LG Gram laptops where the PPM sometimes
> notifies the OPM with the Connector Change Indicator field set in the
> CCI after an ACK_CC_CI command is sent, causing the UCSI stack to check
> the connector status indefinitely since the EVENT_PENDING bit is already
> cleared. This causes an interrupt storm and an artificial high load on
> these platforms.
>
> It would also be interesting to see if we could take this approach and
> implement the discussion in [1] regarding sending an ACK_CC_CI command
> that acks both the command completion and the connector change.
>
> Signed-off-by: Diogo Ivo <diogo.ivo@tecnico.ulisboa.pt>
>
> [1]: https://lore.kernel.org/all/20240320073927.1641788-1-lk@c--e.de/

We had similar issue, see
https://lore.kernel.org/linux-arm-msm/20240313-qcom-ucsi-fixes-v1-1-74d90cb48a00@linaro.org/


> ---
>  drivers/usb/typec/ucsi/ucsi.c | 27 ++++++++++++++++++++-------
>  1 file changed, 20 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c
> index 0c8f3b3a99d6..b8b39e43aba8 100644
> --- a/drivers/usb/typec/ucsi/ucsi.c
> +++ b/drivers/usb/typec/ucsi/ucsi.c
> @@ -69,6 +69,23 @@ static int ucsi_acknowledge_connector_change(struct ucsi *ucsi)
>         return ucsi->ops->sync_write(ucsi, UCSI_CONTROL, &ctrl, sizeof(ctrl));
>  }
>
> +static void ucsi_handle_ack_connector_change(struct ucsi_connector *con)
> +{
> +       struct ucsi *ucsi = con->ucsi;
> +       int ret;
> +
> +       if (list_empty(&con->partner_tasks)) {
> +               mutex_lock(&ucsi->ppm_lock);
> +               ret = ucsi_acknowledge_connector_change(ucsi);
> +               mutex_unlock(&ucsi->ppm_lock);
> +
> +               if (ret)
> +                       dev_err(ucsi->dev, "%s: ACK failed (%d)", __func__, ret);
> +
> +               clear_bit(EVENT_PENDING, &ucsi->flags);
> +       }
> +}
> +
>  static int ucsi_exec_command(struct ucsi *ucsi, u64 command);
>
>  static int ucsi_read_error(struct ucsi *ucsi)
> @@ -222,6 +239,7 @@ static void ucsi_poll_worker(struct work_struct *work)
>                 list_del(&uwork->node);
>                 mutex_unlock(&con->lock);
>                 kfree(uwork);
> +               ucsi_handle_ack_connector_change(con);
>                 return;
>         }
>
> @@ -232,6 +250,7 @@ static void ucsi_poll_worker(struct work_struct *work)
>         } else {
>                 list_del(&uwork->node);
>                 kfree(uwork);
> +               ucsi_handle_ack_connector_change(con);
>         }
>
>         mutex_unlock(&con->lock);
> @@ -1215,13 +1234,7 @@ static void ucsi_handle_connector_change(struct work_struct *work)
>         if (con->status.change & UCSI_CONSTAT_CAM_CHANGE)
>                 ucsi_partner_task(con, ucsi_check_altmodes, 1, 0);
>
> -       clear_bit(EVENT_PENDING, &con->ucsi->flags);
> -
> -       mutex_lock(&ucsi->ppm_lock);
> -       ret = ucsi_acknowledge_connector_change(ucsi);
> -       mutex_unlock(&ucsi->ppm_lock);
> -       if (ret)
> -               dev_err(ucsi->dev, "%s: ACK failed (%d)", __func__, ret);
> +       ucsi_handle_ack_connector_change(con);
>
>  out_unlock:
>         mutex_unlock(&con->lock);
> --
> 2.44.0
>
Diogo Ivo March 27, 2024, 2:37 p.m. UTC | #2
On Wed, Mar 27, 2024 at 03:06:28PM +0200, Dmitry Baryshkov wrote:
> On Wed, 27 Mar 2024 at 14:39, Diogo Ivo <diogo.ivo@tecnico.ulisboa.pt> wrote:
> >
> > The UCSI specification mentions that when the OPM is notified by the
> > PPM of an asynchronous event it should first send all the commands it
> > needs to get the details of the event and only after acknowledge it by
> > sending ACK_CC_CI with the Connector Change bit set, while the current
> > code sends this ack immediately after scheduling all the partner_tasks.
> > Move this ACK_CC_CI command to the end of all partner_tasks.
> >
> > This fixes a problem with some LG Gram laptops where the PPM sometimes
> > notifies the OPM with the Connector Change Indicator field set in the
> > CCI after an ACK_CC_CI command is sent, causing the UCSI stack to check
> > the connector status indefinitely since the EVENT_PENDING bit is already
> > cleared. This causes an interrupt storm and an artificial high load on
> > these platforms.
> >
> > It would also be interesting to see if we could take this approach and
> > implement the discussion in [1] regarding sending an ACK_CC_CI command
> > that acks both the command completion and the connector change.
> >
> > Signed-off-by: Diogo Ivo <diogo.ivo@tecnico.ulisboa.pt>
> >
> > [1]: https://lore.kernel.org/all/20240320073927.1641788-1-lk@c--e.de/
> 
> We had similar issue, see
> https://lore.kernel.org/linux-arm-msm/20240313-qcom-ucsi-fixes-v1-1-74d90cb48a00@linaro.org/

Hi Dmitry,

I had seen that patch and applied it to see if it fixed the problem but
unfortunately it doesn't. The problem I am seeing is that during the
various partner_tasks the CCI sometimes comes back with the Connector
Change Indicator set after an ACK_CC_CI to ack the commands, for which
the patch you mentioned cannot have an effect.

Besides, there is the question of spec compliance. From my interpretation
of page 14 of the 1.0 spec:

"Once the OPM is notified of either a command completion and/or an
asynchronous event it shall:

1. Read the CCI and optionally the STATUS Data Structures
2. If this was an asynchronous event then it shall send any other commands
it needs to get details on the asynchronous event.
3. Acknowledge the notification via the ACK_CC_CI (Acknowledge Command
Completion and/or Change Indication) command. The only notification that is
not acknowledge by the OPM is the command completion notification for the
ACK_CC_CI or the PPM_RESET command."

we first need to send the commands and only after ACK_CC_CI the connector
change which would be addressed by my proposed change, but please let me
know your thoughts on this/if this makes sense.

Best regards,

Diogo
Christian A. Ehrhardt March 27, 2024, 4:06 p.m. UTC | #3
Hi,

thanks for bringing this to my attention.

On Wed, Mar 27, 2024 at 12:39:04PM +0000, Diogo Ivo wrote:
> The UCSI specification mentions that when the OPM is notified by the
> PPM of an asynchronous event it should first send all the commands it
> needs to get the details of the event and only after acknowledge it by
> sending ACK_CC_CI with the Connector Change bit set, while the current
> code sends this ack immediately after scheduling all the partner_tasks.
> Move this ACK_CC_CI command to the end of all partner_tasks.

I think this is reading too much into the spec. The only thing we
really need to know about the event itself is what we get from
the initial UCSI_GET_CONNECTOR_STATUS command. I was planning to
send a change that acks the connector change directly along with this.

All of the problems that I saw in this area were due to the fact
that the connector change indicator was cleared too late and not
too early.

Moreover, it can take quite some time to handle a connector change on
one connector. This would block any progress on a different connector,
too.

> This fixes a problem with some LG Gram laptops where the PPM sometimes
> notifies the OPM with the Connector Change Indicator field set in the
> CCI after an ACK_CC_CI command is sent,causing the UCSI stack to check
> the connector status indefinitely since the EVENT_PENDING bit is already
> cleared. This causes an interrupt storm and an artificial high load on
> these platforms.

If the PPM does this for a connector change ACK_CC_CI command it is
IMHO violating the spec (unless there is a _new_ event).
When I saw this type of loops the connector change indicator was set
in response to an ACK_CC_CI command for a command (sent by a different
thread for a different connector) between clearing the EVENT_PENDING
bit and acquiring the PPM lock.

Can you test if the changes that already are in usb-linus are
sufficient to fix your issues?

      regards    Christian
Christian A. Ehrhardt March 27, 2024, 4:42 p.m. UTC | #4
Hi,

On Wed, Mar 27, 2024 at 12:39:04PM +0000, Diogo Ivo wrote:
> The UCSI specification mentions that when the OPM is notified by the
> PPM of an asynchronous event it should first send all the commands it
> needs to get the details of the event and only after acknowledge it by
> sending ACK_CC_CI with the Connector Change bit set, while the current
> code sends this ack immediately after scheduling all the partner_tasks.
> Move this ACK_CC_CI command to the end of all partner_tasks.
> 
> This fixes a problem with some LG Gram laptops where the PPM sometimes
> notifies the OPM with the Connector Change Indicator field set in the
> CCI after an ACK_CC_CI command is sent, causing the UCSI stack to check
> the connector status indefinitely since the EVENT_PENDING bit is already
> cleared. This causes an interrupt storm and an artificial high load on
> these platforms.
> 
> It would also be interesting to see if we could take this approach and
> implement the discussion in [1] regarding sending an ACK_CC_CI command
> that acks both the command completion and the connector change.
> 
> Signed-off-by: Diogo Ivo <diogo.ivo@tecnico.ulisboa.pt>
> 
> [1]: https://lore.kernel.org/all/20240320073927.1641788-1-lk@c--e.de/
> ---
>  drivers/usb/typec/ucsi/ucsi.c | 27 ++++++++++++++++++++-------
>  1 file changed, 20 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c
> index 0c8f3b3a99d6..b8b39e43aba8 100644
> --- a/drivers/usb/typec/ucsi/ucsi.c
> +++ b/drivers/usb/typec/ucsi/ucsi.c
> @@ -69,6 +69,23 @@ static int ucsi_acknowledge_connector_change(struct ucsi *ucsi)
>  	return ucsi->ops->sync_write(ucsi, UCSI_CONTROL, &ctrl, sizeof(ctrl));
>  }
>  
> +static void ucsi_handle_ack_connector_change(struct ucsi_connector *con)
> +{
> +	struct ucsi *ucsi = con->ucsi;
> +	int ret;
> +
> +	if (list_empty(&con->partner_tasks)) {
> +		mutex_lock(&ucsi->ppm_lock);
> +		ret = ucsi_acknowledge_connector_change(ucsi);
> +		mutex_unlock(&ucsi->ppm_lock);
> +
> +		if (ret)
> +			dev_err(ucsi->dev, "%s: ACK failed (%d)", __func__, ret);

What if a real async connector change event happens here? It can because
you just cleared the connector change condition. But it will be ignored
because EVENT_PENDING is still set. In practive the new event might even
be reported in the CCI along with the completion of the ACK command
above (without an additional async event).

So the PPM now thinks it reported the event to us and will not send
an async event again. But we will never check for it because we ignored
it and the UCSI will be stuck forever.

What UCSI backend (ACPI, CCG, ...) is this?

> +
> +		clear_bit(EVENT_PENDING, &ucsi->flags);
> +	}
> +}
> +
>  static int ucsi_exec_command(struct ucsi *ucsi, u64 command);

      regards  Christian
Diogo Ivo March 28, 2024, 3:42 p.m. UTC | #5
On Wed, Mar 27, 2024 at 05:06:38PM +0100, Christian Ehrhardt wrote:
> 
> Hi,
> 
> thanks for bringing this to my attention.
> 
> On Wed, Mar 27, 2024 at 12:39:04PM +0000, Diogo Ivo wrote:
> > The UCSI specification mentions that when the OPM is notified by the
> > PPM of an asynchronous event it should first send all the commands it
> > needs to get the details of the event and only after acknowledge it by
> > sending ACK_CC_CI with the Connector Change bit set, while the current
> > code sends this ack immediately after scheduling all the partner_tasks.
> > Move this ACK_CC_CI command to the end of all partner_tasks.
> 
> I think this is reading too much into the spec. The only thing we
> really need to know about the event itself is what we get from
> the initial UCSI_GET_CONNECTOR_STATUS command. I was planning to
> send a change that acks the connector change directly along with this.
> 
> All of the problems that I saw in this area were due to the fact
> that the connector change indicator was cleared too late and not
> too early.
> 
> Moreover, it can take quite some time to handle a connector change on
> one connector. This would block any progress on a different connector,
> too.

Yes, this is true. We could move EVENT_PENDING bit into ucsi_connector so
it wouldn't block progress on other connectors but if the interpretation
of the spec is that we don't need to send the connector change ACK_CC_CI
at the very end then I guess it doesn't make much sense.

> > This fixes a problem with some LG Gram laptops where the PPM sometimes
> > notifies the OPM with the Connector Change Indicator field set in the
> > CCI after an ACK_CC_CI command is sent,causing the UCSI stack to check
> > the connector status indefinitely since the EVENT_PENDING bit is already
> > cleared. This causes an interrupt storm and an artificial high load on
> > these platforms.
> 
> If the PPM does this for a connector change ACK_CC_CI command it is
> IMHO violating the spec (unless there is a _new_ event).

Yes, the problem is exactly that the PPM in these laptops is really not
conformant with the spec and moving the command change ACK_CC_CI to the
end circumvented the problems in the PPM. If [1] is the way to go then
we need some sort of quirk for these devices and I'll have to dig
deeper.

> When I saw this type of loops the connector change indicator was set
> in response to an ACK_CC_CI command for a command (sent by a different
> thread for a different connector) between clearing the EVENT_PENDING
> bit and acquiring the PPM lock.
> 
> Can you test if the changes that already are in usb-linus are
> sufficient to fix your issues?

I am seeing these problems when addressing one connector only, so other
threads for other connectors do not play a role here. I have tested the
latest usb-linus with and without your early ack patch set [1] on top
and the issue is still not fixed.

[1]: https://lore.kernel.org/linux-usb/20240327224554.1772525-1-lk@c--e.de/

Best regards,

Diogo
Diogo Ivo March 28, 2024, 4:04 p.m. UTC | #6
On Wed, Mar 27, 2024 at 05:42:44PM +0100, Christian Ehrhardt wrote:
> 
> Hi,
> 
> On Wed, Mar 27, 2024 at 12:39:04PM +0000, Diogo Ivo wrote:
> > 				...
> > +static void ucsi_handle_ack_connector_change(struct ucsi_connector *con)
> > +{
> > +	struct ucsi *ucsi = con->ucsi;
> > +	int ret;
> > +
> > +	if (list_empty(&con->partner_tasks)) {
> > +		mutex_lock(&ucsi->ppm_lock);
> > +		ret = ucsi_acknowledge_connector_change(ucsi);
> > +		mutex_unlock(&ucsi->ppm_lock);
> > +
> > +		if (ret)
> > +			dev_err(ucsi->dev, "%s: ACK failed (%d)", __func__, ret);
> 
> What if a real async connector change event happens here? It can because
> you just cleared the connector change condition. But it will be ignored
> because EVENT_PENDING is still set. In practive the new event might even
> be reported in the CCI along with the completion of the ACK command
> above (without an additional async event).

This patch was more to see how this idea would be received so I didn't
think too much about problems with timings, this would come after but
yes, you are correct in pointing this out.

> What UCSI backend (ACPI, CCG, ...) is this?

It is ACPI.

Best regards,

Diogo
Christian A. Ehrhardt March 28, 2024, 10:40 p.m. UTC | #7
Hi,

On Thu, Mar 28, 2024 at 03:42:40PM +0000, Diogo Ivo wrote:
> On Wed, Mar 27, 2024 at 05:06:38PM +0100, Christian Ehrhardt wrote:
> > On Wed, Mar 27, 2024 at 12:39:04PM +0000, Diogo Ivo wrote:
> > > This fixes a problem with some LG Gram laptops where the PPM sometimes
> > > notifies the OPM with the Connector Change Indicator field set in the
> > > CCI after an ACK_CC_CI command is sent,causing the UCSI stack to check
> > > the connector status indefinitely since the EVENT_PENDING bit is already
> > > cleared. This causes an interrupt storm and an artificial high load on
> > > these platforms.
> > 
> > If the PPM does this for a connector change ACK_CC_CI command it is
> > IMHO violating the spec (unless there is a _new_ event).
> 
> Yes, the problem is exactly that the PPM in these laptops is really not
> conformant with the spec and moving the command change ACK_CC_CI to the
> end circumvented the problems in the PPM. If [1] is the way to go then
> we need some sort of quirk for these devices and I'll have to dig
> deeper.

Just to make this clear: This is not my call to make.

> > When I saw this type of loops the connector change indicator was set
> > in response to an ACK_CC_CI command for a command (sent by a different
> > thread for a different connector) between clearing the EVENT_PENDING
> > bit and acquiring the PPM lock.
> > 
> > Can you test if the changes that already are in usb-linus are
> > sufficient to fix your issues?
> 
> I am seeing these problems when addressing one connector only, so other
> threads for other connectors do not play a role here. I have tested the
> latest usb-linus with and without your early ack patch set [1] on top
> and the issue is still not fixed.

There are legitimate reaons why the connector change indicator
is set in response to a command:
- If the condition was reported previously it is sticky until
  cleared.
- Something else changed on the connector.

For a more complicated device that I have here, there are five
different connector change events after plugging it in.

I'd like to understand why you run into a loop here.
Printing the completed command (if any) and the CCI in
ucsi_acpi_notify() and the details of the connector status in
ucsi_handle_connector_change() could shed some light on this.


> [1]: https://lore.kernel.org/linux-usb/20240327224554.1772525-1-lk@c--e.de/

Best regards   Christian
Diogo Ivo April 24, 2024, 11:23 a.m. UTC | #8
Hi Christian, sorry for the late reply!

On Thu, Mar 28, 2024 at 11:40:09PM +0100, Christian Ehrhardt wrote:
> 
> On Thu, Mar 28, 2024 at 03:42:40PM +0000, Diogo Ivo wrote:
> > On Wed, Mar 27, 2024 at 05:06:38PM +0100, Christian Ehrhardt wrote:
> > > On Wed, Mar 27, 2024 at 12:39:04PM +0000, Diogo Ivo wrote:
> > > > This fixes a problem with some LG Gram laptops where the PPM sometimes
> > > > notifies the OPM with the Connector Change Indicator field set in the
> > > > CCI after an ACK_CC_CI command is sent,causing the UCSI stack to check
> > > > the connector status indefinitely since the EVENT_PENDING bit is already
> > > > cleared. This causes an interrupt storm and an artificial high load on
> > > > these platforms.
> > > 
> > > If the PPM does this for a connector change ACK_CC_CI command it is
> > > IMHO violating the spec (unless there is a _new_ event).
> > > When I saw this type of loops the connector change indicator was set
> > > in response to an ACK_CC_CI command for a command (sent by a different
> > > thread for a different connector) between clearing the EVENT_PENDING
> > > bit and acquiring the PPM lock.
> 
> There are legitimate reaons why the connector change indicator
> is set in response to a command:
> - If the condition was reported previously it is sticky until
>   cleared.
> - Something else changed on the connector.
> 
> For a more complicated device that I have here, there are five
> different connector change events after plugging it in.
> 
> I'd like to understand why you run into a loop here.
> Printing the completed command (if any) and the CCI in
> ucsi_acpi_notify() and the details of the connector status in
> ucsi_handle_connector_change() could shed some light on this.

You are correct, my initial conclusions were not the cause of the issue.
After digging a bit more with your ACK early patch set [1] applied the initial
connector changes in CCI are meaningful:

/* Connect charge here */

[88248.531077] ucsi_acpi_notify: CCI: 20000002
[88248.531084] scheduling connector change
[88248.531094] COMMAND: 10012
[88248.690705] ucsi_acpi_notify: CCI: 80000902
[88248.705038] ucsi_acpi_notify: CCI: 80000902
[88248.769716] MESSAGE_IN: 0 294024
[88248.769721] COMMAND: 30004
[88248.905117] ucsi_acpi_notify: CCI: 20000002
[88248.905122] scheduling connector change
[88248.905238] COMMAND: 10012
[88248.921032] ucsi_acpi_notify: CCI: 20000002
[88249.052045] ucsi_acpi_notify: CCI: 80000902
[88249.094680] ucsi_acpi_notify: CCI: 80000902
[88249.113033] MESSAGE_IN: 1 42851545402b0a44
[88249.113036] COMMAND: 30004
[88249.234930] ucsi_acpi_notify: CCI: 20000000

as the reply to GET_CONNECTOR_STATUS changes.

Turns out the problematic command seems to be the GET_PDOS for the
source PDOs of the partner. After testing with multiple chargers and
hubs we always have this pattern:

[88249.235144] COMMAND: 700810010
[88249.431015] ucsi_acpi_notify: CCI: 80001000
[88249.444510] ucsi_acpi_notify: CCI: 80001000
[88249.474162] MESSAGE_IN: 641450004b12c 2d12c0801912c
[88249.474164] COMMAND: 20004
[88249.569839] ucsi_acpi_notify: CCI: 20000002
[88249.569843] scheduling connector change
[88249.569878] COMMAND: 604810010
[88249.694744] ucsi_acpi_notify: CCI: 80000002
[88249.756603] ucsi_acpi_notify: CCI: 80000002
[88249.772341] MESSAGE_IN: 0 0
[88249.772343] COMMAND: 20004
[88249.796672] ucsi_acpi_notify: CCI: 80000002
[88249.932743] ucsi_acpi_notify: CCI: 20000000
		...
[88250.229964] COMMAND: 10012
[88250.341815] ucsi_acpi_notify: CCI: 80000900
[88250.385756] ucsi_acpi_notify: CCI: 80000900
[88250.404292] MESSAGE_IN: 1 42851545402b0060

And the next time we check the partner source PDOS we get the same,

[88251.126928] COMMAND: 10012
[88251.316607] ucsi_acpi_notify: CCI: 80000900
[88251.330743] ucsi_acpi_notify: CCI: 80000900
[88251.358123] MESSAGE_IN: 1 42851545402b0000
[88251.358125] COMMAND: 20004
[88251.474957] ucsi_acpi_notify: CCI: 20000000
[88251.475109] COMMAND: 700810010
[88251.636812] ucsi_acpi_notify: CCI: 80001000
[88251.695686] ucsi_acpi_notify: CCI: 80001000
[88251.709026] MESSAGE_IN: 641450004b12c 2d12c0801912c
[88251.709028] COMMAND: 20004
[88251.827071] ucsi_acpi_notify: CCI: 20000002
[88251.827075] scheduling connector change
[88251.827189] COMMAND: 604810010
[88251.974138] ucsi_acpi_notify: CCI: 80000002
[88252.034895] ucsi_acpi_notify: CCI: 80000002
[88252.054572] ucsi_acpi_notify: CCI: 80000002
[88252.067847] MESSAGE_IN: 0 0
[88252.067849] COMMAND: 20004
[88252.197648] ucsi_acpi_notify: CCI: 20000000

which then leads to the infinite loop. I have checked that we always get
the same PDOs and nothing else is changing, leading me to believe that
this change is not meaningful. The only thing that is changes are bits 5
and 6 of the GET_CONNECTOR_STATUS reply that follows the GET_PDOS
command.

Is this a pattern you have seen before, and if so, do you have any
recommendations on how to properly address it? My first idea was to revive
the ACPI_SUPRESS bit from the Dell quirk and set it for this command from a
custom sync_write() callback for these laptops, at the expense of possibly
losing information on actual connector changes that occur.

On a related note, I noticed that the conditions for us to read the
partner PDOs are not based on the "Supported Provider Capabilities Change"
bit of the GET_CONNECTOR_STATUS reply as the spec instructs. Is there a
reason for this?

Lastly, I also noticed that in ucsi_pwr_opmode_change() we call both
ucsi_get_src_pdos() and ucsi_register_partner_pdos(), and both read the
partner source PDOs, which to my limited understanding of the driver
seems like a duplication of both the reading and the storage of the
PDOs. Again, my question here is if this is intended, or if we can
condense this into a single call.

Thank you in advance for your time and insights!

Best regards,
Diogo

[1]: https://lore.kernel.org/linux-usb/20240327224554.1772525-1-lk@c--e.de/
Christian A. Ehrhardt April 25, 2024, 6:05 p.m. UTC | #9
Hi Diogo,

On Wed, Apr 24, 2024 at 12:23:19PM +0100, Diogo Ivo wrote:
> Hi Christian, sorry for the late reply!
> 
> On Thu, Mar 28, 2024 at 11:40:09PM +0100, Christian Ehrhardt wrote:
> > 
> > On Thu, Mar 28, 2024 at 03:42:40PM +0000, Diogo Ivo wrote:
> > > On Wed, Mar 27, 2024 at 05:06:38PM +0100, Christian Ehrhardt wrote:
> > > > On Wed, Mar 27, 2024 at 12:39:04PM +0000, Diogo Ivo wrote:
> > > > > This fixes a problem with some LG Gram laptops where the PPM sometimes
> > > > > notifies the OPM with the Connector Change Indicator field set in the
> > > > > CCI after an ACK_CC_CI command is sent,causing the UCSI stack to check
> > > > > the connector status indefinitely since the EVENT_PENDING bit is already
> > > > > cleared. This causes an interrupt storm and an artificial high load on
> > > > > these platforms.
> > > > 
> > > > If the PPM does this for a connector change ACK_CC_CI command it is
> > > > IMHO violating the spec (unless there is a _new_ event).
> > > > When I saw this type of loops the connector change indicator was set
> > > > in response to an ACK_CC_CI command for a command (sent by a different
> > > > thread for a different connector) between clearing the EVENT_PENDING
> > > > bit and acquiring the PPM lock.
> > 
> > There are legitimate reaons why the connector change indicator
> > is set in response to a command:
> > - If the condition was reported previously it is sticky until
> >   cleared.
> > - Something else changed on the connector.
> > 
> > For a more complicated device that I have here, there are five
> > different connector change events after plugging it in.
> > 
> > I'd like to understand why you run into a loop here.
> > Printing the completed command (if any) and the CCI in
> > ucsi_acpi_notify() and the details of the connector status in
> > ucsi_handle_connector_change() could shed some light on this.
> 
> You are correct, my initial conclusions were not the cause of the issue.
> After digging a bit more with your ACK early patch set [1] applied the initial
> connector changes in CCI are meaningful:
> 
> /* Connect charge here */
> 
> [88248.531077] ucsi_acpi_notify: CCI: 20000002
> [88248.531084] scheduling connector change
> [88248.531094] COMMAND: 10012
> [88248.690705] ucsi_acpi_notify: CCI: 80000902
> [88248.705038] ucsi_acpi_notify: CCI: 80000902
> [88248.769716] MESSAGE_IN: 0 294024
> [88248.769721] COMMAND: 30004
> [88248.905117] ucsi_acpi_notify: CCI: 20000002
> [88248.905122] scheduling connector change
> [88248.905238] COMMAND: 10012
> [88248.921032] ucsi_acpi_notify: CCI: 20000002
> [88249.052045] ucsi_acpi_notify: CCI: 80000902
> [88249.094680] ucsi_acpi_notify: CCI: 80000902
> [88249.113033] MESSAGE_IN: 1 42851545402b0a44
> [88249.113036] COMMAND: 30004
> [88249.234930] ucsi_acpi_notify: CCI: 20000000
> 
> as the reply to GET_CONNECTOR_STATUS changes.
> 
> Turns out the problematic command seems to be the GET_PDOS for the
> source PDOs of the partner. After testing with multiple chargers and
> hubs we always have this pattern:
> 
> [88249.235144] COMMAND: 700810010
> [88249.431015] ucsi_acpi_notify: CCI: 80001000
> [88249.444510] ucsi_acpi_notify: CCI: 80001000
> [88249.474162] MESSAGE_IN: 641450004b12c 2d12c0801912c
> [88249.474164] COMMAND: 20004
> [88249.569839] ucsi_acpi_notify: CCI: 20000002
> [88249.569843] scheduling connector change
> [88249.569878] COMMAND: 604810010
> [88249.694744] ucsi_acpi_notify: CCI: 80000002
> [88249.756603] ucsi_acpi_notify: CCI: 80000002
> [88249.772341] MESSAGE_IN: 0 0
> [88249.772343] COMMAND: 20004
> [88249.796672] ucsi_acpi_notify: CCI: 80000002
> [88249.932743] ucsi_acpi_notify: CCI: 20000000
> 		...
> [88250.229964] COMMAND: 10012
> [88250.341815] ucsi_acpi_notify: CCI: 80000900
> [88250.385756] ucsi_acpi_notify: CCI: 80000900
> [88250.404292] MESSAGE_IN: 1 42851545402b0060
> 
> And the next time we check the partner source PDOS we get the same,
> 
> [88251.126928] COMMAND: 10012
> [88251.316607] ucsi_acpi_notify: CCI: 80000900
> [88251.330743] ucsi_acpi_notify: CCI: 80000900
> [88251.358123] MESSAGE_IN: 1 42851545402b0000
> [88251.358125] COMMAND: 20004
> [88251.474957] ucsi_acpi_notify: CCI: 20000000
> [88251.475109] COMMAND: 700810010
> [88251.636812] ucsi_acpi_notify: CCI: 80001000
> [88251.695686] ucsi_acpi_notify: CCI: 80001000
> [88251.709026] MESSAGE_IN: 641450004b12c 2d12c0801912c
> [88251.709028] COMMAND: 20004
> [88251.827071] ucsi_acpi_notify: CCI: 20000002
> [88251.827075] scheduling connector change
> [88251.827189] COMMAND: 604810010
> [88251.974138] ucsi_acpi_notify: CCI: 80000002
> [88252.034895] ucsi_acpi_notify: CCI: 80000002
> [88252.054572] ucsi_acpi_notify: CCI: 80000002
> [88252.067847] MESSAGE_IN: 0 0
> [88252.067849] COMMAND: 20004
> [88252.197648] ucsi_acpi_notify: CCI: 20000000
> 
> which then leads to the infinite loop. I have checked that we always get
> the same PDOs and nothing else is changing, leading me to believe that
> this change is not meaningful. The only thing that is changes are bits 5
> and 6 of the GET_CONNECTOR_STATUS reply that follows the GET_PDOS
> command.

If I understand your debug output correctly, each call to GET_PDOS
triggers a connector state change event. However, that event seems
to go away in CCI. In the connector status some changed bits remain.
These changed bits will cause us to re-read the PDOs resulting in
an infinite loop.

If this is about correct, I guess the simplest way to fix this would
be to enable the UCSI_NO_PARTNER_PDOS quirk for affected devices.


Best regards,
Christian
diff mbox series

Patch

diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c
index 0c8f3b3a99d6..b8b39e43aba8 100644
--- a/drivers/usb/typec/ucsi/ucsi.c
+++ b/drivers/usb/typec/ucsi/ucsi.c
@@ -69,6 +69,23 @@  static int ucsi_acknowledge_connector_change(struct ucsi *ucsi)
 	return ucsi->ops->sync_write(ucsi, UCSI_CONTROL, &ctrl, sizeof(ctrl));
 }
 
+static void ucsi_handle_ack_connector_change(struct ucsi_connector *con)
+{
+	struct ucsi *ucsi = con->ucsi;
+	int ret;
+
+	if (list_empty(&con->partner_tasks)) {
+		mutex_lock(&ucsi->ppm_lock);
+		ret = ucsi_acknowledge_connector_change(ucsi);
+		mutex_unlock(&ucsi->ppm_lock);
+
+		if (ret)
+			dev_err(ucsi->dev, "%s: ACK failed (%d)", __func__, ret);
+
+		clear_bit(EVENT_PENDING, &ucsi->flags);
+	}
+}
+
 static int ucsi_exec_command(struct ucsi *ucsi, u64 command);
 
 static int ucsi_read_error(struct ucsi *ucsi)
@@ -222,6 +239,7 @@  static void ucsi_poll_worker(struct work_struct *work)
 		list_del(&uwork->node);
 		mutex_unlock(&con->lock);
 		kfree(uwork);
+		ucsi_handle_ack_connector_change(con);
 		return;
 	}
 
@@ -232,6 +250,7 @@  static void ucsi_poll_worker(struct work_struct *work)
 	} else {
 		list_del(&uwork->node);
 		kfree(uwork);
+		ucsi_handle_ack_connector_change(con);
 	}
 
 	mutex_unlock(&con->lock);
@@ -1215,13 +1234,7 @@  static void ucsi_handle_connector_change(struct work_struct *work)
 	if (con->status.change & UCSI_CONSTAT_CAM_CHANGE)
 		ucsi_partner_task(con, ucsi_check_altmodes, 1, 0);
 
-	clear_bit(EVENT_PENDING, &con->ucsi->flags);
-
-	mutex_lock(&ucsi->ppm_lock);
-	ret = ucsi_acknowledge_connector_change(ucsi);
-	mutex_unlock(&ucsi->ppm_lock);
-	if (ret)
-		dev_err(ucsi->dev, "%s: ACK failed (%d)", __func__, ret);
+	ucsi_handle_ack_connector_change(con);
 
 out_unlock:
 	mutex_unlock(&con->lock);