diff mbox series

[2/3] usb: typec: ucsi: Move unregister out of atomic section

Message ID 20240818-pmic-glink-v6-11-races-v1-2-f87c577e0bc9@quicinc.com (mailing list archive)
State Handled Elsewhere, archived
Headers show
Series soc: qcom: pmic_glink: v6.11-rc bug fixes | expand

Commit Message

Bjorn Andersson Aug. 18, 2024, 11:17 p.m. UTC
Commit 'caa855189104 ("soc: qcom: pmic_glink: Fix race during
initialization")' moved the pmic_glink client list under a spinlock, as
it is accessed by the rpmsg/glink callback, which in turn is invoked
from IRQ context.

This means that ucsi_unregister() is now called from IRQ context, which
isn't feasible as it's expecting a sleepable context. An effort is under
way to get GLINK to invoke its callbacks in a sleepable context, but
until then lets schedule the unregistration.

A side effect of this is that ucsi_unregister() can now happen
after the remote processor, and thereby the communication link with it, is
gone. pmic_glink_send() is amended with a check to avoid the resulting
NULL pointer dereference, but it becomes expecting to see a failing send
upon shutting down the remote processor (e.g. during a restart following
a firmware crash):

  ucsi_glink.pmic_glink_ucsi pmic_glink.ucsi.0: failed to send UCSI write request: -5

Fixes: caa855189104 ("soc: qcom: pmic_glink: Fix race during initialization")
Cc: stable@vger.kernel.org
Signed-off-by: Bjorn Andersson <quic_bjorande@quicinc.com>
---
 drivers/soc/qcom/pmic_glink.c       | 10 +++++++++-
 drivers/usb/typec/ucsi/ucsi_glink.c | 28 +++++++++++++++++++++++-----
 2 files changed, 32 insertions(+), 6 deletions(-)

Comments

Dmitry Baryshkov Aug. 19, 2024, 1:16 a.m. UTC | #1
On 19 August 2024 06:17:38 GMT+07:00, Bjorn Andersson <quic_bjorande@quicinc.com> wrote:
>Commit 'caa855189104 ("soc: qcom: pmic_glink: Fix race during
>initialization")' moved the pmic_glink client list under a spinlock, as
>it is accessed by the rpmsg/glink callback, which in turn is invoked
>from IRQ context.
>
>This means that ucsi_unregister() is now called from IRQ context, which
>isn't feasible as it's expecting a sleepable context. An effort is under
>way to get GLINK to invoke its callbacks in a sleepable context, but
>until then lets schedule the unregistration.
>
>A side effect of this is that ucsi_unregister() can now happen
>after the remote processor, and thereby the communication link with it, is
>gone. pmic_glink_send() is amended with a check to avoid the resulting
>NULL pointer dereference, but it becomes expecting to see a failing send
>upon shutting down the remote processor (e.g. during a restart following
>a firmware crash):
>
>  ucsi_glink.pmic_glink_ucsi pmic_glink.ucsi.0: failed to send UCSI write request: -5
>
>Fixes: caa855189104 ("soc: qcom: pmic_glink: Fix race during initialization")
>Cc: stable@vger.kernel.org
>Signed-off-by: Bjorn Andersson <quic_bjorande@quicinc.com>
>---
> drivers/soc/qcom/pmic_glink.c       | 10 +++++++++-
> drivers/usb/typec/ucsi/ucsi_glink.c | 28 +++++++++++++++++++++++-----
> 2 files changed, 32 insertions(+), 6 deletions(-)
>
>diff --git a/drivers/soc/qcom/pmic_glink.c b/drivers/soc/qcom/pmic_glink.c
>index 58ec91767d79..e4747f1d3da5 100644
>--- a/drivers/soc/qcom/pmic_glink.c
>+++ b/drivers/soc/qcom/pmic_glink.c
>@@ -112,8 +112,16 @@ EXPORT_SYMBOL_GPL(pmic_glink_register_client);
> int pmic_glink_send(struct pmic_glink_client *client, void *data, size_t len)
> {
> 	struct pmic_glink *pg = client->pg;
>+	int ret;
> 
>-	return rpmsg_send(pg->ept, data, len);
>+	mutex_lock(&pg->state_lock);
>+	if (!pg->ept)
>+		ret = -ECONNRESET;
>+	else
>+		ret = rpmsg_send(pg->ept, data, len);
>+	mutex_unlock(&pg->state_lock);
>+
>+	return ret;
> }
> EXPORT_SYMBOL_GPL(pmic_glink_send);
> 
>diff --git a/drivers/usb/typec/ucsi/ucsi_glink.c b/drivers/usb/typec/ucsi/ucsi_glink.c
>index ac53a81c2a81..a33056eec83d 100644
>--- a/drivers/usb/typec/ucsi/ucsi_glink.c
>+++ b/drivers/usb/typec/ucsi/ucsi_glink.c
>@@ -68,6 +68,9 @@ struct pmic_glink_ucsi {
> 
> 	struct work_struct notify_work;
> 	struct work_struct register_work;
>+	spinlock_t state_lock;
>+	unsigned int pdr_state;
>+	unsigned int new_pdr_state;
> 
> 	u8 read_buf[UCSI_BUF_SIZE];
> };
>@@ -244,8 +247,22 @@ static void pmic_glink_ucsi_notify(struct work_struct *work)
> static void pmic_glink_ucsi_register(struct work_struct *work)
> {
> 	struct pmic_glink_ucsi *ucsi = container_of(work, struct pmic_glink_ucsi, register_work);
>+	unsigned long flags;
>+	unsigned int new_state;
>+
>+	spin_lock_irqsave(&ucsi->state_lock, flags);
>+	new_state = ucsi->new_pdr_state;
>+	spin_unlock_irqrestore(&ucsi->state_lock, flags);
>+
>+	if (ucsi->pdr_state != SERVREG_SERVICE_STATE_UP) {
>+		if (new_state == SERVREG_SERVICE_STATE_UP)
>+			ucsi_register(ucsi->ucsi);
>+	} else {
>+		if (new_state == SERVREG_SERVICE_STATE_DOWN)
>+			ucsi_unregister(ucsi->ucsi);
>+	}
> 
>-	ucsi_register(ucsi->ucsi);
>+	ucsi->pdr_state = new_state;
> }

Is there a chance if a race condition if the firmware is restarted quickly, but the system is under heavy mist: 
- the driver gets DOWN event, updates the state and schedules the work,
- the work starts to execute, reads the state,
- the driver gets UP event, updates the state, but the work is not rescheduled as it is still executing 
- the worker finishes unregistering the UCSI.



> 
> static void pmic_glink_ucsi_callback(const void *data, size_t len, void *priv)
>@@ -269,11 +286,12 @@ static void pmic_glink_ucsi_callback(const void *data, size_t len, void *priv)
> static void pmic_glink_ucsi_pdr_notify(void *priv, int state)
> {
> 	struct pmic_glink_ucsi *ucsi = priv;
>+	unsigned long flags;
> 
>-	if (state == SERVREG_SERVICE_STATE_UP)
>-		schedule_work(&ucsi->register_work);
>-	else if (state == SERVREG_SERVICE_STATE_DOWN)
>-		ucsi_unregister(ucsi->ucsi);
>+	spin_lock_irqsave(&ucsi->state_lock, flags);
>+	ucsi->new_pdr_state = state;
>+	spin_unlock_irqrestore(&ucsi->state_lock, flags);
>+	schedule_work(&ucsi->register_work);
> }
> 
> static void pmic_glink_ucsi_destroy(void *data)
>
Bjorn Andersson Aug. 19, 2024, 3:05 a.m. UTC | #2
On Mon, Aug 19, 2024 at 08:16:25AM +0700, Dmitry Baryshkov wrote:
> On 19 August 2024 06:17:38 GMT+07:00, Bjorn Andersson <quic_bjorande@quicinc.com> wrote:
> >Commit 'caa855189104 ("soc: qcom: pmic_glink: Fix race during
> >initialization")' moved the pmic_glink client list under a spinlock, as
> >it is accessed by the rpmsg/glink callback, which in turn is invoked
> >from IRQ context.
> >
> >This means that ucsi_unregister() is now called from IRQ context, which
> >isn't feasible as it's expecting a sleepable context. An effort is under
> >way to get GLINK to invoke its callbacks in a sleepable context, but
> >until then lets schedule the unregistration.
> >
> >A side effect of this is that ucsi_unregister() can now happen
> >after the remote processor, and thereby the communication link with it, is
> >gone. pmic_glink_send() is amended with a check to avoid the resulting
> >NULL pointer dereference, but it becomes expecting to see a failing send
> >upon shutting down the remote processor (e.g. during a restart following
> >a firmware crash):
> >
> >  ucsi_glink.pmic_glink_ucsi pmic_glink.ucsi.0: failed to send UCSI write request: -5
> >
> >Fixes: caa855189104 ("soc: qcom: pmic_glink: Fix race during initialization")
> >Cc: stable@vger.kernel.org
> >Signed-off-by: Bjorn Andersson <quic_bjorande@quicinc.com>
> >---
> > drivers/soc/qcom/pmic_glink.c       | 10 +++++++++-
> > drivers/usb/typec/ucsi/ucsi_glink.c | 28 +++++++++++++++++++++++-----
> > 2 files changed, 32 insertions(+), 6 deletions(-)
> >
> >diff --git a/drivers/soc/qcom/pmic_glink.c b/drivers/soc/qcom/pmic_glink.c
> >index 58ec91767d79..e4747f1d3da5 100644
> >--- a/drivers/soc/qcom/pmic_glink.c
> >+++ b/drivers/soc/qcom/pmic_glink.c
> >@@ -112,8 +112,16 @@ EXPORT_SYMBOL_GPL(pmic_glink_register_client);
> > int pmic_glink_send(struct pmic_glink_client *client, void *data, size_t len)
> > {
> > 	struct pmic_glink *pg = client->pg;
> >+	int ret;
> > 
> >-	return rpmsg_send(pg->ept, data, len);
> >+	mutex_lock(&pg->state_lock);
> >+	if (!pg->ept)
> >+		ret = -ECONNRESET;
> >+	else
> >+		ret = rpmsg_send(pg->ept, data, len);
> >+	mutex_unlock(&pg->state_lock);
> >+
> >+	return ret;
> > }
> > EXPORT_SYMBOL_GPL(pmic_glink_send);
> > 
> >diff --git a/drivers/usb/typec/ucsi/ucsi_glink.c b/drivers/usb/typec/ucsi/ucsi_glink.c
> >index ac53a81c2a81..a33056eec83d 100644
> >--- a/drivers/usb/typec/ucsi/ucsi_glink.c
> >+++ b/drivers/usb/typec/ucsi/ucsi_glink.c
> >@@ -68,6 +68,9 @@ struct pmic_glink_ucsi {
> > 
> > 	struct work_struct notify_work;
> > 	struct work_struct register_work;
> >+	spinlock_t state_lock;
> >+	unsigned int pdr_state;
> >+	unsigned int new_pdr_state;
> > 
> > 	u8 read_buf[UCSI_BUF_SIZE];
> > };
> >@@ -244,8 +247,22 @@ static void pmic_glink_ucsi_notify(struct work_struct *work)
> > static void pmic_glink_ucsi_register(struct work_struct *work)
> > {
> > 	struct pmic_glink_ucsi *ucsi = container_of(work, struct pmic_glink_ucsi, register_work);
> >+	unsigned long flags;
> >+	unsigned int new_state;
> >+
> >+	spin_lock_irqsave(&ucsi->state_lock, flags);
> >+	new_state = ucsi->new_pdr_state;
> >+	spin_unlock_irqrestore(&ucsi->state_lock, flags);
> >+
> >+	if (ucsi->pdr_state != SERVREG_SERVICE_STATE_UP) {
> >+		if (new_state == SERVREG_SERVICE_STATE_UP)
> >+			ucsi_register(ucsi->ucsi);
> >+	} else {
> >+		if (new_state == SERVREG_SERVICE_STATE_DOWN)
> >+			ucsi_unregister(ucsi->ucsi);
> >+	}
> > 
> >-	ucsi_register(ucsi->ucsi);
> >+	ucsi->pdr_state = new_state;
> > }
> 
> Is there a chance if a race condition if the firmware is restarted quickly, but the system is under heavy mist: 
> - the driver gets DOWN event, updates the state and schedules the work,
> - the work starts to execute, reads the state,
> - the driver gets UP event, updates the state, but the work is not rescheduled as it is still executing 
> - the worker finishes unregistering the UCSI.
> 

I was under the impression that if we reach the point where we start
executing the worker, then a second schedule_work() would cause the
worker to run again. But I might be mistaken here.

What I do expect though is that if we for some reason don't start
executing the work before the state becomes UP again, the UCSI core
wouldn't know that the firmware has been reset.


My proposal is to accept this risk for v6.11 (and get the benefit of
things actually working) and then take a new swing at getting rid of all
these workers for v6.12/13. Does that sound reasonable?

Regards,
Bjorn

> 
> 
> > 
> > static void pmic_glink_ucsi_callback(const void *data, size_t len, void *priv)
> >@@ -269,11 +286,12 @@ static void pmic_glink_ucsi_callback(const void *data, size_t len, void *priv)
> > static void pmic_glink_ucsi_pdr_notify(void *priv, int state)
> > {
> > 	struct pmic_glink_ucsi *ucsi = priv;
> >+	unsigned long flags;
> > 
> >-	if (state == SERVREG_SERVICE_STATE_UP)
> >-		schedule_work(&ucsi->register_work);
> >-	else if (state == SERVREG_SERVICE_STATE_DOWN)
> >-		ucsi_unregister(ucsi->ucsi);
> >+	spin_lock_irqsave(&ucsi->state_lock, flags);
> >+	ucsi->new_pdr_state = state;
> >+	spin_unlock_irqrestore(&ucsi->state_lock, flags);
> >+	schedule_work(&ucsi->register_work);
> > }
> > 
> > static void pmic_glink_ucsi_destroy(void *data)
> >
>
Dmitry Baryshkov Aug. 19, 2024, 7:32 a.m. UTC | #3
On 19 August 2024 10:05:49 GMT+07:00, Bjorn Andersson <quic_bjorande@quicinc.com> wrote:
>On Mon, Aug 19, 2024 at 08:16:25AM +0700, Dmitry Baryshkov wrote:
>> On 19 August 2024 06:17:38 GMT+07:00, Bjorn Andersson <quic_bjorande@quicinc.com> wrote:
>> >Commit 'caa855189104 ("soc: qcom: pmic_glink: Fix race during
>> >initialization")' moved the pmic_glink client list under a spinlock, as
>> >it is accessed by the rpmsg/glink callback, which in turn is invoked
>> >from IRQ context.
>> >
>> >This means that ucsi_unregister() is now called from IRQ context, which
>> >isn't feasible as it's expecting a sleepable context. An effort is under
>> >way to get GLINK to invoke its callbacks in a sleepable context, but
>> >until then lets schedule the unregistration.
>> >
>> >A side effect of this is that ucsi_unregister() can now happen
>> >after the remote processor, and thereby the communication link with it, is
>> >gone. pmic_glink_send() is amended with a check to avoid the resulting
>> >NULL pointer dereference, but it becomes expecting to see a failing send
>> >upon shutting down the remote processor (e.g. during a restart following
>> >a firmware crash):
>> >
>> >  ucsi_glink.pmic_glink_ucsi pmic_glink.ucsi.0: failed to send UCSI write request: -5
>> >
>> >Fixes: caa855189104 ("soc: qcom: pmic_glink: Fix race during initialization")
>> >Cc: stable@vger.kernel.org
>> >Signed-off-by: Bjorn Andersson <quic_bjorande@quicinc.com>
>> >---
>> > drivers/soc/qcom/pmic_glink.c       | 10 +++++++++-
>> > drivers/usb/typec/ucsi/ucsi_glink.c | 28 +++++++++++++++++++++++-----
>> > 2 files changed, 32 insertions(+), 6 deletions(-)
>> >
>> >diff --git a/drivers/soc/qcom/pmic_glink.c b/drivers/soc/qcom/pmic_glink.c
>> >index 58ec91767d79..e4747f1d3da5 100644
>> >--- a/drivers/soc/qcom/pmic_glink.c
>> >+++ b/drivers/soc/qcom/pmic_glink.c
>> >@@ -112,8 +112,16 @@ EXPORT_SYMBOL_GPL(pmic_glink_register_client);
>> > int pmic_glink_send(struct pmic_glink_client *client, void *data, size_t len)
>> > {
>> > 	struct pmic_glink *pg = client->pg;
>> >+	int ret;
>> > 
>> >-	return rpmsg_send(pg->ept, data, len);
>> >+	mutex_lock(&pg->state_lock);
>> >+	if (!pg->ept)
>> >+		ret = -ECONNRESET;
>> >+	else
>> >+		ret = rpmsg_send(pg->ept, data, len);
>> >+	mutex_unlock(&pg->state_lock);
>> >+
>> >+	return ret;
>> > }
>> > EXPORT_SYMBOL_GPL(pmic_glink_send);
>> > 
>> >diff --git a/drivers/usb/typec/ucsi/ucsi_glink.c b/drivers/usb/typec/ucsi/ucsi_glink.c
>> >index ac53a81c2a81..a33056eec83d 100644
>> >--- a/drivers/usb/typec/ucsi/ucsi_glink.c
>> >+++ b/drivers/usb/typec/ucsi/ucsi_glink.c
>> >@@ -68,6 +68,9 @@ struct pmic_glink_ucsi {
>> > 
>> > 	struct work_struct notify_work;
>> > 	struct work_struct register_work;
>> >+	spinlock_t state_lock;
>> >+	unsigned int pdr_state;
>> >+	unsigned int new_pdr_state;
>> > 
>> > 	u8 read_buf[UCSI_BUF_SIZE];
>> > };
>> >@@ -244,8 +247,22 @@ static void pmic_glink_ucsi_notify(struct work_struct *work)
>> > static void pmic_glink_ucsi_register(struct work_struct *work)
>> > {
>> > 	struct pmic_glink_ucsi *ucsi = container_of(work, struct pmic_glink_ucsi, register_work);
>> >+	unsigned long flags;
>> >+	unsigned int new_state;
>> >+
>> >+	spin_lock_irqsave(&ucsi->state_lock, flags);
>> >+	new_state = ucsi->new_pdr_state;
>> >+	spin_unlock_irqrestore(&ucsi->state_lock, flags);
>> >+
>> >+	if (ucsi->pdr_state != SERVREG_SERVICE_STATE_UP) {
>> >+		if (new_state == SERVREG_SERVICE_STATE_UP)
>> >+			ucsi_register(ucsi->ucsi);
>> >+	} else {
>> >+		if (new_state == SERVREG_SERVICE_STATE_DOWN)
>> >+			ucsi_unregister(ucsi->ucsi);
>> >+	}
>> > 
>> >-	ucsi_register(ucsi->ucsi);
>> >+	ucsi->pdr_state = new_state;
>> > }
>> 
>> Is there a chance if a race condition if the firmware is restarted quickly, but the system is under heavy mist: 
>> - the driver gets DOWN event, updates the state and schedules the work,
>> - the work starts to execute, reads the state,
>> - the driver gets UP event, updates the state, but the work is not rescheduled as it is still executing 
>> - the worker finishes unregistering the UCSI.
>> 
>
>I was under the impression that if we reach the point where we start
>executing the worker, then a second schedule_work() would cause the
>worker to run again. But I might be mistaken here.

I don't have full source code at hand and the docs only speak about being queued, so it is perfectly possible that I am mistaken here.

>
>What I do expect though is that if we for some reason don't start
>executing the work before the state becomes UP again, the UCSI core
>wouldn't know that the firmware has been reset.
>
>
>My proposal is to accept this risk for v6.11 (and get the benefit of
>things actually working) and then take a new swing at getting rid of all
>these workers for v6.12/13. Does that sound reasonable?


Yes, makes sense to me. 

Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>


>
>Regards,
>Bjorn
>
>> 
>> 
>> > 
>> > static void pmic_glink_ucsi_callback(const void *data, size_t len, void *priv)
>> >@@ -269,11 +286,12 @@ static void pmic_glink_ucsi_callback(const void *data, size_t len, void *priv)
>> > static void pmic_glink_ucsi_pdr_notify(void *priv, int state)
>> > {
>> > 	struct pmic_glink_ucsi *ucsi = priv;
>> >+	unsigned long flags;
>> > 
>> >-	if (state == SERVREG_SERVICE_STATE_UP)
>> >-		schedule_work(&ucsi->register_work);
>> >-	else if (state == SERVREG_SERVICE_STATE_DOWN)
>> >-		ucsi_unregister(ucsi->ucsi);
>> >+	spin_lock_irqsave(&ucsi->state_lock, flags);
>> >+	ucsi->new_pdr_state = state;
>> >+	spin_unlock_irqrestore(&ucsi->state_lock, flags);
>> >+	schedule_work(&ucsi->register_work);
>> > }
>> > 
>> > static void pmic_glink_ucsi_destroy(void *data)
>> >
>>
Neil Armstrong Aug. 19, 2024, 9:16 a.m. UTC | #4
On 19/08/2024 01:17, Bjorn Andersson wrote:
> Commit 'caa855189104 ("soc: qcom: pmic_glink: Fix race during
> initialization")' moved the pmic_glink client list under a spinlock, as
> it is accessed by the rpmsg/glink callback, which in turn is invoked
> from IRQ context.
> 
> This means that ucsi_unregister() is now called from IRQ context, which
> isn't feasible as it's expecting a sleepable context. An effort is under
> way to get GLINK to invoke its callbacks in a sleepable context, but
> until then lets schedule the unregistration.
> 
> A side effect of this is that ucsi_unregister() can now happen
> after the remote processor, and thereby the communication link with it, is
> gone. pmic_glink_send() is amended with a check to avoid the resulting
> NULL pointer dereference, but it becomes expecting to see a failing send
> upon shutting down the remote processor (e.g. during a restart following
> a firmware crash):
> 
>    ucsi_glink.pmic_glink_ucsi pmic_glink.ucsi.0: failed to send UCSI write request: -5
> 
> Fixes: caa855189104 ("soc: qcom: pmic_glink: Fix race during initialization")
> Cc: stable@vger.kernel.org
> Signed-off-by: Bjorn Andersson <quic_bjorande@quicinc.com>
> ---
>   drivers/soc/qcom/pmic_glink.c       | 10 +++++++++-
>   drivers/usb/typec/ucsi/ucsi_glink.c | 28 +++++++++++++++++++++++-----
>   2 files changed, 32 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/soc/qcom/pmic_glink.c b/drivers/soc/qcom/pmic_glink.c
> index 58ec91767d79..e4747f1d3da5 100644
> --- a/drivers/soc/qcom/pmic_glink.c
> +++ b/drivers/soc/qcom/pmic_glink.c
> @@ -112,8 +112,16 @@ EXPORT_SYMBOL_GPL(pmic_glink_register_client);
>   int pmic_glink_send(struct pmic_glink_client *client, void *data, size_t len)
>   {
>   	struct pmic_glink *pg = client->pg;
> +	int ret;
>   
> -	return rpmsg_send(pg->ept, data, len);
> +	mutex_lock(&pg->state_lock);
> +	if (!pg->ept)
> +		ret = -ECONNRESET;
> +	else
> +		ret = rpmsg_send(pg->ept, data, len);
> +	mutex_unlock(&pg->state_lock);
> +
> +	return ret;
>   }
>   EXPORT_SYMBOL_GPL(pmic_glink_send);
>   
> diff --git a/drivers/usb/typec/ucsi/ucsi_glink.c b/drivers/usb/typec/ucsi/ucsi_glink.c
> index ac53a81c2a81..a33056eec83d 100644
> --- a/drivers/usb/typec/ucsi/ucsi_glink.c
> +++ b/drivers/usb/typec/ucsi/ucsi_glink.c
> @@ -68,6 +68,9 @@ struct pmic_glink_ucsi {
>   
>   	struct work_struct notify_work;
>   	struct work_struct register_work;
> +	spinlock_t state_lock;
> +	unsigned int pdr_state;
> +	unsigned int new_pdr_state;
>   
>   	u8 read_buf[UCSI_BUF_SIZE];
>   };
> @@ -244,8 +247,22 @@ static void pmic_glink_ucsi_notify(struct work_struct *work)
>   static void pmic_glink_ucsi_register(struct work_struct *work)
>   {
>   	struct pmic_glink_ucsi *ucsi = container_of(work, struct pmic_glink_ucsi, register_work);
> +	unsigned long flags;
> +	unsigned int new_state;
> +
> +	spin_lock_irqsave(&ucsi->state_lock, flags);
> +	new_state = ucsi->new_pdr_state;
> +	spin_unlock_irqrestore(&ucsi->state_lock, flags);
> +
> +	if (ucsi->pdr_state != SERVREG_SERVICE_STATE_UP) {
> +		if (new_state == SERVREG_SERVICE_STATE_UP)
> +			ucsi_register(ucsi->ucsi);
> +	} else {
> +		if (new_state == SERVREG_SERVICE_STATE_DOWN)
> +			ucsi_unregister(ucsi->ucsi);
> +	}
>   
> -	ucsi_register(ucsi->ucsi);
> +	ucsi->pdr_state = new_state;
>   }
>   
>   static void pmic_glink_ucsi_callback(const void *data, size_t len, void *priv)
> @@ -269,11 +286,12 @@ static void pmic_glink_ucsi_callback(const void *data, size_t len, void *priv)
>   static void pmic_glink_ucsi_pdr_notify(void *priv, int state)
>   {
>   	struct pmic_glink_ucsi *ucsi = priv;
> +	unsigned long flags;
>   
> -	if (state == SERVREG_SERVICE_STATE_UP)
> -		schedule_work(&ucsi->register_work);
> -	else if (state == SERVREG_SERVICE_STATE_DOWN)
> -		ucsi_unregister(ucsi->ucsi);
> +	spin_lock_irqsave(&ucsi->state_lock, flags);
> +	ucsi->new_pdr_state = state;
> +	spin_unlock_irqrestore(&ucsi->state_lock, flags);
> +	schedule_work(&ucsi->register_work);
>   }
>   
>   static void pmic_glink_ucsi_destroy(void *data)
> 

Looks good, I'll run it on the 8550/8650 platforms

Reviewed-by: Neil Armstrong <neil.armstrong@linaro.org>
Heikki Krogerus Aug. 19, 2024, 1:27 p.m. UTC | #5
On Sun, Aug 18, 2024 at 04:17:38PM -0700, Bjorn Andersson wrote:
> Commit 'caa855189104 ("soc: qcom: pmic_glink: Fix race during
> initialization")' moved the pmic_glink client list under a spinlock, as
> it is accessed by the rpmsg/glink callback, which in turn is invoked
> from IRQ context.
> 
> This means that ucsi_unregister() is now called from IRQ context, which
> isn't feasible as it's expecting a sleepable context. An effort is under
> way to get GLINK to invoke its callbacks in a sleepable context, but
> until then lets schedule the unregistration.
> 
> A side effect of this is that ucsi_unregister() can now happen
> after the remote processor, and thereby the communication link with it, is
> gone. pmic_glink_send() is amended with a check to avoid the resulting
> NULL pointer dereference, but it becomes expecting to see a failing send
> upon shutting down the remote processor (e.g. during a restart following
> a firmware crash):
> 
>   ucsi_glink.pmic_glink_ucsi pmic_glink.ucsi.0: failed to send UCSI write request: -5
> 
> Fixes: caa855189104 ("soc: qcom: pmic_glink: Fix race during initialization")
> Cc: stable@vger.kernel.org
> Signed-off-by: Bjorn Andersson <quic_bjorande@quicinc.com>

Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>

> ---
>  drivers/soc/qcom/pmic_glink.c       | 10 +++++++++-
>  drivers/usb/typec/ucsi/ucsi_glink.c | 28 +++++++++++++++++++++++-----
>  2 files changed, 32 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/soc/qcom/pmic_glink.c b/drivers/soc/qcom/pmic_glink.c
> index 58ec91767d79..e4747f1d3da5 100644
> --- a/drivers/soc/qcom/pmic_glink.c
> +++ b/drivers/soc/qcom/pmic_glink.c
> @@ -112,8 +112,16 @@ EXPORT_SYMBOL_GPL(pmic_glink_register_client);
>  int pmic_glink_send(struct pmic_glink_client *client, void *data, size_t len)
>  {
>  	struct pmic_glink *pg = client->pg;
> +	int ret;
>  
> -	return rpmsg_send(pg->ept, data, len);
> +	mutex_lock(&pg->state_lock);
> +	if (!pg->ept)
> +		ret = -ECONNRESET;
> +	else
> +		ret = rpmsg_send(pg->ept, data, len);
> +	mutex_unlock(&pg->state_lock);
> +
> +	return ret;
>  }
>  EXPORT_SYMBOL_GPL(pmic_glink_send);
>  
> diff --git a/drivers/usb/typec/ucsi/ucsi_glink.c b/drivers/usb/typec/ucsi/ucsi_glink.c
> index ac53a81c2a81..a33056eec83d 100644
> --- a/drivers/usb/typec/ucsi/ucsi_glink.c
> +++ b/drivers/usb/typec/ucsi/ucsi_glink.c
> @@ -68,6 +68,9 @@ struct pmic_glink_ucsi {
>  
>  	struct work_struct notify_work;
>  	struct work_struct register_work;
> +	spinlock_t state_lock;
> +	unsigned int pdr_state;
> +	unsigned int new_pdr_state;
>  
>  	u8 read_buf[UCSI_BUF_SIZE];
>  };
> @@ -244,8 +247,22 @@ static void pmic_glink_ucsi_notify(struct work_struct *work)
>  static void pmic_glink_ucsi_register(struct work_struct *work)
>  {
>  	struct pmic_glink_ucsi *ucsi = container_of(work, struct pmic_glink_ucsi, register_work);
> +	unsigned long flags;
> +	unsigned int new_state;
> +
> +	spin_lock_irqsave(&ucsi->state_lock, flags);
> +	new_state = ucsi->new_pdr_state;
> +	spin_unlock_irqrestore(&ucsi->state_lock, flags);
> +
> +	if (ucsi->pdr_state != SERVREG_SERVICE_STATE_UP) {
> +		if (new_state == SERVREG_SERVICE_STATE_UP)
> +			ucsi_register(ucsi->ucsi);
> +	} else {
> +		if (new_state == SERVREG_SERVICE_STATE_DOWN)
> +			ucsi_unregister(ucsi->ucsi);
> +	}
>  
> -	ucsi_register(ucsi->ucsi);
> +	ucsi->pdr_state = new_state;
>  }
>  
>  static void pmic_glink_ucsi_callback(const void *data, size_t len, void *priv)
> @@ -269,11 +286,12 @@ static void pmic_glink_ucsi_callback(const void *data, size_t len, void *priv)
>  static void pmic_glink_ucsi_pdr_notify(void *priv, int state)
>  {
>  	struct pmic_glink_ucsi *ucsi = priv;
> +	unsigned long flags;
>  
> -	if (state == SERVREG_SERVICE_STATE_UP)
> -		schedule_work(&ucsi->register_work);
> -	else if (state == SERVREG_SERVICE_STATE_DOWN)
> -		ucsi_unregister(ucsi->ucsi);
> +	spin_lock_irqsave(&ucsi->state_lock, flags);
> +	ucsi->new_pdr_state = state;
> +	spin_unlock_irqrestore(&ucsi->state_lock, flags);
> +	schedule_work(&ucsi->register_work);
>  }
>  
>  static void pmic_glink_ucsi_destroy(void *data)
> 
> -- 
> 2.34.1
Johan Hovold Aug. 19, 2024, 3:06 p.m. UTC | #6
On Sun, Aug 18, 2024 at 04:17:38PM -0700, Bjorn Andersson wrote:
> Commit 'caa855189104 ("soc: qcom: pmic_glink: Fix race during
> initialization")' 

This commit does not exist, but I think you really meant to refer to

	9329933699b3 ("soc: qcom: pmic_glink: Make client-lock non-sleeping")

and possibly also

	635ce0db8956 ("soc: qcom: pmic_glink: don't traverse clients list without a lock")

here.

> moved the pmic_glink client list under a spinlock, as
> it is accessed by the rpmsg/glink callback, which in turn is invoked
> from IRQ context.
> 
> This means that ucsi_unregister() is now called from IRQ context, which
> isn't feasible as it's expecting a sleepable context.

But this is not correct as you say above that the callback has always
been made in IRQ context. Then this bug has been there since the
introduction of the UCSI driver by commit

	62b5412b1f4a ("usb: typec: ucsi: add PMIC Glink UCSI driver")

> An effort is under
> way to get GLINK to invoke its callbacks in a sleepable context, but
> until then lets schedule the unregistration.
> 
> A side effect of this is that ucsi_unregister() can now happen
> after the remote processor, and thereby the communication link with it, is
> gone. pmic_glink_send() is amended with a check to avoid the resulting
> NULL pointer dereference, but it becomes expecting to see a failing send

Perhaps you can rephrase this bit ("becomes expecting to see").

> upon shutting down the remote processor (e.g. during a restart following
> a firmware crash):
> 
>   ucsi_glink.pmic_glink_ucsi pmic_glink.ucsi.0: failed to send UCSI write request: -5
> 
> Fixes: caa855189104 ("soc: qcom: pmic_glink: Fix race during initialization")

So this should be

Fixes: 62b5412b1f4a ("usb: typec: ucsi: add PMIC Glink UCSI driver")

> Cc: stable@vger.kernel.org
> Signed-off-by: Bjorn Andersson <quic_bjorande@quicinc.com>
 
> diff --git a/drivers/usb/typec/ucsi/ucsi_glink.c b/drivers/usb/typec/ucsi/ucsi_glink.c
> index ac53a81c2a81..a33056eec83d 100644
> --- a/drivers/usb/typec/ucsi/ucsi_glink.c
> +++ b/drivers/usb/typec/ucsi/ucsi_glink.c
> @@ -68,6 +68,9 @@ struct pmic_glink_ucsi {
>  
>  	struct work_struct notify_work;
>  	struct work_struct register_work;
> +	spinlock_t state_lock;
> +	unsigned int pdr_state;
> +	unsigned int new_pdr_state;

Should these be int to match the notify callback (and enum
servreg_service_state)?

>  	u8 read_buf[UCSI_BUF_SIZE];
>  };
> @@ -244,8 +247,22 @@ static void pmic_glink_ucsi_notify(struct work_struct *work)
>  static void pmic_glink_ucsi_register(struct work_struct *work)
>  {
>  	struct pmic_glink_ucsi *ucsi = container_of(work, struct pmic_glink_ucsi, register_work);
> +	unsigned long flags;
> +	unsigned int new_state;

Then int here too.

> +
> +	spin_lock_irqsave(&ucsi->state_lock, flags);
> +	new_state = ucsi->new_pdr_state;
> +	spin_unlock_irqrestore(&ucsi->state_lock, flags);
> +
> +	if (ucsi->pdr_state != SERVREG_SERVICE_STATE_UP) {
> +		if (new_state == SERVREG_SERVICE_STATE_UP)
> +			ucsi_register(ucsi->ucsi);
> +	} else {
> +		if (new_state == SERVREG_SERVICE_STATE_DOWN)
> +			ucsi_unregister(ucsi->ucsi);

Do you risk a double deregistration (and UAF/double free) here?

> +	}
>  
> -	ucsi_register(ucsi->ucsi);
> +	ucsi->pdr_state = new_state;
>  }

Johan
Johan Hovold Aug. 19, 2024, 3:33 p.m. UTC | #7
On Sun, Aug 18, 2024 at 04:17:38PM -0700, Bjorn Andersson wrote:

> @@ -68,6 +68,9 @@ struct pmic_glink_ucsi {
>  
>  	struct work_struct notify_work;
>  	struct work_struct register_work;
> +	spinlock_t state_lock;

You also never initialise this lock...

Lockdep would have let you know with a big splat.

Johan
Bjorn Andersson Aug. 19, 2024, 4:45 p.m. UTC | #8
On Mon, Aug 19, 2024 at 05:06:58PM +0200, Johan Hovold wrote:
> On Sun, Aug 18, 2024 at 04:17:38PM -0700, Bjorn Andersson wrote:
> > Commit 'caa855189104 ("soc: qcom: pmic_glink: Fix race during
> > initialization")' 
> 
> This commit does not exist, but I think you really meant to refer to
> 
> 	9329933699b3 ("soc: qcom: pmic_glink: Make client-lock non-sleeping")
> 
> and possibly also
> 
> 	635ce0db8956 ("soc: qcom: pmic_glink: don't traverse clients list without a lock")
> 
> here.
> 

Yeah, I copy-pasted the wrong SHA1. Prior to commit 9329933699b3 ("soc:
qcom: pmic_glink: Make client-lock non-sleeping") the PDR notification
happened from a worker with only mutexes held.

> > moved the pmic_glink client list under a spinlock, as
> > it is accessed by the rpmsg/glink callback, which in turn is invoked
> > from IRQ context.
> > 
> > This means that ucsi_unregister() is now called from IRQ context, which
> > isn't feasible as it's expecting a sleepable context.
> 
> But this is not correct as you say above that the callback has always
> been made in IRQ context. Then this bug has been there since the
> introduction of the UCSI driver by commit
> 

No, I'm stating that commit 9329933699b3 ("soc: qcom: pmic_glink: Make
client-lock non-sleeping") was needed because the client list is
traversed under the separate glink callback, which has always been made
in IRQ context.

> 	62b5412b1f4a ("usb: typec: ucsi: add PMIC Glink UCSI driver")
> 
> > An effort is under
> > way to get GLINK to invoke its callbacks in a sleepable context, but
> > until then lets schedule the unregistration.
> > 
> > A side effect of this is that ucsi_unregister() can now happen
> > after the remote processor, and thereby the communication link with it, is
> > gone. pmic_glink_send() is amended with a check to avoid the resulting
> > NULL pointer dereference, but it becomes expecting to see a failing send
> 
> Perhaps you can rephrase this bit ("becomes expecting to see").
> 

Sure.

> > upon shutting down the remote processor (e.g. during a restart following
> > a firmware crash):
> > 
> >   ucsi_glink.pmic_glink_ucsi pmic_glink.ucsi.0: failed to send UCSI write request: -5
> > 
> > Fixes: caa855189104 ("soc: qcom: pmic_glink: Fix race during initialization")
> 
> So this should be
> 
> Fixes: 62b5412b1f4a ("usb: typec: ucsi: add PMIC Glink UCSI driver")
> 

I think it should be:

9329933699b3 ("soc: qcom: pmic_glink: Make client-lock non-sleeping")

> > Cc: stable@vger.kernel.org
> > Signed-off-by: Bjorn Andersson <quic_bjorande@quicinc.com>
>  
> > diff --git a/drivers/usb/typec/ucsi/ucsi_glink.c b/drivers/usb/typec/ucsi/ucsi_glink.c
> > index ac53a81c2a81..a33056eec83d 100644
> > --- a/drivers/usb/typec/ucsi/ucsi_glink.c
> > +++ b/drivers/usb/typec/ucsi/ucsi_glink.c
> > @@ -68,6 +68,9 @@ struct pmic_glink_ucsi {
> >  
> >  	struct work_struct notify_work;
> >  	struct work_struct register_work;
> > +	spinlock_t state_lock;
> > +	unsigned int pdr_state;
> > +	unsigned int new_pdr_state;
> 
> Should these be int to match the notify callback (and enum
> servreg_service_state)?
> 

Ohh my. I made it unsigned because I made it unsigned in pmic_glink,
when I wrote that. But as you point out, the type passed around is an
enum servreg_service_state and it's mostly handled as a signed int.

That said, pmic_glink actually filters the value space down to UP/DOWN,
so making this "bool pdr_up" (pd_running?) and "bool ucsi_registered"
would make this cleaner...

> >  	u8 read_buf[UCSI_BUF_SIZE];
> >  };
> > @@ -244,8 +247,22 @@ static void pmic_glink_ucsi_notify(struct work_struct *work)
> >  static void pmic_glink_ucsi_register(struct work_struct *work)
> >  {
> >  	struct pmic_glink_ucsi *ucsi = container_of(work, struct pmic_glink_ucsi, register_work);
> > +	unsigned long flags;
> > +	unsigned int new_state;
> 
> Then int here too.
> 

Yes.

> > +
> > +	spin_lock_irqsave(&ucsi->state_lock, flags);
> > +	new_state = ucsi->new_pdr_state;
> > +	spin_unlock_irqrestore(&ucsi->state_lock, flags);
> > +
> > +	if (ucsi->pdr_state != SERVREG_SERVICE_STATE_UP) {
> > +		if (new_state == SERVREG_SERVICE_STATE_UP)
> > +			ucsi_register(ucsi->ucsi);
> > +	} else {
> > +		if (new_state == SERVREG_SERVICE_STATE_DOWN)
> > +			ucsi_unregister(ucsi->ucsi);
> 
> Do you risk a double deregistration (and UAF/double free) here?
> 

I believe we're good.

Thank you,
Bjorn

> > +	}
> >  
> > -	ucsi_register(ucsi->ucsi);
> > +	ucsi->pdr_state = new_state;
> >  }
> 
> Johan
Johan Hovold Aug. 20, 2024, 6:34 a.m. UTC | #9
On Mon, Aug 19, 2024 at 09:45:29AM -0700, Bjorn Andersson wrote:
> On Mon, Aug 19, 2024 at 05:06:58PM +0200, Johan Hovold wrote:
> > On Sun, Aug 18, 2024 at 04:17:38PM -0700, Bjorn Andersson wrote:
> > > Commit 'caa855189104 ("soc: qcom: pmic_glink: Fix race during
> > > initialization")' 
> > 
> > This commit does not exist, but I think you really meant to refer to
> > 
> > 	9329933699b3 ("soc: qcom: pmic_glink: Make client-lock non-sleeping")
> > 
> > and possibly also
> > 
> > 	635ce0db8956 ("soc: qcom: pmic_glink: don't traverse clients list without a lock")
> > 
> > here.
> > 
> 
> Yeah, I copy-pasted the wrong SHA1. Prior to commit 9329933699b3 ("soc:
> qcom: pmic_glink: Make client-lock non-sleeping") the PDR notification
> happened from a worker with only mutexes held.
> 
> > > moved the pmic_glink client list under a spinlock, as
> > > it is accessed by the rpmsg/glink callback, which in turn is invoked
> > > from IRQ context.
> > > 
> > > This means that ucsi_unregister() is now called from IRQ context, which
                                                           ^^^^^^^^^^^

> > > isn't feasible as it's expecting a sleepable context.
> > 
> > But this is not correct as you say above that the callback has always
> > been made in IRQ context. Then this bug has been there since the
> > introduction of the UCSI driver by commit
> > 
> 
> No, I'm stating that commit 9329933699b3 ("soc: qcom: pmic_glink: Make
> client-lock non-sleeping") was needed because the client list is
> traversed under the separate glink callback, which has always been made
> in IRQ context.

Ok, got it. But then you meant "atomic context", not "IRQ context", in
the paragraph above.

Johan
diff mbox series

Patch

diff --git a/drivers/soc/qcom/pmic_glink.c b/drivers/soc/qcom/pmic_glink.c
index 58ec91767d79..e4747f1d3da5 100644
--- a/drivers/soc/qcom/pmic_glink.c
+++ b/drivers/soc/qcom/pmic_glink.c
@@ -112,8 +112,16 @@  EXPORT_SYMBOL_GPL(pmic_glink_register_client);
 int pmic_glink_send(struct pmic_glink_client *client, void *data, size_t len)
 {
 	struct pmic_glink *pg = client->pg;
+	int ret;
 
-	return rpmsg_send(pg->ept, data, len);
+	mutex_lock(&pg->state_lock);
+	if (!pg->ept)
+		ret = -ECONNRESET;
+	else
+		ret = rpmsg_send(pg->ept, data, len);
+	mutex_unlock(&pg->state_lock);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(pmic_glink_send);
 
diff --git a/drivers/usb/typec/ucsi/ucsi_glink.c b/drivers/usb/typec/ucsi/ucsi_glink.c
index ac53a81c2a81..a33056eec83d 100644
--- a/drivers/usb/typec/ucsi/ucsi_glink.c
+++ b/drivers/usb/typec/ucsi/ucsi_glink.c
@@ -68,6 +68,9 @@  struct pmic_glink_ucsi {
 
 	struct work_struct notify_work;
 	struct work_struct register_work;
+	spinlock_t state_lock;
+	unsigned int pdr_state;
+	unsigned int new_pdr_state;
 
 	u8 read_buf[UCSI_BUF_SIZE];
 };
@@ -244,8 +247,22 @@  static void pmic_glink_ucsi_notify(struct work_struct *work)
 static void pmic_glink_ucsi_register(struct work_struct *work)
 {
 	struct pmic_glink_ucsi *ucsi = container_of(work, struct pmic_glink_ucsi, register_work);
+	unsigned long flags;
+	unsigned int new_state;
+
+	spin_lock_irqsave(&ucsi->state_lock, flags);
+	new_state = ucsi->new_pdr_state;
+	spin_unlock_irqrestore(&ucsi->state_lock, flags);
+
+	if (ucsi->pdr_state != SERVREG_SERVICE_STATE_UP) {
+		if (new_state == SERVREG_SERVICE_STATE_UP)
+			ucsi_register(ucsi->ucsi);
+	} else {
+		if (new_state == SERVREG_SERVICE_STATE_DOWN)
+			ucsi_unregister(ucsi->ucsi);
+	}
 
-	ucsi_register(ucsi->ucsi);
+	ucsi->pdr_state = new_state;
 }
 
 static void pmic_glink_ucsi_callback(const void *data, size_t len, void *priv)
@@ -269,11 +286,12 @@  static void pmic_glink_ucsi_callback(const void *data, size_t len, void *priv)
 static void pmic_glink_ucsi_pdr_notify(void *priv, int state)
 {
 	struct pmic_glink_ucsi *ucsi = priv;
+	unsigned long flags;
 
-	if (state == SERVREG_SERVICE_STATE_UP)
-		schedule_work(&ucsi->register_work);
-	else if (state == SERVREG_SERVICE_STATE_DOWN)
-		ucsi_unregister(ucsi->ucsi);
+	spin_lock_irqsave(&ucsi->state_lock, flags);
+	ucsi->new_pdr_state = state;
+	spin_unlock_irqrestore(&ucsi->state_lock, flags);
+	schedule_work(&ucsi->register_work);
 }
 
 static void pmic_glink_ucsi_destroy(void *data)