diff mbox series

[RFC] usb: typec: ucsi: ccg: Remove run_isr flag

Message ID 20190923133101.30774-1-heikki.krogerus@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series [RFC] usb: typec: ucsi: ccg: Remove run_isr flag | expand

Commit Message

Heikki Krogerus Sept. 23, 2019, 1:31 p.m. UTC
There is no need to try to prevent the extra ucsi_notify()
that runtime resuming the device will cause.

This fixes potential deadlock. Both ccg_read() and
ccg_write() are called with the mutex already taken at least
from ccg_send_command(). In ccg_read() and ccg_write, the
mutex is only acquired so that run_isr flag can be set.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
---
Hi Ajay,

Before going forward with this I would like to get confirmation from
you that it is OK, and that I'm not missing anything. I did not see
any real purpose for that run_isr flag. The only thing that I can see
it preventing is an extra ucsi_notify() call caused by the waking of
the controller, but that should not be a problem. Is there any other
reason why the flag is there?

If the driver works fine without the flag, then let's just drop it.
The deadlock will need to be fixed in any case.

thanks,

---
 drivers/usb/typec/ucsi/ucsi_ccg.c | 40 ++-----------------------------
 1 file changed, 2 insertions(+), 38 deletions(-)

Comments

Ajay Gupta Sept. 23, 2019, 6:15 p.m. UTC | #1
Hi Heikki,

> -----Original Message-----
> From: linux-usb-owner@vger.kernel.org <linux-usb-owner@vger.kernel.org>
> On Behalf Of Heikki Krogerus
> Sent: Monday, September 23, 2019 6:31 AM
> To: Ajay Gupta <ajayg@nvidia.com>
> Cc: linux-usb@vger.kernel.org
> Subject: [RFC PATCH] usb: typec: ucsi: ccg: Remove run_isr flag
> 
> There is no need to try to prevent the extra ucsi_notify() that runtime
> resuming the device will cause.
> 
> This fixes potential deadlock. Both ccg_read() and
> ccg_write() are called with the mutex already taken at least from
> ccg_send_command(). In ccg_read() and ccg_write, the mutex is only acquired
> so that run_isr flag can be set.
> 
> Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
> ---
> Hi Ajay,
> 
> Before going forward with this I would like to get confirmation from you that it
> is OK, and that I'm not missing anything. 
Thanks for this. I mixed up firmware flashing and IO path by using same lock.

>I did not see any real purpose for that run_isr flag. 
> The only thing that I can see it preventing is an extra ucsi_notify()
> call caused by the waking of the controller, but that should not be a problem.
> Is there any other reason why the flag is there?
ucsi_ccg_runtime_resume() will get called after pm_runtime_get_sync(uc->dev);
in ccg_read() and ccg_write(). If we allow extra ucsi_notify() then ccg_read() and
ccg_write() will get called again from ucsi_notfiy() through ucsi_sync(). This will
keep on happening in a loop and the controller will remain in D0 always and 
runtime pm will never be done.

> 
> If the driver works fine without the flag, then let's just drop it.
> The deadlock will need to be fixed in any case.

We need to protect read/write of run_isr flag from ccg_read()/ccg_write() and
ucsi_ccg_runtime_resume() since they can get called from interrupt and runtime
pm threads.

I propose to add new "uc->pm_lock" for this purpose and not use "uc->lock".
Please see the change below for this. I tested both FW flashing and runtime PM
and they both work with new pm_lock.

diff --git a/drivers/usb/typec/ucsi/ucsi_ccg.c b/drivers/usb/typec/ucsi/ucsi_ccg.c
index ed727b2..a79a475 100644
--- a/drivers/usb/typec/ucsi/ucsi_ccg.c
+++ b/drivers/usb/typec/ucsi/ucsi_ccg.c
@@ -206,6 +206,7 @@ struct ucsi_ccg {
        /* fw build with vendor information */
        u16 fw_build;
        bool run_isr; /* flag to call ISR routine during resume */
+       struct mutex pm_lock; /* to sync between io and system pm thread */
        struct work_struct pm_work;

        bool has_multiple_dp;
@@ -240,14 +241,14 @@ static int ccg_read(struct ucsi_ccg *uc, u16 rab, u8 *data, u32 len)

        if (uc->fw_build == CCG_FW_BUILD_NVIDIA &&
            uc->fw_version <= CCG_OLD_FW_VERSION) {
-               mutex_lock(&uc->lock);
+               mutex_lock(&uc->pm_lock);
                /*
                 * Do not schedule pm_work to run ISR in
                 * ucsi_ccg_runtime_resume() after pm_runtime_get_sync()
                 * since we are already in ISR path.
                 */
                uc->run_isr = false;
-               mutex_unlock(&uc->lock);
+               mutex_unlock(&uc->pm_lock);
        }

        pm_runtime_get_sync(uc->dev);
@@ -294,14 +295,14 @@ static int ccg_write(struct ucsi_ccg *uc, u16 rab, u8 *data, u32 len)

        if (uc->fw_build == CCG_FW_BUILD_NVIDIA &&
            uc->fw_version <= CCG_OLD_FW_VERSION) {
-               mutex_lock(&uc->lock);
+               mutex_lock(&uc->pm_lock);
                /*
                 * Do not schedule pm_work to run ISR in
                 * ucsi_ccg_runtime_resume() after pm_runtime_get_sync()
                 * since we are already in ISR path.
                 */
                uc->run_isr = false;
-               mutex_unlock(&uc->lock);
+               mutex_unlock(&uc->pm_lock);
        }

        pm_runtime_get_sync(uc->dev);
@@ -1323,6 +1324,7 @@ static int ucsi_ccg_probe(struct i2c_client *client,
        uc->client = client;
        uc->run_isr = true;
        mutex_init(&uc->lock);
+       mutex_init(&uc->pm_lock);
        INIT_WORK(&uc->work, ccg_update_firmware);
        INIT_WORK(&uc->pm_work, ccg_pm_workaround_work);
@@ -1434,12 +1436,12 @@ static int ucsi_ccg_runtime_resume(struct device *dev)
         */
        if (uc->fw_build == CCG_FW_BUILD_NVIDIA &&
            uc->fw_version <= CCG_OLD_FW_VERSION) {
-               mutex_lock(&uc->lock);
+               mutex_lock(&uc->pm_lock);
                if (!uc->run_isr) {
                        uc->run_isr = true;
                        schedule = false;
                }
-               mutex_unlock(&uc->lock);
+               mutex_unlock(&uc->pm_lock);

                if (schedule)
                        schedule_work(&uc->pm_work);


Thanks
> nvpublic
> thanks,
> 
> ---
>  drivers/usb/typec/ucsi/ucsi_ccg.c | 40 ++-----------------------------
>  1 file changed, 2 insertions(+), 38 deletions(-)
> 
> diff --git a/drivers/usb/typec/ucsi/ucsi_ccg.c
> b/drivers/usb/typec/ucsi/ucsi_ccg.c
> index 907e20e1a71e..167cb6367198 100644
> --- a/drivers/usb/typec/ucsi/ucsi_ccg.c
> +++ b/drivers/usb/typec/ucsi/ucsi_ccg.c
> @@ -195,7 +195,6 @@ struct ucsi_ccg {
> 
>  	/* fw build with vendor information */
>  	u16 fw_build;
> -	bool run_isr; /* flag to call ISR routine during resume */
>  	struct work_struct pm_work;
>  };
> 
> @@ -224,18 +223,6 @@ static int ccg_read(struct ucsi_ccg *uc, u16 rab, u8
> *data, u32 len)
>  	if (quirks && quirks->max_read_len)
>  		max_read_len = quirks->max_read_len;
> 
> -	if (uc->fw_build == CCG_FW_BUILD_NVIDIA &&
> -	    uc->fw_version <= CCG_OLD_FW_VERSION) {
> -		mutex_lock(&uc->lock);
> -		/*
> -		 * Do not schedule pm_work to run ISR in
> -		 * ucsi_ccg_runtime_resume() after pm_runtime_get_sync()
> -		 * since we are already in ISR path.
> -		 */
> -		uc->run_isr = false;
> -		mutex_unlock(&uc->lock);
> -	}
> -
>  	pm_runtime_get_sync(uc->dev);
>  	while (rem_len > 0) {
>  		msgs[1].buf = &data[len - rem_len];
> @@ -278,18 +265,6 @@ static int ccg_write(struct ucsi_ccg *uc, u16 rab, u8
> *data, u32 len)
>  	msgs[0].len = len + sizeof(rab);
>  	msgs[0].buf = buf;
> 
> -	if (uc->fw_build == CCG_FW_BUILD_NVIDIA &&
> -	    uc->fw_version <= CCG_OLD_FW_VERSION) {
> -		mutex_lock(&uc->lock);
> -		/*
> -		 * Do not schedule pm_work to run ISR in
> -		 * ucsi_ccg_runtime_resume() after pm_runtime_get_sync()
> -		 * since we are already in ISR path.
> -		 */
> -		uc->run_isr = false;
> -		mutex_unlock(&uc->lock);
> -	}
> -
>  	pm_runtime_get_sync(uc->dev);
>  	status = i2c_transfer(client->adapter, msgs, ARRAY_SIZE(msgs));
>  	if (status < 0) {
> @@ -1130,7 +1105,6 @@ static int ucsi_ccg_probe(struct i2c_client *client,
>  	uc->ppm.sync = ucsi_ccg_sync;
>  	uc->dev = dev;
>  	uc->client = client;
> -	uc->run_isr = true;
>  	mutex_init(&uc->lock);
>  	INIT_WORK(&uc->work, ccg_update_firmware);
>  	INIT_WORK(&uc->pm_work, ccg_pm_workaround_work); @@ -1229,7
> +1203,6 @@ static int ucsi_ccg_runtime_resume(struct device *dev)  {
>  	struct i2c_client *client = to_i2c_client(dev);
>  	struct ucsi_ccg *uc = i2c_get_clientdata(client);
> -	bool schedule = true;
> 
>  	/*
>  	 * Firmware version 3.1.10 or earlier, built for NVIDIA has known issue
> @@ -1237,17 +1210,8 @@ static int ucsi_ccg_runtime_resume(struct device
> *dev)
>  	 * Schedule a work to call ISR as a workaround.
>  	 */
>  	if (uc->fw_build == CCG_FW_BUILD_NVIDIA &&
> -	    uc->fw_version <= CCG_OLD_FW_VERSION) {
> -		mutex_lock(&uc->lock);
> -		if (!uc->run_isr) {
> -			uc->run_isr = true;
> -			schedule = false;
> -		}
> -		mutex_unlock(&uc->lock);
> -
> -		if (schedule)
> -			schedule_work(&uc->pm_work);
> -	}
> +	    uc->fw_version <= CCG_OLD_FW_VERSION)
> +		schedule_work(&uc->pm_work);
> 
>  	return 0;
>  }
> --
> 2.23.0
Heikki Krogerus Sept. 24, 2019, 8:24 a.m. UTC | #2
Hi Ajay,

On Mon, Sep 23, 2019 at 06:15:59PM +0000, Ajay Gupta wrote:
> Hi Heikki,
> 
> > -----Original Message-----
> > From: linux-usb-owner@vger.kernel.org <linux-usb-owner@vger.kernel.org>
> > On Behalf Of Heikki Krogerus
> > Sent: Monday, September 23, 2019 6:31 AM
> > To: Ajay Gupta <ajayg@nvidia.com>
> > Cc: linux-usb@vger.kernel.org
> > Subject: [RFC PATCH] usb: typec: ucsi: ccg: Remove run_isr flag
> > 
> > There is no need to try to prevent the extra ucsi_notify() that runtime
> > resuming the device will cause.
> > 
> > This fixes potential deadlock. Both ccg_read() and
> > ccg_write() are called with the mutex already taken at least from
> > ccg_send_command(). In ccg_read() and ccg_write, the mutex is only acquired
> > so that run_isr flag can be set.
> > 
> > Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
> > ---
> > Hi Ajay,
> > 
> > Before going forward with this I would like to get confirmation from you that it
> > is OK, and that I'm not missing anything. 
> Thanks for this. I mixed up firmware flashing and IO path by using same lock.
> 
> >I did not see any real purpose for that run_isr flag. 
> > The only thing that I can see it preventing is an extra ucsi_notify()
> > call caused by the waking of the controller, but that should not be a problem.
> > Is there any other reason why the flag is there?
> ucsi_ccg_runtime_resume() will get called after pm_runtime_get_sync(uc->dev);
> in ccg_read() and ccg_write(). If we allow extra ucsi_notify() then ccg_read() and
> ccg_write() will get called again from ucsi_notfiy() through ucsi_sync(). This will
> keep on happening in a loop and the controller will remain in D0 always and 
> runtime pm will never be done.

OK, so the problem is actually that we are allowing the controller to
suspend while we are still in the middle of waiting for completion to
a command, right? That should not be allowed to happen.

Instead of hiding the symptom from the problem with that flag, the
driver needs to initially define autosuspend delay that is as long as
UCSI_TIMEOUT_MS (or longer). That should be enough to fix the problem,
though it still leaves a small change of hitting it in some corner
cases. After I'm done with the I/O rewrite, we can guarantee that the
controller stays in D0 for as long as it takes to process the whole
command.

But as the initial fix, let's just use the autosuspend delay.

Can you test the attached patch?

thanks,
Ajay Gupta Sept. 24, 2019, 5:10 p.m. UTC | #3
Hi Heikki,

> -----Original Message-----
> From: linux-usb-owner@vger.kernel.org <linux-usb-owner@vger.kernel.org>
> On Behalf Of Heikki Krogerus
> Sent: Tuesday, September 24, 2019 1:25 AM
> To: Ajay Gupta <ajayg@nvidia.com>
> Cc: linux-usb@vger.kernel.org
> Subject: Re: [RFC PATCH] usb: typec: ucsi: ccg: Remove run_isr flag
> 
> Hi Ajay,
> 
> On Mon, Sep 23, 2019 at 06:15:59PM +0000, Ajay Gupta wrote:
> > Hi Heikki,
> >
> > > -----Original Message-----
> > > From: linux-usb-owner@vger.kernel.org
> > > <linux-usb-owner@vger.kernel.org> On Behalf Of Heikki Krogerus
> > > Sent: Monday, September 23, 2019 6:31 AM
> > > To: Ajay Gupta <ajayg@nvidia.com>
> > > Cc: linux-usb@vger.kernel.org
> > > Subject: [RFC PATCH] usb: typec: ucsi: ccg: Remove run_isr flag
> > >
> > > There is no need to try to prevent the extra ucsi_notify() that
> > > runtime resuming the device will cause.
> > >
> > > This fixes potential deadlock. Both ccg_read() and
> > > ccg_write() are called with the mutex already taken at least from
> > > ccg_send_command(). In ccg_read() and ccg_write, the mutex is only
> > > acquired so that run_isr flag can be set.
> > >
> > > Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
> > > ---
> > > Hi Ajay,
> > >
> > > Before going forward with this I would like to get confirmation from
> > > you that it is OK, and that I'm not missing anything.
> > Thanks for this. I mixed up firmware flashing and IO path by using same lock.
> >
> > >I did not see any real purpose for that run_isr flag.
> > > The only thing that I can see it preventing is an extra
> > >ucsi_notify()  call caused by the waking of the controller, but that should
> not be a problem.
> > > Is there any other reason why the flag is there?
> > ucsi_ccg_runtime_resume() will get called after
> > pm_runtime_get_sync(uc->dev); in ccg_read() and ccg_write(). If we
> > allow extra ucsi_notify() then ccg_read() and
> > ccg_write() will get called again from ucsi_notfiy() through
> > ucsi_sync(). This will keep on happening in a loop and the controller
> > will remain in D0 always and runtime pm will never be done.
> 
> OK, so the problem is actually that we are allowing the controller to suspend
> while we are still in the middle of waiting for completion to a command, right?
> That should not be allowed to happen.
True.
 
> Instead of hiding the symptom from the problem with that flag, the driver
> needs to initially define autosuspend delay that is as long as
> UCSI_TIMEOUT_MS (or longer). That should be enough to fix the problem,
> though it still leaves a small change of hitting it in some corner cases. After I'm
> done with the I/O rewrite, we can guarantee that the controller stays in D0 for
> as long as it takes to process the whole command.
> 
> But as the initial fix, let's just use the autosuspend delay.
> 
> Can you test the attached patch?
Thanks for the patch. I tested both runtime PM and firmware flashing
on GPUs with old firmware and latest firmware and it works fine.

Thanks
>nvpublic
> 
> thanks,
> 
> --
> heikki
diff mbox series

Patch

diff --git a/drivers/usb/typec/ucsi/ucsi_ccg.c b/drivers/usb/typec/ucsi/ucsi_ccg.c
index 907e20e1a71e..167cb6367198 100644
--- a/drivers/usb/typec/ucsi/ucsi_ccg.c
+++ b/drivers/usb/typec/ucsi/ucsi_ccg.c
@@ -195,7 +195,6 @@  struct ucsi_ccg {
 
 	/* fw build with vendor information */
 	u16 fw_build;
-	bool run_isr; /* flag to call ISR routine during resume */
 	struct work_struct pm_work;
 };
 
@@ -224,18 +223,6 @@  static int ccg_read(struct ucsi_ccg *uc, u16 rab, u8 *data, u32 len)
 	if (quirks && quirks->max_read_len)
 		max_read_len = quirks->max_read_len;
 
-	if (uc->fw_build == CCG_FW_BUILD_NVIDIA &&
-	    uc->fw_version <= CCG_OLD_FW_VERSION) {
-		mutex_lock(&uc->lock);
-		/*
-		 * Do not schedule pm_work to run ISR in
-		 * ucsi_ccg_runtime_resume() after pm_runtime_get_sync()
-		 * since we are already in ISR path.
-		 */
-		uc->run_isr = false;
-		mutex_unlock(&uc->lock);
-	}
-
 	pm_runtime_get_sync(uc->dev);
 	while (rem_len > 0) {
 		msgs[1].buf = &data[len - rem_len];
@@ -278,18 +265,6 @@  static int ccg_write(struct ucsi_ccg *uc, u16 rab, u8 *data, u32 len)
 	msgs[0].len = len + sizeof(rab);
 	msgs[0].buf = buf;
 
-	if (uc->fw_build == CCG_FW_BUILD_NVIDIA &&
-	    uc->fw_version <= CCG_OLD_FW_VERSION) {
-		mutex_lock(&uc->lock);
-		/*
-		 * Do not schedule pm_work to run ISR in
-		 * ucsi_ccg_runtime_resume() after pm_runtime_get_sync()
-		 * since we are already in ISR path.
-		 */
-		uc->run_isr = false;
-		mutex_unlock(&uc->lock);
-	}
-
 	pm_runtime_get_sync(uc->dev);
 	status = i2c_transfer(client->adapter, msgs, ARRAY_SIZE(msgs));
 	if (status < 0) {
@@ -1130,7 +1105,6 @@  static int ucsi_ccg_probe(struct i2c_client *client,
 	uc->ppm.sync = ucsi_ccg_sync;
 	uc->dev = dev;
 	uc->client = client;
-	uc->run_isr = true;
 	mutex_init(&uc->lock);
 	INIT_WORK(&uc->work, ccg_update_firmware);
 	INIT_WORK(&uc->pm_work, ccg_pm_workaround_work);
@@ -1229,7 +1203,6 @@  static int ucsi_ccg_runtime_resume(struct device *dev)
 {
 	struct i2c_client *client = to_i2c_client(dev);
 	struct ucsi_ccg *uc = i2c_get_clientdata(client);
-	bool schedule = true;
 
 	/*
 	 * Firmware version 3.1.10 or earlier, built for NVIDIA has known issue
@@ -1237,17 +1210,8 @@  static int ucsi_ccg_runtime_resume(struct device *dev)
 	 * Schedule a work to call ISR as a workaround.
 	 */
 	if (uc->fw_build == CCG_FW_BUILD_NVIDIA &&
-	    uc->fw_version <= CCG_OLD_FW_VERSION) {
-		mutex_lock(&uc->lock);
-		if (!uc->run_isr) {
-			uc->run_isr = true;
-			schedule = false;
-		}
-		mutex_unlock(&uc->lock);
-
-		if (schedule)
-			schedule_work(&uc->pm_work);
-	}
+	    uc->fw_version <= CCG_OLD_FW_VERSION)
+		schedule_work(&uc->pm_work);
 
 	return 0;
 }