diff mbox series

platform/chrome: cros_ec_uart: properly fix race condition

Message ID 20240410182618.169042-2-noah@noahloomans.com (mailing list archive)
State Accepted
Commit 5e700b384ec13f5bcac9855cb28fcc674f1d3593
Headers show
Series platform/chrome: cros_ec_uart: properly fix race condition | expand

Commit Message

Noah Loomans April 10, 2024, 6:26 p.m. UTC
The cros_ec_uart_probe() function calls devm_serdev_device_open() before
it calls serdev_device_set_client_ops(). This can trigger a NULL pointer
dereference:

    BUG: kernel NULL pointer dereference, address: 0000000000000000
    ...
    CPU: 5 PID: 103 Comm: kworker/u16:3 Not tainted 6.8.4-zen1-1-zen #1 4a88f2661038c2a3bb69aa70fb41a5735338823c
    Hardware name: Google Morphius/Morphius, BIOS MrChromebox-4.22.2-1-g2a93624aebf 01/22/2024
    Workqueue: events_unbound flush_to_ldisc
    RIP: 0010:ttyport_receive_buf+0x3f/0xf0
    ...
    Call Trace:
     <TASK>
     ? __die+0x10f/0x120
     ? page_fault_oops+0x171/0x4e0
     ? srso_return_thunk+0x5/0x5f
     ? exc_page_fault+0x7f/0x180
     ? asm_exc_page_fault+0x26/0x30
     ? ttyport_receive_buf+0x3f/0xf0
     flush_to_ldisc+0x9b/0x1c0
     process_one_work+0x17b/0x340
     worker_thread+0x301/0x490
     ? __pfx_worker_thread+0x10/0x10
     kthread+0xe8/0x120
     ? __pfx_kthread+0x10/0x10
     ret_from_fork+0x34/0x50
     ? __pfx_kthread+0x10/0x10
     ret_from_fork_asm+0x1b/0x30
     </TASK>

A simplified version of crashing code is as follows:

    static inline size_t serdev_controller_receive_buf(struct serdev_controller *ctrl,
                                                      const u8 *data,
                                                      size_t count)
    {
            struct serdev_device *serdev = ctrl->serdev;

            if (!serdev || !serdev->ops->receive_buf) // CRASH!
                return 0;

            return serdev->ops->receive_buf(serdev, data, count);
    }

    static size_t ttyport_receive_buf(struct tty_port *port, const u8 *cp,
                                      const u8 *fp, size_t count)
    {
            struct serdev_controller *ctrl = port->client_data;
            [...]

            if (!test_bit(SERPORT_ACTIVE, &serport->flags))
                    return 0;

            ret = serdev_controller_receive_buf(ctrl, cp, count);

            [...]
            return ret;
    }

It assumes that if SERPORT_ACTIVE is set and serdev exists, serdev->ops
will also exist. This conflicts with the existing cros_ec_uart_probe()
logic, as it first calls devm_serdev_device_open() (which sets
SERPORT_ACTIVE), and only later sets serdev->ops via
serdev_device_set_client_ops().

Commit 01f95d42b8f4 ("platform/chrome: cros_ec_uart: fix race
condition") attempted to fix a similar race condition, but in doing
so, made the window in which this race condition can occur much
wider.

Attempt to fix the race condition again, making sure we are fully set up
before calling devm_serdev_device_open().

Fixes: 01f95d42b8f4 ("platform/chrome: cros_ec_uart: fix race condition")
Cc: stable@vger.kernel.org
Signed-off-by: Noah Loomans <noah@noahloomans.com>
---

This is my first time contributing to Linux, I hope this is a good
patch. Feedback on how to improve is welcome!

 drivers/platform/chrome/cros_ec_uart.c | 28 +++++++++++++-------------
 1 file changed, 14 insertions(+), 14 deletions(-)

Comments

Guenter Roeck April 10, 2024, 7:48 p.m. UTC | #1
On Wed, Apr 10, 2024 at 11:29 AM Noah Loomans <noah@noahloomans.com> wrote:
>
> The cros_ec_uart_probe() function calls devm_serdev_device_open() before
> it calls serdev_device_set_client_ops(). This can trigger a NULL pointer
> dereference:
>
>     BUG: kernel NULL pointer dereference, address: 0000000000000000
>     ...
>     CPU: 5 PID: 103 Comm: kworker/u16:3 Not tainted 6.8.4-zen1-1-zen #1 4a88f2661038c2a3bb69aa70fb41a5735338823c
>     Hardware name: Google Morphius/Morphius, BIOS MrChromebox-4.22.2-1-g2a93624aebf 01/22/2024
>     Workqueue: events_unbound flush_to_ldisc
>     RIP: 0010:ttyport_receive_buf+0x3f/0xf0
>     ...
>     Call Trace:
>      <TASK>
>      ? __die+0x10f/0x120
>      ? page_fault_oops+0x171/0x4e0
>      ? srso_return_thunk+0x5/0x5f
>      ? exc_page_fault+0x7f/0x180
>      ? asm_exc_page_fault+0x26/0x30
>      ? ttyport_receive_buf+0x3f/0xf0
>      flush_to_ldisc+0x9b/0x1c0
>      process_one_work+0x17b/0x340
>      worker_thread+0x301/0x490
>      ? __pfx_worker_thread+0x10/0x10
>      kthread+0xe8/0x120
>      ? __pfx_kthread+0x10/0x10
>      ret_from_fork+0x34/0x50
>      ? __pfx_kthread+0x10/0x10
>      ret_from_fork_asm+0x1b/0x30
>      </TASK>
>
> A simplified version of crashing code is as follows:
>
>     static inline size_t serdev_controller_receive_buf(struct serdev_controller *ctrl,
>                                                       const u8 *data,
>                                                       size_t count)
>     {
>             struct serdev_device *serdev = ctrl->serdev;
>
>             if (!serdev || !serdev->ops->receive_buf) // CRASH!
>                 return 0;
>
>             return serdev->ops->receive_buf(serdev, data, count);
>     }
>
>     static size_t ttyport_receive_buf(struct tty_port *port, const u8 *cp,
>                                       const u8 *fp, size_t count)
>     {
>             struct serdev_controller *ctrl = port->client_data;
>             [...]
>
>             if (!test_bit(SERPORT_ACTIVE, &serport->flags))
>                     return 0;
>
>             ret = serdev_controller_receive_buf(ctrl, cp, count);
>
>             [...]
>             return ret;
>     }
>
> It assumes that if SERPORT_ACTIVE is set and serdev exists, serdev->ops
> will also exist. This conflicts with the existing cros_ec_uart_probe()
> logic, as it first calls devm_serdev_device_open() (which sets
> SERPORT_ACTIVE), and only later sets serdev->ops via
> serdev_device_set_client_ops().
>
> Commit 01f95d42b8f4 ("platform/chrome: cros_ec_uart: fix race
> condition") attempted to fix a similar race condition, but while doing
> so, made the window of error for this race condition to happen much
> wider.
>
> Attempt to fix the race condition again, making sure we fully setup
> before calling devm_serdev_device_open().
>
> Fixes: 01f95d42b8f4 ("platform/chrome: cros_ec_uart: fix race condition")
> Cc: stable@vger.kernel.org
> Signed-off-by: Noah Loomans <noah@noahloomans.com>
> ---
>
> This is my first time contributing to Linux, I hope this is a good
> patch. Feedback on how to improve is welcome!
>

The commit message is a bit long, but the patch itself looks good to me.

Reviewed-by: Guenter Roeck <groeck@chromium.org>

Guenter

>  drivers/platform/chrome/cros_ec_uart.c | 28 +++++++++++++-------------
>  1 file changed, 14 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/platform/chrome/cros_ec_uart.c b/drivers/platform/chrome/cros_ec_uart.c
> index 8ea867c2a01a..62bc24f6dcc7 100644
> --- a/drivers/platform/chrome/cros_ec_uart.c
> +++ b/drivers/platform/chrome/cros_ec_uart.c
> @@ -263,12 +263,6 @@ static int cros_ec_uart_probe(struct serdev_device *serdev)
>         if (!ec_dev)
>                 return -ENOMEM;
>
> -       ret = devm_serdev_device_open(dev, serdev);
> -       if (ret) {
> -               dev_err(dev, "Unable to open UART device");
> -               return ret;
> -       }
> -
>         serdev_device_set_drvdata(serdev, ec_dev);
>         init_waitqueue_head(&ec_uart->response.wait_queue);
>
> @@ -280,14 +274,6 @@ static int cros_ec_uart_probe(struct serdev_device *serdev)
>                 return ret;
>         }
>
> -       ret = serdev_device_set_baudrate(serdev, ec_uart->baudrate);
> -       if (ret < 0) {
> -               dev_err(dev, "Failed to set up host baud rate (%d)", ret);
> -               return ret;
> -       }
> -
> -       serdev_device_set_flow_control(serdev, ec_uart->flowcontrol);
> -
>         /* Initialize ec_dev for cros_ec  */
>         ec_dev->phys_name = dev_name(dev);
>         ec_dev->dev = dev;
> @@ -301,6 +287,20 @@ static int cros_ec_uart_probe(struct serdev_device *serdev)
>
>         serdev_device_set_client_ops(serdev, &cros_ec_uart_client_ops);
>
> +       ret = devm_serdev_device_open(dev, serdev);
> +       if (ret) {
> +               dev_err(dev, "Unable to open UART device");
> +               return ret;
> +       }
> +
> +       ret = serdev_device_set_baudrate(serdev, ec_uart->baudrate);
> +       if (ret < 0) {
> +               dev_err(dev, "Failed to set up host baud rate (%d)", ret);
> +               return ret;
> +       }
> +
> +       serdev_device_set_flow_control(serdev, ec_uart->flowcontrol);
> +
>         return cros_ec_register(ec_dev);
>  }
>
> --
> 2.44.0
>
Noah Loomans April 10, 2024, 10:06 p.m. UTC | #2
On 2024-04-10 at 21:48 UTC+02, Guenter Roeck wrote:
> On Wed, Apr 10, 2024 at 11:29 AM Noah Loomans <noah@noahloomans.com> wrote:
> > This is my first time contributing to Linux, I hope this is a good
> > patch. Feedback on how to improve is welcome!
>
> The commit message is a bit long, but the patch itself looks good to me.

Hmm yeah it's a bit on the long side. I'm not sure what could be removed
though, it all seems relevant for understanding the bug and the fix.

> Reviewed-by: Guenter Roeck <groeck@chromium.org>

Thanks!

-Noah
Tzung-Bi Shih April 11, 2024, 2:18 a.m. UTC | #3
On Thu, Apr 11, 2024 at 12:06:33AM +0200, Noah Loomans wrote:
> On 2024-04-10 at 21:48 UTC+02, Guenter Roeck wrote:
> > On Wed, Apr 10, 2024 at 11:29 AM Noah Loomans <noah@noahloomans.com> wrote:
> > > This is my first time contributing to Linux, I hope this is a good
> > > patch. Feedback on how to improve is welcome!
> >
> > The commit message is a bit long, but the patch itself looks good to me.
> 
> Hmm yeah it's a bit on a long side. I'm not sure what could be removed
> though, it all seems relevant for understanding the bug and the fix.

Applied with shortening the message slightly.
Guenter Roeck April 11, 2024, 2:26 a.m. UTC | #4
On Wed, Apr 10, 2024 at 7:18 PM Tzung-Bi Shih <tzungbi@kernel.org> wrote:
>
> On Thu, Apr 11, 2024 at 12:06:33AM +0200, Noah Loomans wrote:
> > On 2024-04-10 at 21:48 UTC+02, Guenter Roeck wrote:
> > > On Wed, Apr 10, 2024 at 11:29 AM Noah Loomans <noah@noahloomans.com> wrote:
> > > > This is my first time contributing to Linux, I hope this is a good
> > > > patch. Feedback on how to improve is welcome!
> > >
> > > The commit message is a bit long, but the patch itself looks good to me.
> >
> > Hmm yeah it's a bit on a long side. I'm not sure what could be removed
> > though, it all seems relevant for understanding the bug and the fix.
>
> Applied with shortening the message slightly.

We might also consider applying the patch to all ChromeOS branches
directly (not waiting for upstream); we do see a number of crashes
because of it.

Guenter
Noah Loomans April 11, 2024, 4:08 p.m. UTC | #5
On 2024-04-11 at 04:18 UTC+02, Tzung-Bi Shih wrote:
> Applied with shortening the message slightly.

Thanks! And I see you already sent a pull request to Linus. That's nice,
didn't expect that to happen so quickly.
Noah Loomans April 11, 2024, 4:08 p.m. UTC | #6
On 2024-04-11 at 04:26 UTC+02, Guenter Roeck wrote:
> We might also consider applying the patch to all ChromeOS branches
> directly (not waiting for upstream); we do see a number of crashes
> because of it.

Heh, I did wonder how Chromebooks running Chrome OS avoided this issue.
I guess the answer is simple: they didn't :)
patchwork-bot+chrome-platform@kernel.org April 24, 2024, 8:51 a.m. UTC | #7
Hello:

This patch was applied to chrome-platform/linux.git (for-kernelci)
by Tzung-Bi Shih <tzungbi@kernel.org>:

On Wed, 10 Apr 2024 20:26:19 +0200 you wrote:
> The cros_ec_uart_probe() function calls devm_serdev_device_open() before
> it calls serdev_device_set_client_ops(). This can trigger a NULL pointer
> dereference:
> 
>     BUG: kernel NULL pointer dereference, address: 0000000000000000
>     ...
>     CPU: 5 PID: 103 Comm: kworker/u16:3 Not tainted 6.8.4-zen1-1-zen #1 4a88f2661038c2a3bb69aa70fb41a5735338823c
>     Hardware name: Google Morphius/Morphius, BIOS MrChromebox-4.22.2-1-g2a93624aebf 01/22/2024
>     Workqueue: events_unbound flush_to_ldisc
>     RIP: 0010:ttyport_receive_buf+0x3f/0xf0
>     ...
>     Call Trace:
>      <TASK>
>      ? __die+0x10f/0x120
>      ? page_fault_oops+0x171/0x4e0
>      ? srso_return_thunk+0x5/0x5f
>      ? exc_page_fault+0x7f/0x180
>      ? asm_exc_page_fault+0x26/0x30
>      ? ttyport_receive_buf+0x3f/0xf0
>      flush_to_ldisc+0x9b/0x1c0
>      process_one_work+0x17b/0x340
>      worker_thread+0x301/0x490
>      ? __pfx_worker_thread+0x10/0x10
>      kthread+0xe8/0x120
>      ? __pfx_kthread+0x10/0x10
>      ret_from_fork+0x34/0x50
>      ? __pfx_kthread+0x10/0x10
>      ret_from_fork_asm+0x1b/0x30
>      </TASK>
> 
> [...]

Here is the summary with links:
  - platform/chrome: cros_ec_uart: properly fix race condition
    https://git.kernel.org/chrome-platform/c/5e700b384ec1

You are awesome, thank you!
patchwork-bot+chrome-platform@kernel.org April 24, 2024, 8:52 a.m. UTC | #8
Hello:

This patch was applied to chrome-platform/linux.git (for-next)
by Tzung-Bi Shih <tzungbi@kernel.org>:

On Wed, 10 Apr 2024 20:26:19 +0200 you wrote:
> The cros_ec_uart_probe() function calls devm_serdev_device_open() before
> it calls serdev_device_set_client_ops(). This can trigger a NULL pointer
> dereference:
> 
>     BUG: kernel NULL pointer dereference, address: 0000000000000000
>     ...
>     CPU: 5 PID: 103 Comm: kworker/u16:3 Not tainted 6.8.4-zen1-1-zen #1 4a88f2661038c2a3bb69aa70fb41a5735338823c
>     Hardware name: Google Morphius/Morphius, BIOS MrChromebox-4.22.2-1-g2a93624aebf 01/22/2024
>     Workqueue: events_unbound flush_to_ldisc
>     RIP: 0010:ttyport_receive_buf+0x3f/0xf0
>     ...
>     Call Trace:
>      <TASK>
>      ? __die+0x10f/0x120
>      ? page_fault_oops+0x171/0x4e0
>      ? srso_return_thunk+0x5/0x5f
>      ? exc_page_fault+0x7f/0x180
>      ? asm_exc_page_fault+0x26/0x30
>      ? ttyport_receive_buf+0x3f/0xf0
>      flush_to_ldisc+0x9b/0x1c0
>      process_one_work+0x17b/0x340
>      worker_thread+0x301/0x490
>      ? __pfx_worker_thread+0x10/0x10
>      kthread+0xe8/0x120
>      ? __pfx_kthread+0x10/0x10
>      ret_from_fork+0x34/0x50
>      ? __pfx_kthread+0x10/0x10
>      ret_from_fork_asm+0x1b/0x30
>      </TASK>
> 
> [...]

Here is the summary with links:
  - platform/chrome: cros_ec_uart: properly fix race condition
    https://git.kernel.org/chrome-platform/c/5e700b384ec1

You are awesome, thank you!
diff mbox series

Patch

diff --git a/drivers/platform/chrome/cros_ec_uart.c b/drivers/platform/chrome/cros_ec_uart.c
index 8ea867c2a01a..62bc24f6dcc7 100644
--- a/drivers/platform/chrome/cros_ec_uart.c
+++ b/drivers/platform/chrome/cros_ec_uart.c
@@ -263,12 +263,6 @@  static int cros_ec_uart_probe(struct serdev_device *serdev)
 	if (!ec_dev)
 		return -ENOMEM;
 
-	ret = devm_serdev_device_open(dev, serdev);
-	if (ret) {
-		dev_err(dev, "Unable to open UART device");
-		return ret;
-	}
-
 	serdev_device_set_drvdata(serdev, ec_dev);
 	init_waitqueue_head(&ec_uart->response.wait_queue);
 
@@ -280,14 +274,6 @@  static int cros_ec_uart_probe(struct serdev_device *serdev)
 		return ret;
 	}
 
-	ret = serdev_device_set_baudrate(serdev, ec_uart->baudrate);
-	if (ret < 0) {
-		dev_err(dev, "Failed to set up host baud rate (%d)", ret);
-		return ret;
-	}
-
-	serdev_device_set_flow_control(serdev, ec_uart->flowcontrol);
-
 	/* Initialize ec_dev for cros_ec  */
 	ec_dev->phys_name = dev_name(dev);
 	ec_dev->dev = dev;
@@ -301,6 +287,20 @@  static int cros_ec_uart_probe(struct serdev_device *serdev)
 
 	serdev_device_set_client_ops(serdev, &cros_ec_uart_client_ops);
 
+	ret = devm_serdev_device_open(dev, serdev);
+	if (ret) {
+		dev_err(dev, "Unable to open UART device");
+		return ret;
+	}
+
+	ret = serdev_device_set_baudrate(serdev, ec_uart->baudrate);
+	if (ret < 0) {
+		dev_err(dev, "Failed to set up host baud rate (%d)", ret);
+		return ret;
+	}
+
+	serdev_device_set_flow_control(serdev, ec_uart->flowcontrol);
+
 	return cros_ec_register(ec_dev);
 }