diff mbox series

[v2,1/1] PM: Start asynchronous suspend threads upfront

Message ID 20240618121327.2177-2-kaiyen.chang@intel.com (mailing list archive)
State New
Headers show
Series PM: Start asynchronous suspend threads upfront | expand

Commit Message

Chang, Kaiyen June 18, 2024, 12:13 p.m. UTC
Currently, when performing a suspend operation, all devices on the
dpm_list must wait for the "synchronous" devices that are listed
after them to complete before the main suspend thread can start
their suspend routines, even if they are "asynchronous". If the
suspend routine of a synchronous device must enter a waiting state
for some reason, it will cause the main suspend thread to go to
sleep, thereby delaying the processing of all subsequent devices,
including asynchronous ones, ultimately extending the overall
suspend time.

By starting the asynchronous suspend threads upfront, we can allow
the system to handle the suspend routines of these asynchronous
devices in parallel, without waiting for the suspend routines of
the synchronous devices listed after them to complete, and without
breaking their order with respect to their parents and children.
This way, even if the main suspend thread is blocked, these
asynchronous suspend threads can continue to run without being
affected.

Signed-off-by: Kaiyen Chang <kaiyen.chang@intel.com>
---
Change from v1: Fix some unclear parts in the commit messages.
---
 drivers/base/power/main.c | 90 +++++++++++++++++++++++++--------------
 1 file changed, 57 insertions(+), 33 deletions(-)

Comments

Rafael J. Wysocki June 18, 2024, 5:27 p.m. UTC | #1
On Tue, Jun 18, 2024 at 2:13 PM Kaiyen Chang <kaiyen.chang@intel.com> wrote:
>
> Currently, when performing a suspend operation, all devices on the
> dpm_list must wait for the "synchronous" devices that are listed
> after them to complete before the main suspend thread can start
> their suspend routines, even if they are "asynchronous". If the
> suspend routine of a synchronous device must enter a waiting state
> for some reason, it will cause the main suspend thread to go to
> sleep, thereby delaying the processing of all subsequent devices,
> including asynchronous ones, ultimately extending the overall
> suspend time.
>
> By starting the asynchronous suspend threads upfront, we can allow
> the system to handle the suspend routines of these asynchronous
> devices in parallel, without waiting for the suspend routines of
> the synchronous devices listed after them to complete, and without
> breaking their order with respect to their parents and children.
> This way, even if the main suspend thread is blocked, these
> asynchronous suspend threads can continue to run without being
> affected.
>
> Signed-off-by: Kaiyen Chang <kaiyen.chang@intel.com>

How exactly has this been tested?

In the past, changes going in this direction caused system suspend to
hang on at least some platforms (including the ones in my office).

> ---
> Change from v1: Fix some unclear parts in the commit messages.
> ---
>  drivers/base/power/main.c | 90 +++++++++++++++++++++++++--------------
>  1 file changed, 57 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
> index 4a67e83300e1..6ddd6ef36625 100644
> --- a/drivers/base/power/main.c
> +++ b/drivers/base/power/main.c
> @@ -1283,6 +1283,7 @@ static void async_suspend_noirq(void *data, async_cookie_t cookie)
>
>  static int dpm_noirq_suspend_devices(pm_message_t state)
>  {
> +       struct device *dev;
>         ktime_t starttime = ktime_get();
>         int error = 0;
>
> @@ -1293,26 +1294,33 @@ static int dpm_noirq_suspend_devices(pm_message_t state)
>
>         mutex_lock(&dpm_list_mtx);
>
> +       /*
> +        * Trigger the suspend of "async" devices upfront so they don't have to
> +        * wait for the "non-async" ones that don't depend on them.
> +        */
> +
> +       list_for_each_entry_reverse(dev, &dpm_late_early_list, power.entry)
> +               dpm_async_fn(dev, async_suspend_noirq);
> +
>         while (!list_empty(&dpm_late_early_list)) {
> -               struct device *dev = to_device(dpm_late_early_list.prev);
> +               dev = to_device(dpm_late_early_list.prev);
>
>                 list_move(&dev->power.entry, &dpm_noirq_list);
>
> -               if (dpm_async_fn(dev, async_suspend_noirq))
> -                       continue;
> -
> -               get_device(dev);
> +               if (!dev->power.async_in_progress) {
> +                       get_device(dev);
>
> -               mutex_unlock(&dpm_list_mtx);
> +                       mutex_unlock(&dpm_list_mtx);
>
> -               error = device_suspend_noirq(dev, state, false);
> +                       error = device_suspend_noirq(dev, state, false);
>
> -               put_device(dev);
> +                       put_device(dev);
>
> -               mutex_lock(&dpm_list_mtx);
> +                       mutex_lock(&dpm_list_mtx);
>
> -               if (error || async_error)
> -                       break;
> +                       if (error || async_error)
> +                               break;
> +               }
>         }
>
>         mutex_unlock(&dpm_list_mtx);
> @@ -1454,6 +1462,7 @@ static void async_suspend_late(void *data, async_cookie_t cookie)
>   */
>  int dpm_suspend_late(pm_message_t state)
>  {
> +       struct device *dev;
>         ktime_t starttime = ktime_get();
>         int error = 0;
>
> @@ -1466,26 +1475,33 @@ int dpm_suspend_late(pm_message_t state)
>
>         mutex_lock(&dpm_list_mtx);
>
> +       /*
> +        * Trigger the suspend of "async" devices upfront so they don't have to
> +        * wait for the "non-async" ones that don't depend on them.
> +        */
> +
> +       list_for_each_entry_reverse(dev, &dpm_suspended_list, power.entry)
> +               dpm_async_fn(dev, async_suspend_late);
> +
>         while (!list_empty(&dpm_suspended_list)) {
> -               struct device *dev = to_device(dpm_suspended_list.prev);
> +               dev = to_device(dpm_suspended_list.prev);
>
>                 list_move(&dev->power.entry, &dpm_late_early_list);
>
> -               if (dpm_async_fn(dev, async_suspend_late))
> -                       continue;
> -
> -               get_device(dev);
> +               if (!dev->power.async_in_progress) {
> +                       get_device(dev);
>
> -               mutex_unlock(&dpm_list_mtx);
> +                       mutex_unlock(&dpm_list_mtx);
>
> -               error = device_suspend_late(dev, state, false);
> +                       error = device_suspend_late(dev, state, false);
>
> -               put_device(dev);
> +                       put_device(dev);
>
> -               mutex_lock(&dpm_list_mtx);
> +                       mutex_lock(&dpm_list_mtx);
>
> -               if (error || async_error)
> -                       break;
> +                       if (error || async_error)
> +                               break;
> +               }
>         }
>
>         mutex_unlock(&dpm_list_mtx);
> @@ -1719,6 +1735,7 @@ static void async_suspend(void *data, async_cookie_t cookie)
>   */
>  int dpm_suspend(pm_message_t state)
>  {
> +       struct device *dev;
>         ktime_t starttime = ktime_get();
>         int error = 0;
>
> @@ -1733,26 +1750,33 @@ int dpm_suspend(pm_message_t state)
>
>         mutex_lock(&dpm_list_mtx);
>
> +       /*
> +        * Trigger the suspend of "async" devices upfront so they don't have to
> +        * wait for the "non-async" ones that don't depend on them.
> +        */
> +
> +       list_for_each_entry_reverse(dev, &dpm_prepared_list, power.entry)
> +               dpm_async_fn(dev, async_suspend);
> +
>         while (!list_empty(&dpm_prepared_list)) {
> -               struct device *dev = to_device(dpm_prepared_list.prev);
> +               dev = to_device(dpm_prepared_list.prev);
>
>                 list_move(&dev->power.entry, &dpm_suspended_list);
>
> -               if (dpm_async_fn(dev, async_suspend))
> -                       continue;
> -
> -               get_device(dev);
> +               if (!dev->power.async_in_progress) {
> +                       get_device(dev);
>
> -               mutex_unlock(&dpm_list_mtx);
> +                       mutex_unlock(&dpm_list_mtx);
>
> -               error = device_suspend(dev, state, false);
> +                       error = device_suspend(dev, state, false);
>
> -               put_device(dev);
> +                       put_device(dev);
>
> -               mutex_lock(&dpm_list_mtx);
> +                       mutex_lock(&dpm_list_mtx);
>
> -               if (error || async_error)
> -                       break;
> +                       if (error || async_error)
> +                               break;
> +               }
>         }
>
>         mutex_unlock(&dpm_list_mtx);
> --
> 2.34.1
>
Chang, Kaiyen June 20, 2024, 9:21 a.m. UTC | #2
On Tue, Jun 18, 2024 at 07:27:18PM +0200, Rafael J. Wysocki <rafael@kernel.org> wrote:
> 
> On Tue, Jun 18, 2024 at 2:13 PM Kaiyen Chang <kaiyen.chang@intel.com>
> wrote:
> >
> > Currently, when performing a suspend operation, all devices on the
> > dpm_list must wait for the "synchronous" devices that are listed after
> > them to complete before the main suspend thread can start their
> > suspend routines, even if they are "asynchronous". If the suspend
> > routine of a synchronous device must enter a waiting state for some
> > reason, it will cause the main suspend thread to go to sleep, thereby
> > delaying the processing of all subsequent devices, including
> > asynchronous ones, ultimately extending the overall suspend time.
> >
> > By starting the asynchronous suspend threads upfront, we can allow the
> > system to handle the suspend routines of these asynchronous devices in
> > parallel, without waiting for the suspend routines of the synchronous
> > devices listed after them to complete, and without breaking their
> > order with respect to their parents and children.
> > This way, even if the main suspend thread is blocked, these
> > asynchronous suspend threads can continue to run without being
> > affected.
> >
> > Signed-off-by: Kaiyen Chang <kaiyen.chang@intel.com>
> 
> How exactly has this been tested?
> 
> In the past, changes going in this direction caused system suspend to hang on
> at least some platforms (including the ones in my office).
> 

Hi Rafael, thanks for your reply.

1. We have currently verified this patch on the ADL-N Chromebook, running the suspend stress test 1000 times consecutively without encountering any issues.

2. Could you tell us what was the reason for suspend to hang back then?

The reason why I believe that we can start the async device's suspend threads upfront is that, during resume, a sync devices listed after certain async devices on the list do not actually rely on the implicit dependency provided by the order of the devices on the list. This is because if there is no consumer-supplier or parent-children relationship between these devices, the suspend routine of this sync device will be executed directly without waiting for the suspend routines of the async devices listed before it to complete. In other words, this sync device does not depend on the async devices listed before them. Therefore, during suspend, there is no need to ensure those async devices listed before the sync devices are awake before these sync devices complete their suspend routines. In summary, as long as we manage the dependencies between consumers/suppliers and parents/children relationships properly, we should be able to start the async devices' suspend routines upfront during suspend.

> > ---
> > Change from v1: Fix some unclear parts in the commit messages.
> > ---
> >  drivers/base/power/main.c | 90
> > +++++++++++++++++++++++++--------------
> >  1 file changed, 57 insertions(+), 33 deletions(-)
> >
> > diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
> > index 4a67e83300e1..6ddd6ef36625 100644
> > --- a/drivers/base/power/main.c
> > +++ b/drivers/base/power/main.c
> > @@ -1283,6 +1283,7 @@ static void async_suspend_noirq(void *data,
> > async_cookie_t cookie)
> >
> >  static int dpm_noirq_suspend_devices(pm_message_t state)  {
> > +       struct device *dev;
> >         ktime_t starttime = ktime_get();
> >         int error = 0;
> >
> > @@ -1293,26 +1294,33 @@ static int
> > dpm_noirq_suspend_devices(pm_message_t state)
> >
> >         mutex_lock(&dpm_list_mtx);
> >
> > +       /*
> > +        * Trigger the suspend of "async" devices upfront so they don't
> have to
> > +        * wait for the "non-async" ones that don't depend on them.
> > +        */
> > +
> > +       list_for_each_entry_reverse(dev, &dpm_late_early_list,
> power.entry)
> > +               dpm_async_fn(dev, async_suspend_noirq);
> > +
> >         while (!list_empty(&dpm_late_early_list)) {
> > -               struct device *dev =
> to_device(dpm_late_early_list.prev);
> > +               dev = to_device(dpm_late_early_list.prev);
> >
> >                 list_move(&dev->power.entry, &dpm_noirq_list);
> >
> > -               if (dpm_async_fn(dev, async_suspend_noirq))
> > -                       continue;
> > -
> > -               get_device(dev);
> > +               if (!dev->power.async_in_progress) {
> > +                       get_device(dev);
> >
> > -               mutex_unlock(&dpm_list_mtx);
> > +                       mutex_unlock(&dpm_list_mtx);
> >
> > -               error = device_suspend_noirq(dev, state, false);
> > +                       error = device_suspend_noirq(dev, state,
> > + false);
> >
> > -               put_device(dev);
> > +                       put_device(dev);
> >
> > -               mutex_lock(&dpm_list_mtx);
> > +                       mutex_lock(&dpm_list_mtx);
> >
> > -               if (error || async_error)
> > -                       break;
> > +                       if (error || async_error)
> > +                               break;
> > +               }
> >         }
> >
> >         mutex_unlock(&dpm_list_mtx);
> > @@ -1454,6 +1462,7 @@ static void async_suspend_late(void *data,
> async_cookie_t cookie)
> >   */
> >  int dpm_suspend_late(pm_message_t state)  {
> > +       struct device *dev;
> >         ktime_t starttime = ktime_get();
> >         int error = 0;
> >
> > @@ -1466,26 +1475,33 @@ int dpm_suspend_late(pm_message_t state)
> >
> >         mutex_lock(&dpm_list_mtx);
> >
> > +       /*
> > +        * Trigger the suspend of "async" devices upfront so they don't
> have to
> > +        * wait for the "non-async" ones that don't depend on them.
> > +        */
> > +
> > +       list_for_each_entry_reverse(dev, &dpm_suspended_list,
> power.entry)
> > +               dpm_async_fn(dev, async_suspend_late);
> > +
> >         while (!list_empty(&dpm_suspended_list)) {
> > -               struct device *dev =
> to_device(dpm_suspended_list.prev);
> > +               dev = to_device(dpm_suspended_list.prev);
> >
> >                 list_move(&dev->power.entry, &dpm_late_early_list);
> >
> > -               if (dpm_async_fn(dev, async_suspend_late))
> > -                       continue;
> > -
> > -               get_device(dev);
> > +               if (!dev->power.async_in_progress) {
> > +                       get_device(dev);
> >
> > -               mutex_unlock(&dpm_list_mtx);
> > +                       mutex_unlock(&dpm_list_mtx);
> >
> > -               error = device_suspend_late(dev, state, false);
> > +                       error = device_suspend_late(dev, state,
> > + false);
> >
> > -               put_device(dev);
> > +                       put_device(dev);
> >
> > -               mutex_lock(&dpm_list_mtx);
> > +                       mutex_lock(&dpm_list_mtx);
> >
> > -               if (error || async_error)
> > -                       break;
> > +                       if (error || async_error)
> > +                               break;
> > +               }
> >         }
> >
> >         mutex_unlock(&dpm_list_mtx);
> > @@ -1719,6 +1735,7 @@ static void async_suspend(void *data,
> async_cookie_t cookie)
> >   */
> >  int dpm_suspend(pm_message_t state)
> >  {
> > +       struct device *dev;
> >         ktime_t starttime = ktime_get();
> >         int error = 0;
> >
> > @@ -1733,26 +1750,33 @@ int dpm_suspend(pm_message_t state)
> >
> >         mutex_lock(&dpm_list_mtx);
> >
> > +       /*
> > +        * Trigger the suspend of "async" devices upfront so they don't
> have to
> > +        * wait for the "non-async" ones that don't depend on them.
> > +        */
> > +
> > +       list_for_each_entry_reverse(dev, &dpm_prepared_list,
> power.entry)
> > +               dpm_async_fn(dev, async_suspend);
> > +
> >         while (!list_empty(&dpm_prepared_list)) {
> > -               struct device *dev = to_device(dpm_prepared_list.prev);
> > +               dev = to_device(dpm_prepared_list.prev);
> >
> >                 list_move(&dev->power.entry, &dpm_suspended_list);
> >
> > -               if (dpm_async_fn(dev, async_suspend))
> > -                       continue;
> > -
> > -               get_device(dev);
> > +               if (!dev->power.async_in_progress) {
> > +                       get_device(dev);
> >
> > -               mutex_unlock(&dpm_list_mtx);
> > +                       mutex_unlock(&dpm_list_mtx);
> >
> > -               error = device_suspend(dev, state, false);
> > +                       error = device_suspend(dev, state, false);
> >
> > -               put_device(dev);
> > +                       put_device(dev);
> >
> > -               mutex_lock(&dpm_list_mtx);
> > +                       mutex_lock(&dpm_list_mtx);
> >
> > -               if (error || async_error)
> > -                       break;
> > +                       if (error || async_error)
> > +                               break;
> > +               }
> >         }
> >
> >         mutex_unlock(&dpm_list_mtx);
> > --
> > 2.34.1
> >
diff mbox series

Patch

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 4a67e83300e1..6ddd6ef36625 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -1283,6 +1283,7 @@  static void async_suspend_noirq(void *data, async_cookie_t cookie)
 
 static int dpm_noirq_suspend_devices(pm_message_t state)
 {
+	struct device *dev;
 	ktime_t starttime = ktime_get();
 	int error = 0;
 
@@ -1293,26 +1294,33 @@  static int dpm_noirq_suspend_devices(pm_message_t state)
 
 	mutex_lock(&dpm_list_mtx);
 
+	/*
+	 * Trigger the suspend of "async" devices upfront so they don't have to
+	 * wait for the "non-async" ones that don't depend on them.
+	 */
+
+	list_for_each_entry_reverse(dev, &dpm_late_early_list, power.entry)
+		dpm_async_fn(dev, async_suspend_noirq);
+
 	while (!list_empty(&dpm_late_early_list)) {
-		struct device *dev = to_device(dpm_late_early_list.prev);
+		dev = to_device(dpm_late_early_list.prev);
 
 		list_move(&dev->power.entry, &dpm_noirq_list);
 
-		if (dpm_async_fn(dev, async_suspend_noirq))
-			continue;
-
-		get_device(dev);
+		if (!dev->power.async_in_progress) {
+			get_device(dev);
 
-		mutex_unlock(&dpm_list_mtx);
+			mutex_unlock(&dpm_list_mtx);
 
-		error = device_suspend_noirq(dev, state, false);
+			error = device_suspend_noirq(dev, state, false);
 
-		put_device(dev);
+			put_device(dev);
 
-		mutex_lock(&dpm_list_mtx);
+			mutex_lock(&dpm_list_mtx);
 
-		if (error || async_error)
-			break;
+			if (error || async_error)
+				break;
+		}
 	}
 
 	mutex_unlock(&dpm_list_mtx);
@@ -1454,6 +1462,7 @@  static void async_suspend_late(void *data, async_cookie_t cookie)
  */
 int dpm_suspend_late(pm_message_t state)
 {
+	struct device *dev;
 	ktime_t starttime = ktime_get();
 	int error = 0;
 
@@ -1466,26 +1475,33 @@  int dpm_suspend_late(pm_message_t state)
 
 	mutex_lock(&dpm_list_mtx);
 
+	/*
+	 * Trigger the suspend of "async" devices upfront so they don't have to
+	 * wait for the "non-async" ones that don't depend on them.
+	 */
+
+	list_for_each_entry_reverse(dev, &dpm_suspended_list, power.entry)
+		dpm_async_fn(dev, async_suspend_late);
+
 	while (!list_empty(&dpm_suspended_list)) {
-		struct device *dev = to_device(dpm_suspended_list.prev);
+		dev = to_device(dpm_suspended_list.prev);
 
 		list_move(&dev->power.entry, &dpm_late_early_list);
 
-		if (dpm_async_fn(dev, async_suspend_late))
-			continue;
-
-		get_device(dev);
+		if (!dev->power.async_in_progress) {
+			get_device(dev);
 
-		mutex_unlock(&dpm_list_mtx);
+			mutex_unlock(&dpm_list_mtx);
 
-		error = device_suspend_late(dev, state, false);
+			error = device_suspend_late(dev, state, false);
 
-		put_device(dev);
+			put_device(dev);
 
-		mutex_lock(&dpm_list_mtx);
+			mutex_lock(&dpm_list_mtx);
 
-		if (error || async_error)
-			break;
+			if (error || async_error)
+				break;
+		}
 	}
 
 	mutex_unlock(&dpm_list_mtx);
@@ -1719,6 +1735,7 @@  static void async_suspend(void *data, async_cookie_t cookie)
  */
 int dpm_suspend(pm_message_t state)
 {
+	struct device *dev;
 	ktime_t starttime = ktime_get();
 	int error = 0;
 
@@ -1733,26 +1750,33 @@  int dpm_suspend(pm_message_t state)
 
 	mutex_lock(&dpm_list_mtx);
 
+	/*
+	 * Trigger the suspend of "async" devices upfront so they don't have to
+	 * wait for the "non-async" ones that don't depend on them.
+	 */
+
+	list_for_each_entry_reverse(dev, &dpm_prepared_list, power.entry)
+		dpm_async_fn(dev, async_suspend);
+
 	while (!list_empty(&dpm_prepared_list)) {
-		struct device *dev = to_device(dpm_prepared_list.prev);
+		dev = to_device(dpm_prepared_list.prev);
 
 		list_move(&dev->power.entry, &dpm_suspended_list);
 
-		if (dpm_async_fn(dev, async_suspend))
-			continue;
-
-		get_device(dev);
+		if (!dev->power.async_in_progress) {
+			get_device(dev);
 
-		mutex_unlock(&dpm_list_mtx);
+			mutex_unlock(&dpm_list_mtx);
 
-		error = device_suspend(dev, state, false);
+			error = device_suspend(dev, state, false);
 
-		put_device(dev);
+			put_device(dev);
 
-		mutex_lock(&dpm_list_mtx);
+			mutex_lock(&dpm_list_mtx);
 
-		if (error || async_error)
-			break;
+			if (error || async_error)
+				break;
+		}
 	}
 
 	mutex_unlock(&dpm_list_mtx);