
[v3,2/2] mm: zswap: fix global shrinker error handling logic

Message ID 20240720044127.508042-3-flintglass@gmail.com (mailing list archive)
State New
Series mm: zswap: fixes for global shrinker

Commit Message

Takero Funaki July 20, 2024, 4:41 a.m. UTC
This patch fixes the zswap global shrinker, which did not shrink the
zpool as expected.

The issue it addresses is that `shrink_worker()` did not distinguish
between unexpected errors and expected error codes that should be
skipped, such as when a memcg has no stored pages. This caused the
shrinking process to abort on the expected error codes.

The shrinker should ignore these cases and skip to the next memcg.
However, if every memcg is skipped, the worker would keep walking the
tree without ever making progress. To address this, this patch tracks
whether any candidate memcg was attempted while walking the memcg tree
and counts a failure when a full tree walk completes without one.

To handle the empty memcg case, the helper function `shrink_memcg()` is
modified to check whether the memcg has any stored pages and to return
-ENOENT if it does not.
Fixes: a65b0e7607cc ("zswap: make shrinking memcg-aware")
Signed-off-by: Takero Funaki <flintglass@gmail.com>
---
 mm/zswap.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)
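
The empty-memcg check relies on the `scanned += 1 - nr_to_walk` accounting
visible in the diff below: `list_lru_walk_one()` decrements the walk budget
once per entry it visits, so with a per-node budget of 1 the expression adds
1 for a node whose LRU held at least one entry and 0 for an empty node. The
following is a minimal userspace sketch of that accounting under those
assumptions (the per-node `nr_to_walk = 1` initialization is in the existing
shrink_memcg() code that the hunk does not show); it is an illustration, not
the kernel list_lru API:

/*
 * Userspace illustration only, not the kernel list_lru API.
 * Assumes a per-node walk budget (nr_to_walk) of 1, as in the
 * surrounding shrink_memcg() code not shown in the hunk.
 */
#include <stdio.h>

/* Model of the walk: visit at most *nr_to_walk entries, decrement the
 * budget once per entry visited, and pretend nothing is written back. */
static unsigned long walk_one(unsigned long entries_on_lru,
                              unsigned long *nr_to_walk)
{
    unsigned long visited = entries_on_lru < *nr_to_walk ?
                            entries_on_lru : *nr_to_walk;

    *nr_to_walk -= visited;
    return 0;
}

int main(void)
{
    unsigned long lru_entries[] = { 0, 3 };  /* one empty node, one not */
    int scanned = 0;

    for (int i = 0; i < 2; i++) {
        unsigned long nr_to_walk = 1;

        walk_one(lru_entries[i], &nr_to_walk);
        scanned += 1 - nr_to_walk;  /* +0 for an empty node, +1 otherwise */
    }
    /* scanned == 0 would mean every node's LRU was empty -> -ENOENT */
    printf("scanned = %d\n", scanned);  /* prints 1 here */
    return 0;
}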

Comments

Nhat Pham July 22, 2024, 9:51 p.m. UTC | #1
On Fri, Jul 19, 2024 at 9:41 PM Takero Funaki <flintglass@gmail.com> wrote:
>
> This patch fixes zswap global shrinker that did not shrink zpool as
> expected.
>
> The issue it addresses is that `shrink_worker()` did not distinguish
> between unexpected errors and expected error codes that should be
> skipped, such as when there is no stored page in a memcg. This led to
> the shrinking process being aborted on the expected error codes.

The code itself seems reasonable to me, but may I ask you to document
(as a comment) all the expected vs. unexpected cases? i.e., when do we
increment (or not increment) the failure counter?

My understanding is, we only increment the failure counter if we fail
to reclaim from a selected memcg that is non-empty and
writeback-enabled, or if we complete a full tree walk without making
any progress. Is this correct?

>
> The shrinker should ignore these cases and skip to the next memcg.
> However,  skipping all memcgs presents another problem. To address this,
> this patch tracks progress while walking the memcg tree and checks for
> progress once the tree walk is completed.
>
> To handle the empty memcg case, the helper function `shrink_memcg()` is
> modified to check if the memcg is empty and then return -ENOENT.
>
> Fixes: a65b0e7607cc ("zswap: make shrinking memcg-aware")
> Signed-off-by: Takero Funaki <flintglass@gmail.com>
> ---
>  mm/zswap.c | 23 +++++++++++++++++------
>  1 file changed, 17 insertions(+), 6 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 6528668c9af3..053d5be81d9a 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1310,10 +1310,10 @@ static struct shrinker *zswap_alloc_shrinker(void)
>
>  static int shrink_memcg(struct mem_cgroup *memcg)
>  {
> -       int nid, shrunk = 0;
> +       int nid, shrunk = 0, scanned = 0;
>
>         if (!mem_cgroup_zswap_writeback_enabled(memcg))
> -               return -EINVAL;
> +               return -ENOENT;
>
>         /*
>          * Skip zombies because their LRUs are reparented and we would be
> @@ -1327,14 +1327,19 @@ static int shrink_memcg(struct mem_cgroup *memcg)
>
>                 shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
>                                             &shrink_memcg_cb, NULL, &nr_to_walk);
> +               scanned += 1 - nr_to_walk;
>         }
> +
> +       if (!scanned)
> +               return -ENOENT;
> +
>         return shrunk ? 0 : -EAGAIN;
>  }
>
>  static void shrink_worker(struct work_struct *w)
>  {
>         struct mem_cgroup *memcg;
> -       int ret, failures = 0;
> +       int ret, failures = 0, progress = 0;
>         unsigned long thr;
>
>         /* Reclaim down to the accept threshold */
> @@ -1379,9 +1384,12 @@ static void shrink_worker(struct work_struct *w)
>                  */
>                 if (!memcg) {
>                         spin_unlock(&zswap_shrink_lock);
> -                       if (++failures == MAX_RECLAIM_RETRIES)
> +
> +                       /* tree walk completed but no progress */
> +                       if (!progress && ++failures == MAX_RECLAIM_RETRIES)
>                                 break;
>
> +                       progress = 0;
>                         goto resched;
>                 }
>
> @@ -1396,10 +1404,13 @@ static void shrink_worker(struct work_struct *w)
>                 /* drop the extra reference */
>                 mem_cgroup_put(memcg);
>
> -               if (ret == -EINVAL)
> -                       break;
> +               if (ret == -ENOENT)
> +                       continue;
> +
>                 if (ret && ++failures == MAX_RECLAIM_RETRIES)
>                         break;
> +
> +               ++progress;
>  resched:
>                 cond_resched();
>         } while (zswap_total_pages() > thr);
> --
> 2.43.0
>
Takero Funaki July 23, 2024, 4:44 p.m. UTC | #2
2024年7月23日(火) 6:51 Nhat Pham <nphamcs@gmail.com>:
>
> On Fri, Jul 19, 2024 at 9:41 PM Takero Funaki <flintglass@gmail.com> wrote:
> >
> > This patch fixes zswap global shrinker that did not shrink zpool as
> > expected.
> >
> > The issue it addresses is that `shrink_worker()` did not distinguish
> > between unexpected errors and expected error codes that should be
> > skipped, such as when there is no stored page in a memcg. This led to
> > the shrinking process being aborted on the expected error codes.
>
> The code itself seems reasonable to me, but may I ask you to document
> (as a comment) all the expected v.s unexpected cases? i.e when do we
> increment (or not increment) the failure counter?
>

In addition to the commit log changes suggested by Yosry, I am adding
some comments specifying which memcgs are (and are not) candidates for
writeback, and what should count as a failure.

-       /* global reclaim will select cgroup in a round-robin fashion.
+       /*
+        * Global reclaim will select cgroup in a round-robin fashion from all
+        * online memcgs, but memcgs that have no pages in zswap and
+        * writeback-disabled memcgs (memory.zswap.writeback=0) are not
+        * candidates for shrinking.
+        *
+        * Shrinking will be aborted if we encounter the following
+        * MAX_RECLAIM_RETRIES times:
+        * - No writeback-candidate memcgs found in a memcg tree walk.
+        * - Shrinking a writeback-candidate memcg failed.
         *
         * We save iteration cursor memcg into zswap_next_shrink,
         * which can be modified by the offline memcg cleaner

and the reasons to (not) increment the progress counter:

@@ -1387,10 +1407,20 @@ static void shrink_worker(struct work_struct *w)
                /* drop the extra reference */
                mem_cgroup_put(memcg);

-               if (ret == -EINVAL)
-                       break;
+               /*
+                * There are no writeback-candidate pages in the memcg.
+                * This is not an issue as long as we can find another memcg
+                * with pages in zswap. Skip this without incrementing progress
+                * and failures.
+                */
+               if (ret == -ENOENT)
+                       continue;
+
                if (ret && ++failures == MAX_RECLAIM_RETRIES)
                        break;
+
+               /* completed writeback or incremented failures */
+               ++progress;
 resched:


> My understanding is, we only increment the failure counter if we fail
> to reclaim from a selected memcg that is non-empty and
> writeback-enabled, or if we go a full tree walk without making any
> progress. Is this correct?
>

Yes, that's the expected behavior.
Please let me know if there is a more appropriate wording.

Thanks.
Chengming Zhou July 26, 2024, 3:21 a.m. UTC | #3
On 2024/7/24 00:44, Takero Funaki wrote:
> 2024年7月23日(火) 6:51 Nhat Pham <nphamcs@gmail.com>:
>>
>> On Fri, Jul 19, 2024 at 9:41 PM Takero Funaki <flintglass@gmail.com> wrote:
>>>
>>> This patch fixes zswap global shrinker that did not shrink zpool as
>>> expected.
>>>
>>> The issue it addresses is that `shrink_worker()` did not distinguish
>>> between unexpected errors and expected error codes that should be
>>> skipped, such as when there is no stored page in a memcg. This led to
>>> the shrinking process being aborted on the expected error codes.
>>
>> The code itself seems reasonable to me, but may I ask you to document
>> (as a comment) all the expected v.s unexpected cases? i.e when do we
>> increment (or not increment) the failure counter?
>>
> 
> In addition to changes in the commit log suggested by Yosry,
> adding some comments specifying what memcg is (not) candidates for
> writeback, and what should be a failure.
> 
> -       /* global reclaim will select cgroup in a round-robin fashion.
> +       /*
> +        * Global reclaim will select cgroup in a round-robin fashion from all
> +        * online memcgs, but memcgs that have no pages in zswap and
> +        * writeback-disabled memcgs (memory.zswap.writeback=0) are not
> +        * candidates for shrinking.
> +        *
> +        * Shrinking will be aborted if we encounter the following
> +        * MAX_RECLAIM_RETRIES times:
> +        * - No writeback-candidate memcgs found in a memcg tree walk.
> +        * - Shrinking a writeback-candidate memcg failed.
>           *
>           * We save iteration cursor memcg into zswap_next_shrink,
>           * which can be modified by the offline memcg cleaner
> 
> and, the reasons to (not) increment the progress:
> 
> @@ -1387,10 +1407,20 @@ static void shrink_worker(struct work_struct *w)
>                  /* drop the extra reference */
>                  mem_cgroup_put(memcg);
> 
> -               if (ret == -EINVAL)
> -                       break;
> +               /*
> +                * There are no writeback-candidate pages in the memcg.
> +                * This is not an issue as long as we can find another memcg
> +                * with pages in zswap. Skip this without incrementing progress
> +                * and failures.
> +                */
> +               if (ret == -ENOENT)
> +                       continue;
> +
>                  if (ret && ++failures == MAX_RECLAIM_RETRIES)
>                          break;
> +
> +               /* completed writeback or incremented failures */
> +               ++progress;

Maybe the name "progress" is a little confusing here? "progress" sounds 
to me like some writeback has completed.

But actually it just means we have encountered some candidates, right?

Thanks.


>   resched:
> 
> 
>> My understanding is, we only increment the failure counter if we fail
>> to reclaim from a selected memcg that is non-empty and
>> writeback-enabled, or if we go a full tree walk without making any
>> progress. Is this correct?
>>
> 
> Yes, that's the expected behavior.
> Please let me know if there is more appropriate wording.
> 
> Thanks.
Takero Funaki July 26, 2024, 8:54 a.m. UTC | #4
Thanks for your comments.


2024年7月26日(金) 12:21 Chengming Zhou <chengming.zhou@linux.dev>:
> > and, the reasons to (not) increment the progress:
> >
> > @@ -1387,10 +1407,20 @@ static void shrink_worker(struct work_struct *w)
> >                  /* drop the extra reference */
> >                  mem_cgroup_put(memcg);
> >
> > -               if (ret == -EINVAL)
> > -                       break;
> > +               /*
> > +                * There are no writeback-candidate pages in the memcg.
> > +                * This is not an issue as long as we can find another memcg
> > +                * with pages in zswap. Skip this without incrementing progress
> > +                * and failures.
> > +                */
> > +               if (ret == -ENOENT)
> > +                       continue;
> > +
> >                  if (ret && ++failures == MAX_RECLAIM_RETRIES)
> >                          break;
> > +
> > +               /* completed writeback or incremented failures */
> > +               ++progress;
>
> Maybe the name "progress" is a little confusing here? "progress" sounds
> to me that we have some writeback completed.
>
> But actually it just means we have encountered some candidates, right?
>
> Thanks.
>
>

Yes, `++progress` counts both errors and successes as iteration
progress for valid memcgs (it does not measure the amount written
back). Incrementing only on success would over-count failures when
there is only one memcg: one failure from the writeback attempt and
another when the tree walk ends, so the worker would abort after 8
writeback failures instead of 16.
Would `++candidates;` be better? I will replace the name and fix the
commit message for v4.
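
For concreteness, a minimal userspace model of that single-memcg
scenario (an illustration only, not the kernel code; it assumes the
memcg's writeback always fails with -EAGAIN and that
MAX_RECLAIM_RETRIES is 16, as implied by the 8-vs-16 numbers above):

/*
 * Userspace model of the failure accounting in shrink_worker(), not
 * kernel code. One online memcg; every writeback attempt fails.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_RECLAIM_RETRIES 16

/* Returns how many real writeback attempts happen before giving up. */
static int simulate(bool failed_attempt_counts_as_progress)
{
    int failures = 0, progress = 0, attempts = 0;

    while (1) {
        /* the only memcg is selected; its writeback fails */
        attempts++;
        if (++failures == MAX_RECLAIM_RETRIES)
            break;
        if (failed_attempt_counts_as_progress)
            progress++;

        /* mem_cgroup_iter() wraps: tree walk ends (memcg == NULL) */
        if (!progress && ++failures == MAX_RECLAIM_RETRIES)
            break;
        progress = 0;
    }
    return attempts;
}

int main(void)
{
    printf("failed attempts count as progress: %2d attempts before abort\n",
           simulate(true));
    printf("only success counts as progress:   %2d attempts before abort\n",
           simulate(false));
    return 0;
}

The first line prints 16 and the second prints 8, matching the
behavior described above.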
Nhat Pham July 26, 2024, 6:01 p.m. UTC | #5
On Fri, Jul 26, 2024 at 1:54 AM Takero Funaki <flintglass@gmail.com> wrote:
>
> Yes, the `++progress` counts both error and success as an iteration
> progress for valid memcgs (not writeback amount). Incrementing only on
> success will overly increment failures counter if there is only one
> memcg, one from writeback failure and one from tree walk ends, the
> worker aborts on 8 failures instead of 16.
> `++candidates;` would be better? replacing the name and fixing commit
> messages for v4.

How about `attempt` or `attempted`? Naming is hard :)
Takero Funaki July 27, 2024, 11:08 a.m. UTC | #6
2024年7月27日(土) 3:01 Nhat Pham <nphamcs@gmail.com>:
>
> On Fri, Jul 26, 2024 at 1:54 AM Takero Funaki <flintglass@gmail.com> wrote:
> >
> > Yes, the `++progress` counts both error and success as an iteration
> > progress for valid memcgs (not writeback amount). Incrementing only on
> > success will overly increment failures counter if there is only one
> > memcg, one from writeback failure and one from tree walk ends, the
> > worker aborts on 8 failures instead of 16.
> > `++candidates;` would be better? replacing the name and fixing commit
> > messages for v4.
>
> How about `attempt` or `attempted`? Naming is hard :)

Thanks. I will rewrite it with `attempts` to align with the `failures` counter.

Patch

diff --git a/mm/zswap.c b/mm/zswap.c
index 6528668c9af3..053d5be81d9a 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1310,10 +1310,10 @@  static struct shrinker *zswap_alloc_shrinker(void)
 
 static int shrink_memcg(struct mem_cgroup *memcg)
 {
-	int nid, shrunk = 0;
+	int nid, shrunk = 0, scanned = 0;
 
 	if (!mem_cgroup_zswap_writeback_enabled(memcg))
-		return -EINVAL;
+		return -ENOENT;
 
 	/*
 	 * Skip zombies because their LRUs are reparented and we would be
@@ -1327,14 +1327,19 @@  static int shrink_memcg(struct mem_cgroup *memcg)
 
 		shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
 					    &shrink_memcg_cb, NULL, &nr_to_walk);
+		scanned += 1 - nr_to_walk;
 	}
+
+	if (!scanned)
+		return -ENOENT;
+
 	return shrunk ? 0 : -EAGAIN;
 }
 
 static void shrink_worker(struct work_struct *w)
 {
 	struct mem_cgroup *memcg;
-	int ret, failures = 0;
+	int ret, failures = 0, progress = 0;
 	unsigned long thr;
 
 	/* Reclaim down to the accept threshold */
@@ -1379,9 +1384,12 @@  static void shrink_worker(struct work_struct *w)
 		 */
 		if (!memcg) {
 			spin_unlock(&zswap_shrink_lock);
-			if (++failures == MAX_RECLAIM_RETRIES)
+
+			/* tree walk completed but no progress */
+			if (!progress && ++failures == MAX_RECLAIM_RETRIES)
 				break;
 
+			progress = 0;
 			goto resched;
 		}
 
@@ -1396,10 +1404,13 @@  static void shrink_worker(struct work_struct *w)
 		/* drop the extra reference */
 		mem_cgroup_put(memcg);
 
-		if (ret == -EINVAL)
-			break;
+		if (ret == -ENOENT)
+			continue;
+
 		if (ret && ++failures == MAX_RECLAIM_RETRIES)
 			break;
+
+		++progress;
 resched:
 		cond_resched();
 	} while (zswap_total_pages() > thr);