
[v1,2/3] mm: zswap: fix global shrinker error handling logic

Message ID 20240608155316.451600-3-flintglass@gmail.com (mailing list archive)
State New
Series mm: zswap: global shrinker fix and proactive shrink

Commit Message

Takero Funaki June 8, 2024, 3:53 p.m. UTC
This patch fixes the zswap global shrinker, which did not shrink the
zpool as expected.

The issue it addresses is that `shrink_worker()` did not distinguish
between unexpected errors and expected error codes that should be
skipped, such as when a memcg has no stored pages. This caused the
shrinking process to abort on the expected error codes.

The shrinker should ignore these cases and skip to the next memcg.
However, if every memcg is skipped with no progress, the worker must
still detect that and give up instead of looping indefinitely. To
address this, this patch tracks progress while walking the memcg tree
and checks for progress once the tree walk is completed.

To handle the empty memcg case, the helper function `shrink_memcg()` is
modified to check whether the memcg is empty and, if so, return
-ENOENT.

Fixes: a65b0e7607cc ("zswap: make shrinking memcg-aware")
Signed-off-by: Takero Funaki <flintglass@gmail.com>
---
 mm/zswap.c | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

Comments

Yosry Ahmed June 10, 2024, 8:27 p.m. UTC | #1
On Sat, Jun 8, 2024 at 8:53 AM Takero Funaki <flintglass@gmail.com> wrote:
>
> This patch fixes zswap global shrinker that did not shrink zpool as
> expected.
>
> The issue it addresses is that `shrink_worker()` did not distinguish
> between unexpected errors and expected error codes that should be
> skipped, such as when there is no stored page in a memcg. This led to
> the shrinking process being aborted on the expected error codes.
>
> The shrinker should ignore these cases and skip to the next memcg.
> However,  skipping all memcgs presents another problem. To address this,
> this patch tracks progress while walking the memcg tree and checks for
> progress once the tree walk is completed.
>
> To handle the empty memcg case, the helper function `shrink_memcg()` is
> modified to check if the memcg is empty and then return -ENOENT.
>
> Fixes: a65b0e7607cc ("zswap: make shrinking memcg-aware")
> Signed-off-by: Takero Funaki <flintglass@gmail.com>
> ---
>  mm/zswap.c | 31 ++++++++++++++++++++++++++-----
>  1 file changed, 26 insertions(+), 5 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index d720a42069b6..1a90f434f247 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1393,7 +1393,7 @@ static struct shrinker *zswap_alloc_shrinker(void)
>
>  static int shrink_memcg(struct mem_cgroup *memcg)
>  {
> -       int nid, shrunk = 0;
> +       int nid, shrunk = 0, stored = 0;
>
>         if (!mem_cgroup_zswap_writeback_enabled(memcg))
>                 return -EINVAL;
> @@ -1408,9 +1408,16 @@ static int shrink_memcg(struct mem_cgroup *memcg)
>         for_each_node_state(nid, N_NORMAL_MEMORY) {
>                 unsigned long nr_to_walk = 1;
>
> +               if (!list_lru_count_one(&zswap_list_lru, nid, memcg))
> +                       continue;
> +               ++stored;
>                 shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
>                                             &shrink_memcg_cb, NULL, &nr_to_walk);
>         }
> +
> +       if (!stored)
> +               return -ENOENT;
> +

Can't we just check nr_to_walk here and return -ENOENT if it remains as 1?

Something like:

if (nr_to_walk)
    return -ENOENT;
if (!shrunk)
    return -EAGAIN;
return 0;
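
Folded into shrink_memcg(), that would look roughly like the sketch
below (illustration only, unrelated checks omitted; since nr_to_walk
is reset per node, the "was anything stored" check has to happen
inside the loop):

static int shrink_memcg(struct mem_cgroup *memcg)
{
	int nid, shrunk = 0;
	bool stored = false;

	if (!mem_cgroup_zswap_writeback_enabled(memcg))
		return -EINVAL;

	/* ... unrelated checks omitted ... */

	for_each_node_state(nid, N_NORMAL_MEMORY) {
		unsigned long nr_to_walk = 1;

		shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
					    &shrink_memcg_cb, NULL, &nr_to_walk);
		/* the budget is consumed only if this node's LRU had entries */
		if (!nr_to_walk)
			stored = true;
	}

	if (!stored)
		return -ENOENT;
	return shrunk ? 0 : -EAGAIN;
}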

>         return shrunk ? 0 : -EAGAIN;
>  }
>
> @@ -1418,12 +1425,18 @@ static void shrink_worker(struct work_struct *w)
>  {
>         struct mem_cgroup *memcg = NULL;
>         struct mem_cgroup *next_memcg;
> -       int ret, failures = 0;
> +       int ret, failures = 0, progress;
>         unsigned long thr;
>
>         /* Reclaim down to the accept threshold */
>         thr = zswap_accept_thr_pages();
>
> +       /*
> +        * We might start from the last memcg.
> +        * That is not a failure.
> +        */
> +       progress = 1;
> +
>         /* global reclaim will select cgroup in a round-robin fashion.
>          *
>          * We save iteration cursor memcg into zswap_next_shrink,
> @@ -1461,9 +1474,12 @@ static void shrink_worker(struct work_struct *w)
>                  */
>                 if (!memcg) {
>                         spin_unlock(&zswap_shrink_lock);
> -                       if (++failures == MAX_RECLAIM_RETRIES)
> +
> +                       /* tree walk completed but no progress */
> +                       if (!progress && ++failures == MAX_RECLAIM_RETRIES)
>                                 break;

It seems like we may keep iterating the entire hierarchy a lot of
times as long as we are making any type of progress. This doesn't seem
right.

>
> +                       progress = 0;
>                         goto resched;
>                 }
>
> @@ -1493,10 +1509,15 @@ static void shrink_worker(struct work_struct *w)
>                 /* drop the extra reference */
>                 mem_cgroup_put(memcg);
>
> -               if (ret == -EINVAL)
> -                       break;
> +               /* not a writeback candidate memcg */
> +               if (ret == -EINVAL || ret == -ENOENT)
> +                       continue;
> +

We should probably return -ENOENT for memcg with writeback disabled as well.
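
Concretely (sketch only), that would just be the existing writeback
check at the top of shrink_memcg() returning -ENOENT instead of
-EINVAL:

	if (!mem_cgroup_zswap_writeback_enabled(memcg))
		return -ENOENT;

so that shrink_worker() skips such a memcg the same way it skips an
empty one.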

>                 if (ret && ++failures == MAX_RECLAIM_RETRIES)
>                         break;
> +
> +               ++progress;
> +               /* reschedule as we performed some IO */
>  resched:
>                 cond_resched();
>         } while (zswap_total_pages() > thr);
> --
> 2.43.0
>
Takero Funaki June 11, 2024, 3:21 p.m. UTC | #2
Thanks a lot for your comments.


On 2024/06/11 5:27, Yosry Ahmed wrote:
>>                 unsigned long nr_to_walk = 1;
>>
>> +               if (!list_lru_count_one(&zswap_list_lru, nid, memcg))
>> +                       continue;
>> +               ++stored;
>>                 shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
>>                                             &shrink_memcg_cb, NULL, &nr_to_walk);
>>         }
>> +
>> +       if (!stored)
>> +               return -ENOENT;
>> +
>
> Can't we just check nr_to_walk here and return -ENOENT if it remains as 1?
>
> Something like:
>
> if (nr_to_walk)
>     return -ENOENT;
> if (!shrunk)
>     return -EAGAIN;
> return 0;
>

ah, the counting step can be removed. I will change it in v2.

>>         return shrunk ? 0 : -EAGAIN;
>>  }
>>
>> @@ -1418,12 +1425,18 @@ static void shrink_worker(struct work_struct *w)
>>  {
>>         struct mem_cgroup *memcg = NULL;
>>         struct mem_cgroup *next_memcg;
>> -       int ret, failures = 0;
>> +       int ret, failures = 0, progress;
>>         unsigned long thr;
>>
>>         /* Reclaim down to the accept threshold */
>>         thr = zswap_accept_thr_pages();
>>
>> +       /*
>> +        * We might start from the last memcg.
>> +        * That is not a failure.
>> +        */
>> +       progress = 1;
>> +
>>         /* global reclaim will select cgroup in a round-robin fashion.
>>          *
>>          * We save iteration cursor memcg into zswap_next_shrink,
>> @@ -1461,9 +1474,12 @@ static void shrink_worker(struct work_struct *w)
>>                  */
>>                 if (!memcg) {
>>                         spin_unlock(&zswap_shrink_lock);
>> -                       if (++failures == MAX_RECLAIM_RETRIES)
>> +
>> +                       /* tree walk completed but no progress */
>> +                       if (!progress && ++failures == MAX_RECLAIM_RETRIES)
>>                                 break;
>
> It seems like we may keep iterating the entire hierarchy a lot of
> times as long as we are making any type of progress. This doesn't seem
> right.
>

Since shrink_worker evicts only one page per tree walk when there is
only one memcg using zswap, I believe this is the intended behavior.
Even if we choose to break the loop more aggressively, it would only
be postponing the problem because pool_limit_hit will trigger the
worker again.

I agree the existing approach is inefficient. It might be better to
change the one-page-per-walk round-robin strategy.
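
(For illustration, one way would be to raise the per-walk budget in
shrink_memcg(), something along the lines of:

	/* MEMCG_SHRINK_BATCH is a hypothetical constant, not an existing symbol */
	unsigned long nr_to_walk = MEMCG_SHRINK_BATCH;

instead of the current nr_to_walk = 1.)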

>>
>> +                       progress = 0;
>>                         goto resched;
>>                 }
>>
>> @@ -1493,10 +1509,15 @@ static void shrink_worker(struct work_struct *w)
>>                 /* drop the extra reference */
>>                 mem_cgroup_put(memcg);
>>
>> -               if (ret == -EINVAL)
>> -                       break;
>> +               /* not a writeback candidate memcg */
>> +               if (ret == -EINVAL || ret == -ENOENT)
>> +                       continue;
>> +
>
> We should probably return -ENOENT for memcg with writeback disabled as well.
>
>>                 if (ret && ++failures == MAX_RECLAIM_RETRIES)
>>                         break;
>> +
>> +               ++progress;
>> +               /* reschedule as we performed some IO */
>>  resched:
>>                 cond_resched();
>>         } while (zswap_total_pages() > thr);
>> --
>> 2.43.0
>>
Nhat Pham June 11, 2024, 3:51 p.m. UTC | #3
On Tue, Jun 11, 2024 at 8:21 AM Takero Funaki <flintglass@gmail.com> wrote:
>
>
> Since shrink_worker evicts only one page per tree walk when there is
> only one memcg using zswap, I believe this is the intended behavior.

I don't think this is the intended behavior :) It's a holdover from
the old zswap reclaiming behaviors.

1. In the past, we used to shrink one object per shrink worker call.
This is crazy.

2. We then moved the LRU from the allocator level to the zswap level,
shrinking one object at a time until the pool can accept new pages (i.e.
under the acceptance threshold).

3. When we separated the LRU into per-(memcg, node) lists, we kept the
shrink-one-at-a-time part, but now do it round-robin style across the
(memcg, node) combinations.

It's time to optimize this. 4th time's the charm!

> Even if we choose to break the loop more aggressively, it would only
> be postponing the problem because pool_limit_hit will trigger the
> worker again.
>
> I agree the existing approach is inefficient. It might be better to
> change the 1 page in a round-robin strategy.

We can play with a bigger batch.

1. The most straightforward idea is to just use a bigger constant (32? 64? 128?)

2. We can try to shrink until accept for each memcg, hoping that the
round robin selection maintains fairness in the long run - but this
can be a bad idea in the short run for the memcg selected. At the very
least, this should try to respect the protected area for each lruvec.
This might still come into conflict with the zswap shrinker though
(since the protection is best-effort).

3. Proportional reclaim - a variant of what we're doing in
get_scan_count() for page reclaim?

scan = lruvec_size - lruvec_size * protection / (cgroup_size + 1);

protection is derived from memory.min or memory.low of the cgroup, and
cgroup_size is the memory usage of the cgroup. Maybe we can substitute
lruvec_size with the number of (reclaimable/unprotected?) zswap
objects in the (node, memcg) LRU?
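
To make that concrete, a rough sketch of what the budget computation
could look like for one (memcg, node) zswap LRU (placeholder names,
not existing zswap symbols):

	/*
	 * protection would come from memory.min/memory.low, cgroup_size
	 * from the cgroup's memory usage, and lru_size e.g. from
	 * list_lru_count_one() on the zswap LRU.
	 */
	static unsigned long zswap_scan_budget(unsigned long lru_size,
					       unsigned long protection,
					       unsigned long cgroup_size)
	{
		return lru_size - lru_size * protection / (cgroup_size + 1);
	}

E.g. with lru_size = 1000 objects, protection = 512M and cgroup_size =
1024M, the budget comes out to roughly 500 objects.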
Nhat Pham June 11, 2024, 6:15 p.m. UTC | #4
On Mon, Jun 10, 2024 at 1:28 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Sat, Jun 8, 2024 at 8:53 AM Takero Funaki <flintglass@gmail.com> wrote:
> >
> > This patch fixes zswap global shrinker that did not shrink zpool as
> > expected.
> >
> > The issue it addresses is that `shrink_worker()` did not distinguish
> > between unexpected errors and expected error codes that should be
> > skipped, such as when there is no stored page in a memcg. This led to
> > the shrinking process being aborted on the expected error codes.
> >
> > The shrinker should ignore these cases and skip to the next memcg.
> > However,  skipping all memcgs presents another problem. To address this,
> > this patch tracks progress while walking the memcg tree and checks for
> > progress once the tree walk is completed.
> >
> > To handle the empty memcg case, the helper function `shrink_memcg()` is
> > modified to check if the memcg is empty and then return -ENOENT.
> >
> > Fixes: a65b0e7607cc ("zswap: make shrinking memcg-aware")
> > Signed-off-by: Takero Funaki <flintglass@gmail.com>
> > ---
> >  mm/zswap.c | 31 ++++++++++++++++++++++++++-----
> >  1 file changed, 26 insertions(+), 5 deletions(-)
> >
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index d720a42069b6..1a90f434f247 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -1393,7 +1393,7 @@ static struct shrinker *zswap_alloc_shrinker(void)
> >
> >  static int shrink_memcg(struct mem_cgroup *memcg)
> >  {
> > -       int nid, shrunk = 0;
> > +       int nid, shrunk = 0, stored = 0;
> >
> >         if (!mem_cgroup_zswap_writeback_enabled(memcg))
> >                 return -EINVAL;
> > @@ -1408,9 +1408,16 @@ static int shrink_memcg(struct mem_cgroup *memcg)
> >         for_each_node_state(nid, N_NORMAL_MEMORY) {
> >                 unsigned long nr_to_walk = 1;
> >
> > +               if (!list_lru_count_one(&zswap_list_lru, nid, memcg))
> > +                       continue;
> > +               ++stored;
> >                 shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
> >                                             &shrink_memcg_cb, NULL, &nr_to_walk);
> >         }
> > +
> > +       if (!stored)
> > +               return -ENOENT;
> > +
>
> Can't we just check nr_to_walk here and return -ENOENT if it remains as 1?
>
> Something like:
>
> if (nr_to_walk)
>     return -ENOENT;
> if (!shrunk)
>     return -EAGAIN;
> return 0;
>
> >         return shrunk ? 0 : -EAGAIN;
> >  }
> >
> > @@ -1418,12 +1425,18 @@ static void shrink_worker(struct work_struct *w)
> >  {
> >         struct mem_cgroup *memcg = NULL;
> >         struct mem_cgroup *next_memcg;
> > -       int ret, failures = 0;
> > +       int ret, failures = 0, progress;
> >         unsigned long thr;
> >
> >         /* Reclaim down to the accept threshold */
> >         thr = zswap_accept_thr_pages();
> >
> > +       /*
> > +        * We might start from the last memcg.
> > +        * That is not a failure.
> > +        */
> > +       progress = 1;
> > +
> >         /* global reclaim will select cgroup in a round-robin fashion.
> >          *
> >          * We save iteration cursor memcg into zswap_next_shrink,
> > @@ -1461,9 +1474,12 @@ static void shrink_worker(struct work_struct *w)
> >                  */
> >                 if (!memcg) {
> >                         spin_unlock(&zswap_shrink_lock);
> > -                       if (++failures == MAX_RECLAIM_RETRIES)
> > +
> > +                       /* tree walk completed but no progress */
> > +                       if (!progress && ++failures == MAX_RECLAIM_RETRIES)
> >                                 break;
>
> It seems like we may keep iterating the entire hierarchy a lot of
> times as long as we are making any type of progress. This doesn't seem
> right.
>

Ah actually, reading your comment more closely, yeah, the
shrink-until-accept is the intended-ish behavior (from Domenico's
older patches to re-work zswap reclaim). The one-page-per-attempt is
also "intended" behavior, but not any of our intentions - just
something that remains from the past implementation as we rework this
codebase one change at a time :)

Patch

diff --git a/mm/zswap.c b/mm/zswap.c
index d720a42069b6..1a90f434f247 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1393,7 +1393,7 @@  static struct shrinker *zswap_alloc_shrinker(void)
 
 static int shrink_memcg(struct mem_cgroup *memcg)
 {
-	int nid, shrunk = 0;
+	int nid, shrunk = 0, stored = 0;
 
 	if (!mem_cgroup_zswap_writeback_enabled(memcg))
 		return -EINVAL;
@@ -1408,9 +1408,16 @@  static int shrink_memcg(struct mem_cgroup *memcg)
 	for_each_node_state(nid, N_NORMAL_MEMORY) {
 		unsigned long nr_to_walk = 1;
 
+		if (!list_lru_count_one(&zswap_list_lru, nid, memcg))
+			continue;
+		++stored;
 		shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
 					    &shrink_memcg_cb, NULL, &nr_to_walk);
 	}
+
+	if (!stored)
+		return -ENOENT;
+
 	return shrunk ? 0 : -EAGAIN;
 }
 
@@ -1418,12 +1425,18 @@  static void shrink_worker(struct work_struct *w)
 {
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *next_memcg;
-	int ret, failures = 0;
+	int ret, failures = 0, progress;
 	unsigned long thr;
 
 	/* Reclaim down to the accept threshold */
 	thr = zswap_accept_thr_pages();
 
+	/*
+	 * We might start from the last memcg.
+	 * That is not a failure.
+	 */
+	progress = 1;
+
 	/* global reclaim will select cgroup in a round-robin fashion.
 	 *
 	 * We save iteration cursor memcg into zswap_next_shrink,
@@ -1461,9 +1474,12 @@  static void shrink_worker(struct work_struct *w)
 		 */
 		if (!memcg) {
 			spin_unlock(&zswap_shrink_lock);
-			if (++failures == MAX_RECLAIM_RETRIES)
+
+			/* tree walk completed but no progress */
+			if (!progress && ++failures == MAX_RECLAIM_RETRIES)
 				break;
 
+			progress = 0;
 			goto resched;
 		}
 
@@ -1493,10 +1509,15 @@  static void shrink_worker(struct work_struct *w)
 		/* drop the extra reference */
 		mem_cgroup_put(memcg);
 
-		if (ret == -EINVAL)
-			break;
+		/* not a writeback candidate memcg */
+		if (ret == -EINVAL || ret == -ENOENT)
+			continue;
+
 		if (ret && ++failures == MAX_RECLAIM_RETRIES)
 			break;
+
+		++progress;
+		/* reschedule as we performed some IO */
 resched:
 		cond_resched();
 	} while (zswap_total_pages() > thr);