diff mbox series

[v9,13/13] mm: vmscan: shrink deferred objects proportional to priority

Message ID 20210310174603.5093-14-shy828301@gmail.com (mailing list archive)
State New, archived
Headers show
Series Make shrinker's nr_deferred memcg aware | expand

Commit Message

Yang Shi March 10, 2021, 5:46 p.m. UTC
The number of deferred objects might wind up to an absurd number, which
results in the clamping of slab objects.  This is undesirable for sustaining
the working set.

So shrink the deferred objects in proportion to priority and cap nr_deferred at
twice the number of cache items.

The idea is borrowed from Dave Chinner's patch:
https://lore.kernel.org/linux-xfs/20191031234618.15403-13-david@fromorbit.com/

Tested with a kernel build and a VFS-metadata-heavy workload in our production
environment; no regression has been spotted so far.

Signed-off-by: Yang Shi <shy828301@gmail.com>
---
 mm/vmscan.c | 46 +++++++++++-----------------------------------
 1 file changed, 11 insertions(+), 35 deletions(-)

Comments

Shakeel Butt March 10, 2021, 6:24 p.m. UTC | #1
On Wed, Mar 10, 2021 at 9:46 AM Yang Shi <shy828301@gmail.com> wrote:
>
> The number of deferred objects might get windup to an absurd number, and it
> results in clamp of slab objects.  It is undesirable for sustaining workingset.
>
> So shrink deferred objects proportional to priority and cap nr_deferred to twice
> of cache items.
>
> The idea is borrowed from Dave Chinner's patch:
> https://lore.kernel.org/linux-xfs/20191031234618.15403-13-david@fromorbit.com/
>
> Tested with kernel build and vfs metadata heavy workload in our production
> environment, no regression is spotted so far.

Did you run both of these workloads in the same cgroup or separate cgroups?

>
> Signed-off-by: Yang Shi <shy828301@gmail.com>
> ---
>  mm/vmscan.c | 46 +++++++++++-----------------------------------
>  1 file changed, 11 insertions(+), 35 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 9a2dfeaa79f4..6a0a91b23597 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -662,7 +662,6 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
>          */
>         nr = xchg_nr_deferred(shrinker, shrinkctl);
>
> -       total_scan = nr;
>         if (shrinker->seeks) {
>                 delta = freeable >> priority;
>                 delta *= 4;
> @@ -676,37 +675,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
>                 delta = freeable / 2;
>         }
>
> +       total_scan = nr >> priority;
>         total_scan += delta;
> -       if (total_scan < 0) {
> -               pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
> -                      shrinker->scan_objects, total_scan);
> -               total_scan = freeable;
> -               next_deferred = nr;
> -       } else
> -               next_deferred = total_scan;
> -
> -       /*
> -        * We need to avoid excessive windup on filesystem shrinkers
> -        * due to large numbers of GFP_NOFS allocations causing the
> -        * shrinkers to return -1 all the time. This results in a large
> -        * nr being built up so when a shrink that can do some work
> -        * comes along it empties the entire cache due to nr >>>
> -        * freeable. This is bad for sustaining a working set in
> -        * memory.
> -        *
> -        * Hence only allow the shrinker to scan the entire cache when
> -        * a large delta change is calculated directly.
> -        */
> -       if (delta < freeable / 4)
> -               total_scan = min(total_scan, freeable / 2);
> -
> -       /*
> -        * Avoid risking looping forever due to too large nr value:
> -        * never try to free more than twice the estimate number of
> -        * freeable entries.
> -        */
> -       if (total_scan > freeable * 2)
> -               total_scan = freeable * 2;
> +       total_scan = min(total_scan, (2 * freeable));
>
>         trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
>                                    freeable, delta, total_scan, priority);
> @@ -745,10 +716,15 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
>                 cond_resched();
>         }
>
> -       if (next_deferred >= scanned)
> -               next_deferred -= scanned;
> -       else
> -               next_deferred = 0;
> +       /*
> +        * The deferred work is increased by any new work (delta) that wasn't
> +        * done, decreased by old deferred work that was done now.
> +        *
> +        * And it is capped to two times of the freeable items.
> +        */
> +       next_deferred = max_t(long, (nr + delta - scanned), 0);
> +       next_deferred = min(next_deferred, (2 * freeable));
> +
>         /*
>          * move the unused scan count back into the shrinker in a
>          * manner that handles concurrent updates.
> --
> 2.26.2
>
Yang Shi March 10, 2021, 6:54 p.m. UTC | #2
On Wed, Mar 10, 2021 at 10:24 AM Shakeel Butt <shakeelb@google.com> wrote:
>
> On Wed, Mar 10, 2021 at 9:46 AM Yang Shi <shy828301@gmail.com> wrote:
> >
> > The number of deferred objects might get windup to an absurd number, and it
> > results in clamp of slab objects.  It is undesirable for sustaining workingset.
> >
> > So shrink deferred objects proportional to priority and cap nr_deferred to twice
> > of cache items.
> >
> > The idea is borrowed from Dave Chinner's patch:
> > https://lore.kernel.org/linux-xfs/20191031234618.15403-13-david@fromorbit.com/
> >
> > Tested with kernel build and vfs metadata heavy workload in our production
> > environment, no regression is spotted so far.
>
> Did you run both of these workloads in the same cgroup or separate cgroups?

Both are covered.

>
> >
> > Signed-off-by: Yang Shi <shy828301@gmail.com>
> > ---
> >  mm/vmscan.c | 46 +++++++++++-----------------------------------
> >  1 file changed, 11 insertions(+), 35 deletions(-)
> >
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 9a2dfeaa79f4..6a0a91b23597 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -662,7 +662,6 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
> >          */
> >         nr = xchg_nr_deferred(shrinker, shrinkctl);
> >
> > -       total_scan = nr;
> >         if (shrinker->seeks) {
> >                 delta = freeable >> priority;
> >                 delta *= 4;
> > @@ -676,37 +675,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
> >                 delta = freeable / 2;
> >         }
> >
> > +       total_scan = nr >> priority;
> >         total_scan += delta;
> > -       if (total_scan < 0) {
> > -               pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
> > -                      shrinker->scan_objects, total_scan);
> > -               total_scan = freeable;
> > -               next_deferred = nr;
> > -       } else
> > -               next_deferred = total_scan;
> > -
> > -       /*
> > -        * We need to avoid excessive windup on filesystem shrinkers
> > -        * due to large numbers of GFP_NOFS allocations causing the
> > -        * shrinkers to return -1 all the time. This results in a large
> > -        * nr being built up so when a shrink that can do some work
> > -        * comes along it empties the entire cache due to nr >>>
> > -        * freeable. This is bad for sustaining a working set in
> > -        * memory.
> > -        *
> > -        * Hence only allow the shrinker to scan the entire cache when
> > -        * a large delta change is calculated directly.
> > -        */
> > -       if (delta < freeable / 4)
> > -               total_scan = min(total_scan, freeable / 2);
> > -
> > -       /*
> > -        * Avoid risking looping forever due to too large nr value:
> > -        * never try to free more than twice the estimate number of
> > -        * freeable entries.
> > -        */
> > -       if (total_scan > freeable * 2)
> > -               total_scan = freeable * 2;
> > +       total_scan = min(total_scan, (2 * freeable));
> >
> >         trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
> >                                    freeable, delta, total_scan, priority);
> > @@ -745,10 +716,15 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
> >                 cond_resched();
> >         }
> >
> > -       if (next_deferred >= scanned)
> > -               next_deferred -= scanned;
> > -       else
> > -               next_deferred = 0;
> > +       /*
> > +        * The deferred work is increased by any new work (delta) that wasn't
> > +        * done, decreased by old deferred work that was done now.
> > +        *
> > +        * And it is capped to two times of the freeable items.
> > +        */
> > +       next_deferred = max_t(long, (nr + delta - scanned), 0);
> > +       next_deferred = min(next_deferred, (2 * freeable));
> > +
> >         /*
> >          * move the unused scan count back into the shrinker in a
> >          * manner that handles concurrent updates.
> > --
> > 2.26.2
> >
Shakeel Butt March 10, 2021, 9:08 p.m. UTC | #3
On Wed, Mar 10, 2021 at 10:54 AM Yang Shi <shy828301@gmail.com> wrote:
>
> On Wed, Mar 10, 2021 at 10:24 AM Shakeel Butt <shakeelb@google.com> wrote:
> >
> > On Wed, Mar 10, 2021 at 9:46 AM Yang Shi <shy828301@gmail.com> wrote:
> > >
> > > The number of deferred objects might get windup to an absurd number, and it
> > > results in clamp of slab objects.  It is undesirable for sustaining workingset.
> > >
> > > So shrink deferred objects proportional to priority and cap nr_deferred to twice
> > > of cache items.
> > >
> > > The idea is borrowed from Dave Chinner's patch:
> > > https://lore.kernel.org/linux-xfs/20191031234618.15403-13-david@fromorbit.com/
> > >
> > > Tested with kernel build and vfs metadata heavy workload in our production
> > > environment, no regression is spotted so far.
> >
> > Did you run both of these workloads in the same cgroup or separate cgroups?
>
> Both are covered.
>

Have you tried just this patch i.e. without the first 12 patches?
Yang Shi March 10, 2021, 9:41 p.m. UTC | #4
On Wed, Mar 10, 2021 at 1:08 PM Shakeel Butt <shakeelb@google.com> wrote:
>
> On Wed, Mar 10, 2021 at 10:54 AM Yang Shi <shy828301@gmail.com> wrote:
> >
> > On Wed, Mar 10, 2021 at 10:24 AM Shakeel Butt <shakeelb@google.com> wrote:
> > >
> > > On Wed, Mar 10, 2021 at 9:46 AM Yang Shi <shy828301@gmail.com> wrote:
> > > >
> > > > The number of deferred objects might get windup to an absurd number, and it
> > > > results in clamp of slab objects.  It is undesirable for sustaining workingset.
> > > >
> > > > So shrink deferred objects proportional to priority and cap nr_deferred to twice
> > > > of cache items.
> > > >
> > > > The idea is borrowed from Dave Chinner's patch:
> > > > https://lore.kernel.org/linux-xfs/20191031234618.15403-13-david@fromorbit.com/
> > > >
> > > > Tested with kernel build and vfs metadata heavy workload in our production
> > > > environment, no regression is spotted so far.
> > >
> > > Did you run both of these workloads in the same cgroup or separate cgroups?
> >
> > Both are covered.
> >
>
> Have you tried just this patch i.e. without the first 12 patches?

No. It could be applied without the first 12 patches, but I didn't
test this combination specifically since I don't think it would behave
any differently than it does with the first 12 patches applied. I tested
running the test case under the root memcg; that seems equivalent to running
without the first 12 patches, and the only difference is where nr_deferred
is read from.
Shakeel Butt March 10, 2021, 10:40 p.m. UTC | #5
On Wed, Mar 10, 2021 at 1:41 PM Yang Shi <shy828301@gmail.com> wrote:
>
> On Wed, Mar 10, 2021 at 1:08 PM Shakeel Butt <shakeelb@google.com> wrote:
> >
> > On Wed, Mar 10, 2021 at 10:54 AM Yang Shi <shy828301@gmail.com> wrote:
> > >
> > > On Wed, Mar 10, 2021 at 10:24 AM Shakeel Butt <shakeelb@google.com> wrote:
> > > >
> > > > On Wed, Mar 10, 2021 at 9:46 AM Yang Shi <shy828301@gmail.com> wrote:
> > > > >
> > > > > The number of deferred objects might get windup to an absurd number, and it
> > > > > results in clamp of slab objects.  It is undesirable for sustaining workingset.
> > > > >
> > > > > So shrink deferred objects proportional to priority and cap nr_deferred to twice
> > > > > of cache items.
> > > > >
> > > > > The idea is borrowed from Dave Chinner's patch:
> > > > > https://lore.kernel.org/linux-xfs/20191031234618.15403-13-david@fromorbit.com/
> > > > >
> > > > > Tested with kernel build and vfs metadata heavy workload in our production
> > > > > environment, no regression is spotted so far.
> > > >
> > > > Did you run both of these workloads in the same cgroup or separate cgroups?
> > >
> > > Both are covered.
> > >
> >
> > Have you tried just this patch i.e. without the first 12 patches?
>
> No. It could be applied without the first 12 patches, but I didn't
> test this combination specifically since I don't think it would have
> any difference from with the first 12 patches. I tested running the
> test case under root memcg, it seems equal to w/o the first 12 patches
> and the only difference is where to get nr_deferred.

I am trying to measure the impact of this patch independently. One
point I can think of is global reclaim. The first 12 patches do
not aim to improve global reclaim, but this patch will. I am just
wondering what the negative effects of this patch would be, if any.
Yang Shi March 10, 2021, 11:01 p.m. UTC | #6
On Wed, Mar 10, 2021 at 2:41 PM Shakeel Butt <shakeelb@google.com> wrote:
>
> On Wed, Mar 10, 2021 at 1:41 PM Yang Shi <shy828301@gmail.com> wrote:
> >
> > On Wed, Mar 10, 2021 at 1:08 PM Shakeel Butt <shakeelb@google.com> wrote:
> > >
> > > On Wed, Mar 10, 2021 at 10:54 AM Yang Shi <shy828301@gmail.com> wrote:
> > > >
> > > > On Wed, Mar 10, 2021 at 10:24 AM Shakeel Butt <shakeelb@google.com> wrote:
> > > > >
> > > > > On Wed, Mar 10, 2021 at 9:46 AM Yang Shi <shy828301@gmail.com> wrote:
> > > > > >
> > > > > > The number of deferred objects might get windup to an absurd number, and it
> > > > > > results in clamp of slab objects.  It is undesirable for sustaining workingset.
> > > > > >
> > > > > > So shrink deferred objects proportional to priority and cap nr_deferred to twice
> > > > > > of cache items.
> > > > > >
> > > > > > The idea is borrowed from Dave Chinner's patch:
> > > > > > https://lore.kernel.org/linux-xfs/20191031234618.15403-13-david@fromorbit.com/
> > > > > >
> > > > > > Tested with kernel build and vfs metadata heavy workload in our production
> > > > > > environment, no regression is spotted so far.
> > > > >
> > > > > Did you run both of these workloads in the same cgroup or separate cgroups?
> > > >
> > > > Both are covered.
> > > >
> > >
> > > Have you tried just this patch i.e. without the first 12 patches?
> >
> > No. It could be applied without the first 12 patches, but I didn't
> > test this combination specifically since I don't think it would have
> > any difference from with the first 12 patches. I tested running the
> > test case under root memcg, it seems equal to w/o the first 12 patches
> > and the only difference is where to get nr_deferred.
>
> I am trying to measure the impact of this patch independently. One
> point I can think of is the global reclaim. The first 12 patches do
> not aim to improve the global reclaim but this patch will. I am just
> wondering what would be negative if any of this patch.

Feel free to do so. More tests with more workloads are definitely
appreciated. They could give us more confidence in this patch or
catch regressions sooner.
diff mbox series

Patch

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9a2dfeaa79f4..6a0a91b23597 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -662,7 +662,6 @@  static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 	 */
 	nr = xchg_nr_deferred(shrinker, shrinkctl);
 
-	total_scan = nr;
 	if (shrinker->seeks) {
 		delta = freeable >> priority;
 		delta *= 4;
@@ -676,37 +675,9 @@  static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 		delta = freeable / 2;
 	}
 
+	total_scan = nr >> priority;
 	total_scan += delta;
-	if (total_scan < 0) {
-		pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
-		       shrinker->scan_objects, total_scan);
-		total_scan = freeable;
-		next_deferred = nr;
-	} else
-		next_deferred = total_scan;
-
-	/*
-	 * We need to avoid excessive windup on filesystem shrinkers
-	 * due to large numbers of GFP_NOFS allocations causing the
-	 * shrinkers to return -1 all the time. This results in a large
-	 * nr being built up so when a shrink that can do some work
-	 * comes along it empties the entire cache due to nr >>>
-	 * freeable. This is bad for sustaining a working set in
-	 * memory.
-	 *
-	 * Hence only allow the shrinker to scan the entire cache when
-	 * a large delta change is calculated directly.
-	 */
-	if (delta < freeable / 4)
-		total_scan = min(total_scan, freeable / 2);
-
-	/*
-	 * Avoid risking looping forever due to too large nr value:
-	 * never try to free more than twice the estimate number of
-	 * freeable entries.
-	 */
-	if (total_scan > freeable * 2)
-		total_scan = freeable * 2;
+	total_scan = min(total_scan, (2 * freeable));
 
 	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
 				   freeable, delta, total_scan, priority);
@@ -745,10 +716,15 @@  static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 		cond_resched();
 	}
 
-	if (next_deferred >= scanned)
-		next_deferred -= scanned;
-	else
-		next_deferred = 0;
+	/*
+	 * The deferred work is increased by any new work (delta) that wasn't
+	 * done, decreased by old deferred work that was done now.
+	 *
+	 * And it is capped to two times of the freeable items.
+	 */
+	next_deferred = max_t(long, (nr + delta - scanned), 0);
+	next_deferred = min(next_deferred, (2 * freeable));
+
 	/*
 	 * move the unused scan count back into the shrinker in a
 	 * manner that handles concurrent updates.