
[1/1] mm: add swappiness= arg to memory.reclaim

Message ID 20231130153658.527556-2-schatzberg.dan@gmail.com (mailing list archive)
State New
Series Add swappiness argument to memory.reclaim

Commit Message

Dan Schatzberg Nov. 30, 2023, 3:36 p.m. UTC
Allow proactive reclaimers to submit an additional swappiness=<val>
argument to memory.reclaim. This overrides the global or per-memcg
swappiness setting for that reclaim attempt.

For example:

echo "2M swappiness=0" > /sys/fs/cgroup/memory.reclaim

will perform reclaim on the rootcg with a swappiness setting of 0 (no
swap) regardless of the vm.swappiness sysctl setting.

Signed-off-by: Dan Schatzberg <schatzberg.dan@gmail.com>
---
 include/linux/swap.h |  3 ++-
 mm/memcontrol.c      | 55 +++++++++++++++++++++++++++++++++++---------
 mm/vmscan.c          | 13 +++++++++--
 3 files changed, 57 insertions(+), 14 deletions(-)
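
As a usage illustration (not part of the patch), a minimal C sketch of
driving this interface from userspace follows; the cgroup path and the
amount are hypothetical, the write semantics are as described above.

  /* Sketch: request proactive reclaim of 2 MiB from a cgroup, preferring
   * file pages only (swappiness=0). Path and amount are hypothetical. */
  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
          const char *path = "/sys/fs/cgroup/workload/memory.reclaim";
          const char *req = "2M swappiness=0";
          int fd = open(path, O_WRONLY);

          if (fd < 0) {
                  perror("open");
                  return 1;
          }
          /* On success the write consumes the whole request; on failure
           * it returns -1 with errno set (e.g. EAGAIN when the requested
           * amount could not be fully reclaimed). */
          if (write(fd, req, strlen(req)) < 0)
                  perror("write");
          close(fd);
          return 0;
  }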

Comments

Andrew Morton Nov. 30, 2023, 9:33 p.m. UTC | #1
On Thu, 30 Nov 2023 07:36:54 -0800 Dan Schatzberg <schatzberg.dan@gmail.com> wrote:

> Allow proactive reclaimers to submit an additional swappiness=<val>
> argument to memory.reclaim. This overrides the global or per-memcg
> swappiness setting for that reclaim attempt.
> 
> For example:
> 
> echo "2M swappiness=0" > /sys/fs/cgroup/memory.reclaim
> 
> will perform reclaim on the rootcg with a swappiness setting of 0 (no
> swap) regardless of the vm.swappiness sysctl setting.
> 
> Signed-off-by: Dan Schatzberg <schatzberg.dan@gmail.com>
> ---
>  include/linux/swap.h |  3 ++-
>  mm/memcontrol.c      | 55 +++++++++++++++++++++++++++++++++++---------
>  mm/vmscan.c          | 13 +++++++++--

Documentation/admin-guide/cgroup-v2.rst is feeling unloved!

Please check whether this interface change can lead to
non-backward-compatible userspace.  If someone's script does the above
echo command, will it break on older kernels?  If so, does it matter?
Dan Schatzberg Nov. 30, 2023, 9:46 p.m. UTC | #2
On Thu, Nov 30, 2023 at 01:33:40PM -0800, Andrew Morton wrote:
> On Thu, 30 Nov 2023 07:36:54 -0800 Dan Schatzberg <schatzberg.dan@gmail.com> wrote:
> 
> > Allow proactive reclaimers to submit an additional swappiness=<val>
> > argument to memory.reclaim. This overrides the global or per-memcg
> > swappiness setting for that reclaim attempt.
> > 
> > For example:
> > 
> > echo "2M swappiness=0" > /sys/fs/cgroup/memory.reclaim
> > 
> > will perform reclaim on the rootcg with a swappiness setting of 0 (no
> > swap) regardless of the vm.swappiness sysctl setting.
> > 
> > Signed-off-by: Dan Schatzberg <schatzberg.dan@gmail.com>
> > ---
> >  include/linux/swap.h |  3 ++-
> >  mm/memcontrol.c      | 55 +++++++++++++++++++++++++++++++++++---------
> >  mm/vmscan.c          | 13 +++++++++--
> 
> Documentation/admin-guide/cgroup-v2.rst is feeling unloved!

Oops - total oversight on my part. I'll add docs in a V2 if we can
come to consensus on this interface change in general.

> 
> Please check whether this interface change can lead to
> non-backward-compatible userspace.  If someone's script does the above
> echo command, will it break on older kernels?  If so, does it matter?

Older kernels will return -EINVAL with such a command - that seems
appropriate, indicating that the argument is not supported.
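
A sketch of the fallback a userspace tool could implement, assuming the
-EINVAL behavior described above (the helper name is hypothetical):

  /* Sketch: probe whether memory.reclaim accepts swappiness= by writing
   * a zero-sized request. Kernels with this patch parse it and reclaim
   * nothing; older kernels fail the parse and return -EINVAL. */
  #include <stdbool.h>
  #include <string.h>
  #include <unistd.h>

  static bool reclaim_supports_swappiness(int fd)
  {
          const char *probe = "0 swappiness=0";

          return write(fd, probe, strlen(probe)) >= 0;
  }
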
Huan Yang Dec. 1, 2023, 1:56 a.m. UTC | #3
On 2023/11/30 23:36, Dan Schatzberg wrote:
>
> Allow proactive reclaimers to submit an additional swappiness=<val>
> argument to memory.reclaim. This overrides the global or per-memcg
> swappiness setting for that reclaim attempt.
>
> For example:
>
> echo "2M swappiness=0" > /sys/fs/cgroup/memory.reclaim
>
> will perform reclaim on the rootcg with a swappiness setting of 0 (no
> swap) regardless of the vm.swappiness sysctl setting.
>
> Signed-off-by: Dan Schatzberg <schatzberg.dan@gmail.com>
> ---
>   include/linux/swap.h |  3 ++-
>   mm/memcontrol.c      | 55 +++++++++++++++++++++++++++++++++++---------
>   mm/vmscan.c          | 13 +++++++++--
>   3 files changed, 57 insertions(+), 14 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index f6dd6575b905..c6e309199f10 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -410,7 +410,8 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
>   extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
>                                                    unsigned long nr_pages,
>                                                    gfp_t gfp_mask,
> -                                                 unsigned int reclaim_options);
> +                                                 unsigned int reclaim_options,
> +                                                 int *swappiness);
>   extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
>                                                  gfp_t gfp_mask, bool noswap,
>                                                  pg_data_t *pgdat,
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 1c1061df9cd1..ba1c89455ab0 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -63,6 +63,7 @@
>   #include <linux/resume_user_mode.h>
>   #include <linux/psi.h>
>   #include <linux/seq_buf.h>
> +#include <linux/parser.h>
>   #include <linux/sched/isolation.h>
>   #include "internal.h"
>   #include <net/sock.h>
> @@ -2449,7 +2450,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
>                  psi_memstall_enter(&pflags);
>                  nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
>                                                          gfp_mask,
> -                                                       MEMCG_RECLAIM_MAY_SWAP);
> +                                                       MEMCG_RECLAIM_MAY_SWAP, NULL);
>                  psi_memstall_leave(&pflags);
>          } while ((memcg = parent_mem_cgroup(memcg)) &&
>                   !mem_cgroup_is_root(memcg));
> @@ -2740,7 +2741,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
>
>          psi_memstall_enter(&pflags);
>          nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
> -                                                   gfp_mask, reclaim_options);
> +                                                   gfp_mask, reclaim_options, NULL);
>          psi_memstall_leave(&pflags);
>
>          if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
> @@ -3660,7 +3661,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
>                  }
>
>                  if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
> -                                       memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
> +                                       memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
>                          ret = -EBUSY;
>                          break;
>                  }
> @@ -3774,7 +3775,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
>                          return -EINTR;
>
>                  if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
> -                                                 MEMCG_RECLAIM_MAY_SWAP))
> +                                                 MEMCG_RECLAIM_MAY_SWAP, NULL))
>                          nr_retries--;
>          }
>
> @@ -6720,7 +6721,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
>                  }
>
>                  reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
> -                                       GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
> +                                       GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL);
>
>                  if (!reclaimed && !nr_retries--)
>                          break;
> @@ -6769,7 +6770,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
>
>                  if (nr_reclaims) {
>                          if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
> -                                       GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
> +                                       GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL))
>                                  nr_reclaims--;
>                          continue;
>                  }
> @@ -6895,6 +6896,16 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
>          return nbytes;
>   }
>
> +enum {
> +       MEMORY_RECLAIM_SWAPPINESS = 0,
> +       MEMORY_RECLAIM_NULL,
> +};
> +
> +static const match_table_t if_tokens = {
> +       { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
> +       { MEMORY_RECLAIM_NULL, NULL },
> +};
> +
>   static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
>                                size_t nbytes, loff_t off)
>   {
> @@ -6902,12 +6913,33 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
>          unsigned int nr_retries = MAX_RECLAIM_RETRIES;
>          unsigned long nr_to_reclaim, nr_reclaimed = 0;
>          unsigned int reclaim_options;
> -       int err;
> +       char *old_buf, *start;
> +       substring_t args[MAX_OPT_ARGS];
> +       int swappiness = -1;
>
>          buf = strstrip(buf);
> -       err = page_counter_memparse(buf, "", &nr_to_reclaim);
> -       if (err)
> -               return err;
> +
> +       old_buf = buf;
> +       nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
> +       if (buf == old_buf)
> +               return -EINVAL;
> +
> +       buf = strstrip(buf);
> +
> +       while ((start = strsep(&buf, " ")) != NULL) {
> +               if (!strlen(start))
> +                       continue;
> +               switch (match_token(start, if_tokens, args)) {
> +               case MEMORY_RECLAIM_SWAPPINESS:
> +                       if (match_int(&args[0], &swappiness))
> +                               return -EINVAL;
> +                       if (swappiness < 0 || swappiness > 200)
> +                               return -EINVAL;
> +                       break;
> +               default:
> +                       return -EINVAL;
> +               }
> +       }
>
>          reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
>          while (nr_reclaimed < nr_to_reclaim) {
> @@ -6926,7 +6958,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
>
>                  reclaimed = try_to_free_mem_cgroup_pages(memcg,
>                                          min(nr_to_reclaim - nr_reclaimed, SWAP_CLUSTER_MAX),
> -                                       GFP_KERNEL, reclaim_options);
> +                                       GFP_KERNEL, reclaim_options,
> +                                       swappiness == -1 ? NULL : &swappiness);
>
>                  if (!reclaimed && !nr_retries--)
>                          return -EAGAIN;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 506f8220c5fe..546704ea01e1 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -136,6 +136,9 @@ struct scan_control {
>          /* Always discard instead of demoting to lower tier memory */
>          unsigned int no_demotion:1;
>
> +       /* Swappiness value for reclaim, if NULL use memcg/global value */
> +       int *swappiness;
> +
>          /* Allocation order */
>          s8 order;
>
> @@ -2327,7 +2330,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
>          struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>          struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>          unsigned long anon_cost, file_cost, total_cost;
> -       int swappiness = mem_cgroup_swappiness(memcg);
> +       int swappiness = sc->swappiness ?
> +               *sc->swappiness : mem_cgroup_swappiness(memcg);
Should we use "unlikely" here to indicate that sc->swappiness is an
unexpected case, since the only current use is in proactive reclaim?
>          u64 fraction[ANON_AND_FILE];
>          u64 denominator = 0;    /* gcc */
>          enum scan_balance scan_balance;
> @@ -2608,6 +2612,9 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
>              mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
>                  return 0;
>
> +       if (sc->swappiness)
> +               return *sc->swappiness;
Same question here.
> +
>          return mem_cgroup_swappiness(memcg);
>   }
>
> @@ -6433,7 +6440,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
>   unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
>                                             unsigned long nr_pages,
>                                             gfp_t gfp_mask,
> -                                          unsigned int reclaim_options)
> +                                          unsigned int reclaim_options,
> +                                          int *swappiness)
>   {
>          unsigned long nr_reclaimed;
>          unsigned int noreclaim_flag;
> @@ -6448,6 +6456,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
>                  .may_unmap = 1,
>                  .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
>                  .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
> +               .swappiness = swappiness,
>          };
>          /*
>           * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
> --
> 2.34.1
My previous patch attempted to ensure fully deterministic semantics
under extreme swappiness. For example, when swappiness is set to 200,
only anonymous pages should be reclaimed. However, in MGLRU,
isolate_folios() will try the other type if scanning the preferred one
yields nothing (we do not want it to attempt this behavior). What do
you think about extreme swappiness scenarios?
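
(For context, the MGLRU behavior referred to above is the type-fallback
loop in isolate_folios(); in simplified form, not the verbatim kernel
code, it looks like this:)

  /* Simplified sketch of the fallback in mm/vmscan.c:isolate_folios():
   * if scanning the preferred type yields nothing, flip to the other
   * type -- so even swappiness == 200 may end up scanning file pages. */
  for (i = !swappiness; i < ANON_AND_FILE; i++) {
          scanned = scan_folios(lruvec, sc, type, tier, list);
          if (scanned)
                  break;

          type = !type;   /* fall back to the other LRU type */
  }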
Yosry Ahmed Dec. 1, 2023, 2:05 a.m. UTC | #4
> @@ -2327,7 +2330,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
>         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>         unsigned long anon_cost, file_cost, total_cost;
> -       int swappiness = mem_cgroup_swappiness(memcg);
> +       int swappiness = sc->swappiness ?
> +               *sc->swappiness : mem_cgroup_swappiness(memcg);
>
> Should we use "unlikely" here to indicate that sc->swappiness is an unexpected case,
> since the only current use is in proactive reclaim?

On a system that is not under memory pressure, the rate of proactive
reclaim could be higher than reactive reclaim. We should only use
likely/unlikely when it's obvious a scenario will happen most of the
time. I don't believe that's the case here.
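
For reference, the annotation being discussed would look like this
(sketch only):

  /* The suggested hint, sketched -- see above for why it is probably
   * not warranted here: */
  int swappiness = unlikely(sc->swappiness) ?
          *sc->swappiness : mem_cgroup_swappiness(memcg);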

>
>         u64 fraction[ANON_AND_FILE];
>         u64 denominator = 0;    /* gcc */
>         enum scan_balance scan_balance;
> @@ -2608,6 +2612,9 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
>             mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
>                 return 0;
>
> +       if (sc->swappiness)
> +               return *sc->swappiness;
>
> Same question here.
>
> +
>         return mem_cgroup_swappiness(memcg);
>  }
>
> @@ -6433,7 +6440,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
>  unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
>                                            unsigned long nr_pages,
>                                            gfp_t gfp_mask,
> -                                          unsigned int reclaim_options)
> +                                          unsigned int reclaim_options,
> +                                          int *swappiness)
>  {
>         unsigned long nr_reclaimed;
>         unsigned int noreclaim_flag;
> @@ -6448,6 +6456,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
>                 .may_unmap = 1,
>                 .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
>                 .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
> +               .swappiness = swappiness,
>         };
>         /*
>          * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
> --
> 2.34.1
>
> My previous patch attempted to ensure fully deterministic semantics under extreme swappiness.
> For example, when swappiness is set to 200, only anonymous pages should be reclaimed.
> However, in MGLRU, isolate_folios() will try the other type if scanning the
> preferred one yields nothing (we do not want it to attempt this behavior).
> What do you think about extreme swappiness scenarios?

I think having different semantics between swappiness passed to
proactive reclaim and global swappiness can be confusing. If it's
needed to have a swappiness value that says "anon only no matter
what", perhaps we should introduce such a new value and make it
supported by both global and proactive reclaim swappiness? We could
support writing "max" or something similar instead of a special value
to mean that.

Writing such value to global swappiness may cause problems and
premature OOMs IIUC, but that would be misconfiguration. If we think
that's dangerous, we can introduce this new value but make it valid
only for proactive reclaim for now.
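
If that direction were taken, the token table from the patch could grow a
dedicated entry; a hypothetical sketch (no such token exists in the posted
patch):

  /* Hypothetical extension: accept "swappiness=max" as an explicit
   * "anon only" hint alongside the numeric form. */
  enum {
          MEMORY_RECLAIM_SWAPPINESS = 0,
          MEMORY_RECLAIM_SWAPPINESS_MAX,
          MEMORY_RECLAIM_NULL,
  };

  static const match_table_t tokens = {
          /* The literal pattern is listed before the %d form. */
          { MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max" },
          { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d" },
          { MEMORY_RECLAIM_NULL, NULL },
  };

memory_reclaim() would then map the new token to the "always anon first"
behavior, with the exact semantics left to the discussion above.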
Huan Yang Dec. 1, 2023, 2:13 a.m. UTC | #5
On 2023/12/1 10:05, Yosry Ahmed wrote:
>> @@ -2327,7 +2330,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
>>          struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>>          struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>>          unsigned long anon_cost, file_cost, total_cost;
>> -       int swappiness = mem_cgroup_swappiness(memcg);
>> +       int swappiness = sc->swappiness ?
>> +               *sc->swappiness : mem_cgroup_swappiness(memcg);
>>
>> Should we use "unlikely" here to indicate that sc->swappiness is an unexpected case,
>> since the only current use is in proactive reclaim?
> On a system that is not under memory pressure, the rate of proactive
> reclaim could be higher than reactive reclaim. We should only use
> likely/unlikely when it's obvious a scenario will happen most of the
> time. I don't believe that's the case here.
Not all vendors will use proactive interfaces, and reactive reclaim is
normal system behavior. In this regard, I think it is appropriate to add
"unlikely".
>
>>          u64 fraction[ANON_AND_FILE];
>>          u64 denominator = 0;    /* gcc */
>>          enum scan_balance scan_balance;
>> @@ -2608,6 +2612,9 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
>>              mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
>>                  return 0;
>>
>> +       if (sc->swappiness)
>> +               return *sc->swappiness;
>>
>> Same question here.
>>
>> +
>>          return mem_cgroup_swappiness(memcg);
>>   }
>>
>> @@ -6433,7 +6440,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
>>   unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
>>                                             unsigned long nr_pages,
>>                                             gfp_t gfp_mask,
>> -                                          unsigned int reclaim_options)
>> +                                          unsigned int reclaim_options,
>> +                                          int *swappiness)
>>   {
>>          unsigned long nr_reclaimed;
>>          unsigned int noreclaim_flag;
>> @@ -6448,6 +6456,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
>>                  .may_unmap = 1,
>>                  .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
>>                  .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
>> +               .swappiness = swappiness,
>>          };
>>          /*
>>           * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
>> --
>> 2.34.1
>>
>> My previous patch attempted to ensure fully deterministic semantics under extreme swappiness.
>> For example, when swappiness is set to 200, only anonymous pages should be reclaimed.
>> However, in MGLRU, isolate_folios() will try the other type if scanning the
>> preferred one yields nothing (we do not want it to attempt this behavior).
>> What do you think about extreme swappiness scenarios?
> I think having different semantics between swappiness passed to
> proactive reclaim and global swappiness can be confusing. If it's
> needed to have a swappiness value that says "anon only no matter
> what", perhaps we should introduce such a new value and make it
> supported by both global and proactive reclaim swappiness? We could
> support writing "max" or something similar instead of a special value
> to mean that.

Yes, another hint would be more suitable for this scenario.

However, this patch does not seem to support that feature.
Do you have a need for this scenario?

>
> Writing such value to global swappiness may cause problems and
> premature OOMs IIUC, but that would be misconfiguration. If we think
> that's dangerous, we can introduce this new value but make it valid
> only for proactive reclaim for now.
Yosry Ahmed Dec. 1, 2023, 2:17 a.m. UTC | #6
On Thu, Nov 30, 2023 at 6:14 PM Huan Yang <11133793@vivo.com> wrote:
>
>
> On 2023/12/1 10:05, Yosry Ahmed wrote:
> >> @@ -2327,7 +2330,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
> >>          struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> >>          struct mem_cgroup *memcg = lruvec_memcg(lruvec);
> >>          unsigned long anon_cost, file_cost, total_cost;
> >> -       int swappiness = mem_cgroup_swappiness(memcg);
> >> +       int swappiness = sc->swappiness ?
> >> +               *sc->swappiness : mem_cgroup_swappiness(memcg);
> >>
> >> Should we use "unlikely" here to indicate that sc->swappiness is an unexpected case,
> >> since the only current use is in proactive reclaim?
> > On a system that is not under memory pressure, the rate of proactive
> > reclaim could be higher than reactive reclaim. We should only use
> > likely/unlikely when it's obvious a scenario will happen most of the
> > time. I don't believe that's the case here.
> Not all vendors will use proactive interfaces, and reactive reclaim is
> normal system behavior. In this regard, I think it is appropriate to add
> "unlikely".

The general guidance is not to use likely/unlikely when it's not
certain, which I believe is the case here. I think the CPU will make
better decisions on its own than if we give it hints that are wrong in
some situations. Others please correct me if I am wrong.

> >
> >>          u64 fraction[ANON_AND_FILE];
> >>          u64 denominator = 0;    /* gcc */
> >>          enum scan_balance scan_balance;
> >> @@ -2608,6 +2612,9 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
> >>              mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
> >>                  return 0;
> >>
> >> +       if (sc->swappiness)
> >> +               return *sc->swappiness;
> >>
> >> Same question here.
> >>
> >> +
> >>          return mem_cgroup_swappiness(memcg);
> >>   }
> >>
> >> @@ -6433,7 +6440,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
> >>   unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> >>                                             unsigned long nr_pages,
> >>                                             gfp_t gfp_mask,
> >> -                                          unsigned int reclaim_options)
> >> +                                          unsigned int reclaim_options,
> >> +                                          int *swappiness)
> >>   {
> >>          unsigned long nr_reclaimed;
> >>          unsigned int noreclaim_flag;
> >> @@ -6448,6 +6456,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> >>                  .may_unmap = 1,
> >>                  .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
> >>                  .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
> >> +               .swappiness = swappiness,
> >>          };
> >>          /*
> >>           * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
> >> --
> >> 2.34.1
> >>
> >> My previous patch attempted to ensure fully deterministic semantics under extreme swappiness.
> >> For example, when swappiness is set to 200, only anonymous pages should be reclaimed.
> >> However, in MGLRU, isolate_folios() will try the other type if scanning the
> >> preferred one yields nothing (we do not want it to attempt this behavior).
> >> What do you think about extreme swappiness scenarios?
> > I think having different semantics between swappiness passed to
> > proactive reclaim and global swappiness can be confusing. If it's
> > needed to have a swappiness value that says "anon only no matter
> > what", perhaps we should introduce such a new value and make it
> > supported by both global and proactive reclaim swappiness? We could
> > support writing "max" or something similar instead of a special value
> > to mean that.
>
> Yes, use other hint more suitable for this scenario.
>
> However, from this patch, it seems that this feature is not supported.
> Do you have a demand for this scenario?

We do anonymous-only proactive reclaim in some setups, so it would be
nice to have. I am not sure if it's absolutely needed vs. just using
swappiness=200 and living with the possibility of reclaiming some file
pages.
Huan Yang Dec. 1, 2023, 2:24 a.m. UTC | #7
On 2023/12/1 10:17, Yosry Ahmed wrote:
> On Thu, Nov 30, 2023 at 6:14 PM Huan Yang <11133793@vivo.com> wrote:
>>
>> On 2023/12/1 10:05, Yosry Ahmed wrote:
>>>> @@ -2327,7 +2330,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
>>>>           struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>>>>           struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>>>>           unsigned long anon_cost, file_cost, total_cost;
>>>> -       int swappiness = mem_cgroup_swappiness(memcg);
>>>> +       int swappiness = sc->swappiness ?
>>>> +               *sc->swappiness : mem_cgroup_swappiness(memcg);
>>>>
>>>> Should we use "unlikely" here to indicate that sc->swappiness is an unexpected case,
>>>> since the only current use is in proactive reclaim?
>>> On a system that is not under memory pressure, the rate of proactive
>>> reclaim could be higher than reactive reclaim. We should only use
>>> likely/unlikely when it's obvious a scenario will happen most of the
>>> time. I don't believe that's the case here.
>> Not all vendors will use proactive interfaces, and reactive reclaim is
>> normal system behavior. In this regard, I think it is appropriate to add
>> "unlikely".
> The general guidance is not to use likely/unlikely when it's not
> certain, which I believe is the case here. I think the CPU will make
OK, I will keep that in mind.
> better decisions on its own than if we give it hints that are wrong in
> some situations. Others please correct me if I am wrong.
No, you're right. The CPU is good at this.
>
>>>>           u64 fraction[ANON_AND_FILE];
>>>>           u64 denominator = 0;    /* gcc */
>>>>           enum scan_balance scan_balance;
>>>> @@ -2608,6 +2612,9 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
>>>>               mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
>>>>                   return 0;
>>>>
>>>> +       if (sc->swappiness)
>>>> +               return *sc->swappiness;
>>>>
>>>> Same question here.
>>>>
>>>> +
>>>>           return mem_cgroup_swappiness(memcg);
>>>>    }
>>>>
>>>> @@ -6433,7 +6440,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
>>>>    unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
>>>>                                              unsigned long nr_pages,
>>>>                                              gfp_t gfp_mask,
>>>> -                                          unsigned int reclaim_options)
>>>> +                                          unsigned int reclaim_options,
>>>> +                                          int *swappiness)
>>>>    {
>>>>           unsigned long nr_reclaimed;
>>>>           unsigned int noreclaim_flag;
>>>> @@ -6448,6 +6456,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
>>>>                   .may_unmap = 1,
>>>>                   .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
>>>>                   .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
>>>> +               .swappiness = swappiness,
>>>>           };
>>>>           /*
>>>>            * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
>>>> --
>>>> 2.34.1
>>>>
>>>> My previous patch attempted to ensure fully deterministic semantics under extreme swappiness.
>>>> For example, when swappiness is set to 200, only anonymous pages should be reclaimed.
>>>> However, in MGLRU, isolate_folios() will try the other type if scanning the
>>>> preferred one yields nothing (we do not want it to attempt this behavior).
>>>> What do you think about extreme swappiness scenarios?
>>> I think having different semantics between swappiness passed to
>>> proactive reclaim and global swappiness can be confusing. If it's
>>> needed to have a swappiness value that says "anon only no matter
>>> what", perhaps we should introduce such a new value and make it
>>> supported by both global and proactive reclaim swappiness? We could
>>> support writing "max" or something similar instead of a special value
>>> to mean that.
>> Yes, another hint would be more suitable for this scenario.
>>
>> However, this patch does not seem to support that feature.
>> Do you have a need for this scenario?
> We do anonymous-only proactive reclaim in some setups, so it would be
> nice to have. I am not sure if it's absolutely needed vs. just using
> swappiness=200 and living with the possibility of reclaiming some file
> pages.
Right now, swappiness=200 is sufficient for our scenario, but a hint that
only reclaims anonymous pages would have clear semantics suitable for
upper-level policy, rather than relying solely on the behavior of
swappiness.

Patch

diff --git a/include/linux/swap.h b/include/linux/swap.h
index f6dd6575b905..c6e309199f10 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -410,7 +410,8 @@  extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 						  unsigned long nr_pages,
 						  gfp_t gfp_mask,
-						  unsigned int reclaim_options);
+						  unsigned int reclaim_options,
+						  int *swappiness);
 extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
 						pg_data_t *pgdat,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1c1061df9cd1..ba1c89455ab0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -63,6 +63,7 @@ 
 #include <linux/resume_user_mode.h>
 #include <linux/psi.h>
 #include <linux/seq_buf.h>
+#include <linux/parser.h>
 #include <linux/sched/isolation.h>
 #include "internal.h"
 #include <net/sock.h>
@@ -2449,7 +2450,7 @@  static unsigned long reclaim_high(struct mem_cgroup *memcg,
 		psi_memstall_enter(&pflags);
 		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
 							gfp_mask,
-							MEMCG_RECLAIM_MAY_SWAP);
+							MEMCG_RECLAIM_MAY_SWAP, NULL);
 		psi_memstall_leave(&pflags);
 	} while ((memcg = parent_mem_cgroup(memcg)) &&
 		 !mem_cgroup_is_root(memcg));
@@ -2740,7 +2741,7 @@  static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
 	psi_memstall_enter(&pflags);
 	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
-						    gfp_mask, reclaim_options);
+						    gfp_mask, reclaim_options, NULL);
 	psi_memstall_leave(&pflags);
 
 	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -3660,7 +3661,7 @@  static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
 		}
 
 		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
-					memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
+					memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
 			ret = -EBUSY;
 			break;
 		}
@@ -3774,7 +3775,7 @@  static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 			return -EINTR;
 
 		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
-						  MEMCG_RECLAIM_MAY_SWAP))
+						  MEMCG_RECLAIM_MAY_SWAP, NULL))
 			nr_retries--;
 	}
 
@@ -6720,7 +6721,7 @@  static ssize_t memory_high_write(struct kernfs_open_file *of,
 		}
 
 		reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
-					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
+					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL);
 
 		if (!reclaimed && !nr_retries--)
 			break;
@@ -6769,7 +6770,7 @@  static ssize_t memory_max_write(struct kernfs_open_file *of,
 
 		if (nr_reclaims) {
 			if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
-					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
+					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL))
 				nr_reclaims--;
 			continue;
 		}
@@ -6895,6 +6896,16 @@  static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
 	return nbytes;
 }
 
+enum {
+	MEMORY_RECLAIM_SWAPPINESS = 0,
+	MEMORY_RECLAIM_NULL,
+};
+
+static const match_table_t if_tokens = {
+	{ MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
+	{ MEMORY_RECLAIM_NULL, NULL },
+};
+
 static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 			      size_t nbytes, loff_t off)
 {
@@ -6902,12 +6913,33 @@  static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
 	unsigned long nr_to_reclaim, nr_reclaimed = 0;
 	unsigned int reclaim_options;
-	int err;
+	char *old_buf, *start;
+	substring_t args[MAX_OPT_ARGS];
+	int swappiness = -1;
 
 	buf = strstrip(buf);
-	err = page_counter_memparse(buf, "", &nr_to_reclaim);
-	if (err)
-		return err;
+
+	old_buf = buf;
+	nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
+	if (buf == old_buf)
+		return -EINVAL;
+
+	buf = strstrip(buf);
+
+	while ((start = strsep(&buf, " ")) != NULL) {
+		if (!strlen(start))
+			continue;
+		switch (match_token(start, if_tokens, args)) {
+		case MEMORY_RECLAIM_SWAPPINESS:
+			if (match_int(&args[0], &swappiness))
+				return -EINVAL;
+			if (swappiness < 0 || swappiness > 200)
+				return -EINVAL;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
 
 	reclaim_options	= MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
 	while (nr_reclaimed < nr_to_reclaim) {
@@ -6926,7 +6958,8 @@  static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 
 		reclaimed = try_to_free_mem_cgroup_pages(memcg,
 					min(nr_to_reclaim - nr_reclaimed, SWAP_CLUSTER_MAX),
-					GFP_KERNEL, reclaim_options);
+					GFP_KERNEL, reclaim_options,
+					swappiness == -1 ? NULL : &swappiness);
 
 		if (!reclaimed && !nr_retries--)
 			return -EAGAIN;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 506f8220c5fe..546704ea01e1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -136,6 +136,9 @@  struct scan_control {
 	/* Always discard instead of demoting to lower tier memory */
 	unsigned int no_demotion:1;
 
+	/* Swappiness value for reclaim, if NULL use memcg/global value */
+	int *swappiness;
+
 	/* Allocation order */
 	s8 order;
 
@@ -2327,7 +2330,8 @@  static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	unsigned long anon_cost, file_cost, total_cost;
-	int swappiness = mem_cgroup_swappiness(memcg);
+	int swappiness = sc->swappiness ?
+		*sc->swappiness : mem_cgroup_swappiness(memcg);
 	u64 fraction[ANON_AND_FILE];
 	u64 denominator = 0;	/* gcc */
 	enum scan_balance scan_balance;
@@ -2608,6 +2612,9 @@  static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
 	    mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
 		return 0;
 
+	if (sc->swappiness)
+		return *sc->swappiness;
+
 	return mem_cgroup_swappiness(memcg);
 }
 
@@ -6433,7 +6440,8 @@  unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 					   unsigned long nr_pages,
 					   gfp_t gfp_mask,
-					   unsigned int reclaim_options)
+					   unsigned int reclaim_options,
+					   int *swappiness)
 {
 	unsigned long nr_reclaimed;
 	unsigned int noreclaim_flag;
@@ -6448,6 +6456,7 @@  unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.may_unmap = 1,
 		.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
 		.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
+		.swappiness = swappiness,
 	};
 	/*
 	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put