diff mbox series

[RFC,08/13] mm/migrate: demote pages during reclaim

Message ID 20210126003427.73DFDD34@viggo.jf.intel.com (mailing list archive)
State New
Headers show
Series Migrate Pages in lieu of discard | expand

Commit Message

Dave Hansen Jan. 26, 2021, 12:34 a.m. UTC
From: Dave Hansen <dave.hansen@linux.intel.com>

This is mostly derived from a patch from Yang Shi:

	https://lore.kernel.org/linux-mm/1560468577-101178-10-git-send-email-yang.shi@linux.alibaba.com/

Add code to the reclaim path (shrink_page_list()) to "demote" data
to another NUMA node instead of discarding the data.  This always
avoids the cost of I/O needed to read the page back in and sometimes
avoids the writeout cost when the pagee is dirty.

A second pass through shrink_page_list() will be made if any demotions
fail.  This essentally falls back to normal reclaim behavior in the
case that demotions fail.  Previous versions of this patch may have
simply failed to reclaim pages which were eligible for demotion but
were unable to be demoted in practice.

Note: This just adds the start of infratructure for migration. It is
actually disabled next to the FIXME in migrate_demote_page_ok().

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: osalvador <osalvador@suse.de>

--

changes from 202010:
 * add MR_NUMA_MISPLACED to trace MIGRATE_REASON define
 * make migrate_demote_page_ok() static, remove 'sc' arg until
   later patch
 * remove unnecessary alloc_demote_page() hugetlb warning
 * Simplify alloc_demote_page() gfp mask.  Depend on
   __GFP_NORETRY to make it lightweight instead of fancier
   stuff like leaving out __GFP_IO/FS.
 * Allocate migration page with alloc_migration_target()
   instead of allocating directly.
changes from 20200730:
 * Add another pass through shrink_page_list() when demotion
   fails.
---

 b/include/linux/migrate.h        |    9 ++++
 b/include/trace/events/migrate.h |    3 -
 b/mm/vmscan.c                    |   81 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 92 insertions(+), 1 deletion(-)

Comments

Oscar Salvador Feb. 2, 2021, 11:55 a.m. UTC | #1
On Mon, Jan 25, 2021 at 04:34:27PM -0800, Dave Hansen wrote:
> 
> From: Dave Hansen <dave.hansen@linux.intel.com>
> 
> This is mostly derived from a patch from Yang Shi:
> 
> 	https://lore.kernel.org/linux-mm/1560468577-101178-10-git-send-email-yang.shi@linux.alibaba.com/
> 
> Add code to the reclaim path (shrink_page_list()) to "demote" data
> to another NUMA node instead of discarding the data.  This always
> avoids the cost of I/O needed to read the page back in and sometimes
> avoids the writeout cost when the pagee is dirty.
> 
> A second pass through shrink_page_list() will be made if any demotions
> fail.  This essentally falls back to normal reclaim behavior in the
> case that demotions fail.  Previous versions of this patch may have
> simply failed to reclaim pages which were eligible for demotion but
> were unable to be demoted in practice.
> 
> Note: This just adds the start of infratructure for migration. It is
> actually disabled next to the FIXME in migrate_demote_page_ok().
> 
> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: Yang Shi <yang.shi@linux.alibaba.com>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Huang Ying <ying.huang@intel.com>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: osalvador <osalvador@suse.de>
> 
> --
> 
> changes from 202010:
>  * add MR_NUMA_MISPLACED to trace MIGRATE_REASON define
>  * make migrate_demote_page_ok() static, remove 'sc' arg until
>    later patch
>  * remove unnecessary alloc_demote_page() hugetlb warning
>  * Simplify alloc_demote_page() gfp mask.  Depend on
>    __GFP_NORETRY to make it lightweight instead of fancier
>    stuff like leaving out __GFP_IO/FS.
>  * Allocate migration page with alloc_migration_target()
>    instead of allocating directly.
> changes from 20200730:
>  * Add another pass through shrink_page_list() when demotion
>    fails.
> ---

[...]
  
> +static struct page *alloc_demote_page(struct page *page, unsigned long node)
> +{
> +        struct migration_target_control mtc = {
> +		/*
> +		 * Fail quickly and quietly.  Page will likely
> +		 * just be discarded instead of migrated.
> +		 */
> +		.gfp_mask = GFP_HIGHUSER | __GFP_NORETRY | __GFP_NOWARN,
> +		.nid = node
> +	};
> +
> +        return alloc_migration_target(page, (unsigned long)&mtc);
> +}

Migration for THP pages will set direct reclaim. I guess that is fine right?
AFAIK, direct reclaim will only be tried once with GFP_NORETRY.

> +
> +/*
> + * Take pages on @demote_list and attempt to demote them to
> + * another node.  Pages which are not demoted are left on
> + * @demote_pages.
> + */
> +static unsigned int demote_page_list(struct list_head *demote_pages,
> +				     struct pglist_data *pgdat,
> +				     struct scan_control *sc)
> +{
> +	int target_nid = next_demotion_node(pgdat->node_id);
> +	unsigned int nr_succeeded = 0;
> +	int err;
> +
> +	if (list_empty(demote_pages))
> +		return 0;
> +
> +	/* Demotion ignores all cpuset and mempolicy settings */
> +	err = migrate_pages(demote_pages, alloc_demote_page, NULL,
> +			    target_nid, MIGRATE_ASYNC, MR_DEMOTION,
> +			    &nr_succeeded);
> +
> +	return nr_succeeded;
> +}
> +
>  /*
>   * shrink_page_list() returns the number of reclaimed pages
>   */
> @@ -1078,12 +1135,15 @@ static unsigned int shrink_page_list(str
>  {
>  	LIST_HEAD(ret_pages);
>  	LIST_HEAD(free_pages);
> +	LIST_HEAD(demote_pages);
>  	unsigned int nr_reclaimed = 0;
>  	unsigned int pgactivate = 0;
> +	bool do_demote_pass = true;
>  
>  	memset(stat, 0, sizeof(*stat));
>  	cond_resched();
>  
> +retry:
>  	while (!list_empty(page_list)) {
>  		struct address_space *mapping;
>  		struct page *page;
> @@ -1233,6 +1293,16 @@ static unsigned int shrink_page_list(str
>  		}
>  
>  		/*
> +		 * Before reclaiming the page, try to relocate
> +		 * its contents to another node.
> +		 */
> +		if (do_demote_pass && migrate_demote_page_ok(page)) {
> +			list_add(&page->lru, &demote_pages);
> +			unlock_page(page);
> +			continue;
> +		}

Should we keep it simple for now and only try to demote those pages that are
free of cpusets and memory policies?
Actually, demoting those pages to a CPU or a NUMA node that does not fall into
their set, would violate those constraints right?
So I think we should leave those pages alone for now.

> +
> +		/*
>  		 * Anonymous process memory has backing store?
>  		 * Try to allocate it some swap space here.
>  		 * Lazyfree page could be freed directly
> @@ -1479,6 +1549,17 @@ keep:
>  		list_add(&page->lru, &ret_pages);
>  		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
>  	}
> +	/* 'page_list' is always empty here */
> +
> +	/* Migrate pages selected for demotion */
> +	nr_reclaimed += demote_page_list(&demote_pages, pgdat, sc);
> +	/* Pages that could not be demoted are still in @demote_pages */
> +	if (!list_empty(&demote_pages)) {
> +		/* Pages which failed to demoted go back on on @page_list for retry: */
> +		list_splice_init(&demote_pages, page_list);
> +		do_demote_pass = false;
> +		goto retry;
> +	}
>  
>  	pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
>  
> _
>
Yang Shi Feb. 2, 2021, 6:22 p.m. UTC | #2
On Mon, Jan 25, 2021 at 4:41 PM Dave Hansen <dave.hansen@linux.intel.com> wrote:
>
>
> From: Dave Hansen <dave.hansen@linux.intel.com>
>
> This is mostly derived from a patch from Yang Shi:
>
>         https://lore.kernel.org/linux-mm/1560468577-101178-10-git-send-email-yang.shi@linux.alibaba.com/
>
> Add code to the reclaim path (shrink_page_list()) to "demote" data
> to another NUMA node instead of discarding the data.  This always
> avoids the cost of I/O needed to read the page back in and sometimes
> avoids the writeout cost when the pagee is dirty.
>
> A second pass through shrink_page_list() will be made if any demotions
> fail.  This essentally falls back to normal reclaim behavior in the
> case that demotions fail.  Previous versions of this patch may have
> simply failed to reclaim pages which were eligible for demotion but
> were unable to be demoted in practice.
>
> Note: This just adds the start of infratructure for migration. It is
> actually disabled next to the FIXME in migrate_demote_page_ok().
>
> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: Yang Shi <yang.shi@linux.alibaba.com>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Huang Ying <ying.huang@intel.com>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: osalvador <osalvador@suse.de>
>
> --
>
> changes from 202010:
>  * add MR_NUMA_MISPLACED to trace MIGRATE_REASON define
>  * make migrate_demote_page_ok() static, remove 'sc' arg until
>    later patch
>  * remove unnecessary alloc_demote_page() hugetlb warning
>  * Simplify alloc_demote_page() gfp mask.  Depend on
>    __GFP_NORETRY to make it lightweight instead of fancier
>    stuff like leaving out __GFP_IO/FS.
>  * Allocate migration page with alloc_migration_target()
>    instead of allocating directly.
> changes from 20200730:
>  * Add another pass through shrink_page_list() when demotion
>    fails.
> ---
>
>  b/include/linux/migrate.h        |    9 ++++
>  b/include/trace/events/migrate.h |    3 -
>  b/mm/vmscan.c                    |   81 +++++++++++++++++++++++++++++++++++++++
>  3 files changed, 92 insertions(+), 1 deletion(-)
>
> diff -puN include/linux/migrate.h~demote-with-migrate_pages include/linux/migrate.h
> --- a/include/linux/migrate.h~demote-with-migrate_pages 2021-01-25 16:23:14.591866696 -0800
> +++ b/include/linux/migrate.h   2021-01-25 16:23:14.599866696 -0800
> @@ -27,6 +27,7 @@ enum migrate_reason {
>         MR_MEMPOLICY_MBIND,
>         MR_NUMA_MISPLACED,
>         MR_CONTIG_RANGE,
> +       MR_DEMOTION,
>         MR_TYPES
>  };
>
> @@ -196,6 +197,14 @@ struct migrate_vma {
>  int migrate_vma_setup(struct migrate_vma *args);
>  void migrate_vma_pages(struct migrate_vma *migrate);
>  void migrate_vma_finalize(struct migrate_vma *migrate);
> +int next_demotion_node(int node);
> +
> +#else /* CONFIG_MIGRATION disabled: */
> +
> +static inline int next_demotion_node(int node)
> +{
> +       return NUMA_NO_NODE;
> +}
>
>  #endif /* CONFIG_MIGRATION */
>
> diff -puN include/trace/events/migrate.h~demote-with-migrate_pages include/trace/events/migrate.h
> --- a/include/trace/events/migrate.h~demote-with-migrate_pages  2021-01-25 16:23:14.593866696 -0800
> +++ b/include/trace/events/migrate.h    2021-01-25 16:23:14.599866696 -0800
> @@ -20,7 +20,8 @@
>         EM( MR_SYSCALL,         "syscall_or_cpuset")            \
>         EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind")              \
>         EM( MR_NUMA_MISPLACED,  "numa_misplaced")               \
> -       EMe(MR_CONTIG_RANGE,    "contig_range")
> +       EM( MR_CONTIG_RANGE,    "contig_range")                 \
> +       EMe(MR_DEMOTION,        "demotion")
>
>  /*
>   * First define the enums in the above macros to be exported to userspace
> diff -puN mm/vmscan.c~demote-with-migrate_pages mm/vmscan.c
> --- a/mm/vmscan.c~demote-with-migrate_pages     2021-01-25 16:23:14.595866696 -0800
> +++ b/mm/vmscan.c       2021-01-25 16:23:14.601866696 -0800
> @@ -43,6 +43,7 @@
>  #include <linux/kthread.h>
>  #include <linux/freezer.h>
>  #include <linux/memcontrol.h>
> +#include <linux/migrate.h>
>  #include <linux/delayacct.h>
>  #include <linux/sysctl.h>
>  #include <linux/oom.h>
> @@ -1036,6 +1037,24 @@ static enum page_references page_check_r
>         return PAGEREF_RECLAIM;
>  }
>
> +static bool migrate_demote_page_ok(struct page *page)
> +{
> +       int next_nid = next_demotion_node(page_to_nid(page));
> +
> +       VM_BUG_ON_PAGE(!PageLocked(page), page);
> +       VM_BUG_ON_PAGE(PageHuge(page), page);
> +       VM_BUG_ON_PAGE(PageLRU(page), page);
> +
> +       if (next_nid == NUMA_NO_NODE)
> +               return false;
> +       if (PageTransHuge(page) && !thp_migration_supported())
> +               return false;
> +
> +       // FIXME: actually enable this later in the series
> +       return false;
> +}
> +
> +
>  /* Check if a page is dirty or under writeback */
>  static void page_check_dirty_writeback(struct page *page,
>                                        bool *dirty, bool *writeback)
> @@ -1066,6 +1085,44 @@ static void page_check_dirty_writeback(s
>                 mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
>  }
>
> +static struct page *alloc_demote_page(struct page *page, unsigned long node)
> +{
> +        struct migration_target_control mtc = {
> +               /*
> +                * Fail quickly and quietly.  Page will likely
> +                * just be discarded instead of migrated.
> +                */
> +               .gfp_mask = GFP_HIGHUSER | __GFP_NORETRY | __GFP_NOWARN,
> +               .nid = node
> +       };
> +
> +        return alloc_migration_target(page, (unsigned long)&mtc);

Other than the gfp flag question raised by Oscar, I'm wondering how we
guarantee the demotion allocation happens on the designated node. In
the previous version __GFP_THISNODE is set to guarantee this. In this
version you switched to use alloc_migration_target() API but without
having nodemask or __GFP_THISNODE. If nodemask is NULL the allocation
may fall back to an unexpected node.

And GFP_HIGHUSER does respect cpuset, so if the demotion target node
is excluded by the cpuset which the task belongs to, the migration
would fail. This might be a way to respect cpuset, but it should just
work for direct reclaimer. So, is this change really expected?

> +}
> +
> +/*
> + * Take pages on @demote_list and attempt to demote them to
> + * another node.  Pages which are not demoted are left on
> + * @demote_pages.
> + */
> +static unsigned int demote_page_list(struct list_head *demote_pages,
> +                                    struct pglist_data *pgdat,
> +                                    struct scan_control *sc)
> +{
> +       int target_nid = next_demotion_node(pgdat->node_id);
> +       unsigned int nr_succeeded = 0;
> +       int err;
> +
> +       if (list_empty(demote_pages))
> +               return 0;
> +
> +       /* Demotion ignores all cpuset and mempolicy settings */
> +       err = migrate_pages(demote_pages, alloc_demote_page, NULL,
> +                           target_nid, MIGRATE_ASYNC, MR_DEMOTION,
> +                           &nr_succeeded);
> +
> +       return nr_succeeded;
> +}
> +
>  /*
>   * shrink_page_list() returns the number of reclaimed pages
>   */
> @@ -1078,12 +1135,15 @@ static unsigned int shrink_page_list(str
>  {
>         LIST_HEAD(ret_pages);
>         LIST_HEAD(free_pages);
> +       LIST_HEAD(demote_pages);
>         unsigned int nr_reclaimed = 0;
>         unsigned int pgactivate = 0;
> +       bool do_demote_pass = true;
>
>         memset(stat, 0, sizeof(*stat));
>         cond_resched();
>
> +retry:
>         while (!list_empty(page_list)) {
>                 struct address_space *mapping;
>                 struct page *page;
> @@ -1233,6 +1293,16 @@ static unsigned int shrink_page_list(str
>                 }
>
>                 /*
> +                * Before reclaiming the page, try to relocate
> +                * its contents to another node.
> +                */
> +               if (do_demote_pass && migrate_demote_page_ok(page)) {
> +                       list_add(&page->lru, &demote_pages);
> +                       unlock_page(page);
> +                       continue;
> +               }
> +
> +               /*
>                  * Anonymous process memory has backing store?
>                  * Try to allocate it some swap space here.
>                  * Lazyfree page could be freed directly
> @@ -1479,6 +1549,17 @@ keep:
>                 list_add(&page->lru, &ret_pages);
>                 VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
>         }
> +       /* 'page_list' is always empty here */
> +
> +       /* Migrate pages selected for demotion */
> +       nr_reclaimed += demote_page_list(&demote_pages, pgdat, sc);
> +       /* Pages that could not be demoted are still in @demote_pages */
> +       if (!list_empty(&demote_pages)) {
> +               /* Pages which failed to demoted go back on on @page_list for retry: */
> +               list_splice_init(&demote_pages, page_list);
> +               do_demote_pass = false;
> +               goto retry;
> +       }
>
>         pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
>
> _
>
Dave Hansen Feb. 2, 2021, 6:34 p.m. UTC | #3
On 2/2/21 10:22 AM, Yang Shi wrote:
>> +static struct page *alloc_demote_page(struct page *page, unsigned long node)
>> +{
>> +        struct migration_target_control mtc = {
>> +               /*
>> +                * Fail quickly and quietly.  Page will likely
>> +                * just be discarded instead of migrated.
>> +                */
>> +               .gfp_mask = GFP_HIGHUSER | __GFP_NORETRY | __GFP_NOWARN,
>> +               .nid = node
>> +       };
>> +
>> +        return alloc_migration_target(page, (unsigned long)&mtc);
> Other than the gfp flag question raised by Oscar, I'm wondering how we
> guarantee the demotion allocation happens on the designated node. In
> the previous version __GFP_THISNODE is set to guarantee this. In this
> version you switched to use alloc_migration_target() API but without
> having nodemask or __GFP_THISNODE. If nodemask is NULL the allocation
> may fall back to an unexpected node.
> 
> And GFP_HIGHUSER does respect cpuset, so if the demotion target node
> is excluded by the cpuset which the task belongs to, the migration
> would fail. This might be a way to respect cpuset, but it should just
> work for direct reclaimer. So, is this change really expected?

No, that wasn't intended.  I'll restore __GFP_THISNODE.  Thanks for
noting this.
Yang Shi Feb. 2, 2021, 10:45 p.m. UTC | #4
On Tue, Feb 2, 2021 at 3:55 AM Oscar Salvador <osalvador@suse.de> wrote:
>
> On Mon, Jan 25, 2021 at 04:34:27PM -0800, Dave Hansen wrote:
> >
> > From: Dave Hansen <dave.hansen@linux.intel.com>
> >
> > This is mostly derived from a patch from Yang Shi:
> >
> >       https://lore.kernel.org/linux-mm/1560468577-101178-10-git-send-email-yang.shi@linux.alibaba.com/
> >
> > Add code to the reclaim path (shrink_page_list()) to "demote" data
> > to another NUMA node instead of discarding the data.  This always
> > avoids the cost of I/O needed to read the page back in and sometimes
> > avoids the writeout cost when the pagee is dirty.
> >
> > A second pass through shrink_page_list() will be made if any demotions
> > fail.  This essentally falls back to normal reclaim behavior in the
> > case that demotions fail.  Previous versions of this patch may have
> > simply failed to reclaim pages which were eligible for demotion but
> > were unable to be demoted in practice.
> >
> > Note: This just adds the start of infratructure for migration. It is
> > actually disabled next to the FIXME in migrate_demote_page_ok().
> >
> > Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
> > Cc: Yang Shi <yang.shi@linux.alibaba.com>
> > Cc: David Rientjes <rientjes@google.com>
> > Cc: Huang Ying <ying.huang@intel.com>
> > Cc: Dan Williams <dan.j.williams@intel.com>
> > Cc: osalvador <osalvador@suse.de>
> >
> > --
> >
> > changes from 202010:
> >  * add MR_NUMA_MISPLACED to trace MIGRATE_REASON define
> >  * make migrate_demote_page_ok() static, remove 'sc' arg until
> >    later patch
> >  * remove unnecessary alloc_demote_page() hugetlb warning
> >  * Simplify alloc_demote_page() gfp mask.  Depend on
> >    __GFP_NORETRY to make it lightweight instead of fancier
> >    stuff like leaving out __GFP_IO/FS.
> >  * Allocate migration page with alloc_migration_target()
> >    instead of allocating directly.
> > changes from 20200730:
> >  * Add another pass through shrink_page_list() when demotion
> >    fails.
> > ---
>
> [...]
>
> > +static struct page *alloc_demote_page(struct page *page, unsigned long node)
> > +{
> > +        struct migration_target_control mtc = {
> > +             /*
> > +              * Fail quickly and quietly.  Page will likely
> > +              * just be discarded instead of migrated.
> > +              */
> > +             .gfp_mask = GFP_HIGHUSER | __GFP_NORETRY | __GFP_NOWARN,
> > +             .nid = node
> > +     };
> > +
> > +        return alloc_migration_target(page, (unsigned long)&mtc);
> > +}
>
> Migration for THP pages will set direct reclaim. I guess that is fine right?
> AFAIK, direct reclaim will only be tried once with GFP_NORETRY.
>
> > +
> > +/*
> > + * Take pages on @demote_list and attempt to demote them to
> > + * another node.  Pages which are not demoted are left on
> > + * @demote_pages.
> > + */
> > +static unsigned int demote_page_list(struct list_head *demote_pages,
> > +                                  struct pglist_data *pgdat,
> > +                                  struct scan_control *sc)
> > +{
> > +     int target_nid = next_demotion_node(pgdat->node_id);
> > +     unsigned int nr_succeeded = 0;
> > +     int err;
> > +
> > +     if (list_empty(demote_pages))
> > +             return 0;
> > +
> > +     /* Demotion ignores all cpuset and mempolicy settings */
> > +     err = migrate_pages(demote_pages, alloc_demote_page, NULL,
> > +                         target_nid, MIGRATE_ASYNC, MR_DEMOTION,
> > +                         &nr_succeeded);
> > +
> > +     return nr_succeeded;
> > +}
> > +
> >  /*
> >   * shrink_page_list() returns the number of reclaimed pages
> >   */
> > @@ -1078,12 +1135,15 @@ static unsigned int shrink_page_list(str
> >  {
> >       LIST_HEAD(ret_pages);
> >       LIST_HEAD(free_pages);
> > +     LIST_HEAD(demote_pages);
> >       unsigned int nr_reclaimed = 0;
> >       unsigned int pgactivate = 0;
> > +     bool do_demote_pass = true;
> >
> >       memset(stat, 0, sizeof(*stat));
> >       cond_resched();
> >
> > +retry:
> >       while (!list_empty(page_list)) {
> >               struct address_space *mapping;
> >               struct page *page;
> > @@ -1233,6 +1293,16 @@ static unsigned int shrink_page_list(str
> >               }
> >
> >               /*
> > +              * Before reclaiming the page, try to relocate
> > +              * its contents to another node.
> > +              */
> > +             if (do_demote_pass && migrate_demote_page_ok(page)) {
> > +                     list_add(&page->lru, &demote_pages);
> > +                     unlock_page(page);
> > +                     continue;
> > +             }
>
> Should we keep it simple for now and only try to demote those pages that are
> free of cpusets and memory policies?
> Actually, demoting those pages to a CPU or a NUMA node that does not fall into
> their set, would violate those constraints right?

Yes, this has been discussed since the very beginning. There is not an
easy way to figure out the memory placement policy (cpuset and
mempolicy) from "page". I think this also prevents "demote those pages
that are free of cpusets and memory policies".

The conclusion was the violation should be fine for now. And the
demotion feature is opt'ed in by a new node reclaim mode.

> So I think we should leave those pages alone for now.
>
> > +
> > +             /*
> >                * Anonymous process memory has backing store?
> >                * Try to allocate it some swap space here.
> >                * Lazyfree page could be freed directly
> > @@ -1479,6 +1549,17 @@ keep:
> >               list_add(&page->lru, &ret_pages);
> >               VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
> >       }
> > +     /* 'page_list' is always empty here */
> > +
> > +     /* Migrate pages selected for demotion */
> > +     nr_reclaimed += demote_page_list(&demote_pages, pgdat, sc);
> > +     /* Pages that could not be demoted are still in @demote_pages */
> > +     if (!list_empty(&demote_pages)) {
> > +             /* Pages which failed to demoted go back on on @page_list for retry: */
> > +             list_splice_init(&demote_pages, page_list);
> > +             do_demote_pass = false;
> > +             goto retry;
> > +     }
> >
> >       pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
> >
> > _
> >
>
> --
> Oscar Salvador
> SUSE L3
>
Dave Hansen Feb. 2, 2021, 10:56 p.m. UTC | #5
On 2/2/21 2:45 PM, Yang Shi wrote:
>> Should we keep it simple for now and only try to demote those pages that are
>> free of cpusets and memory policies?
>> Actually, demoting those pages to a CPU or a NUMA node that does not fall into
>> their set, would violate those constraints right?
> Yes, this has been discussed since the very beginning. There is not an
> easy way to figure out the memory placement policy (cpuset and
> mempolicy) from "page". I think this also prevents "demote those pages
> that are free of cpusets and memory policies".
> 
> The conclusion was the violation should be fine for now. And the
> demotion feature is opt'ed in by a new node reclaim mode.

This has come up a couple of times.

I'll add a bit of changelog material about it in the last patch since
that's where the opt-in is introduced.
diff mbox series

Patch

diff -puN include/linux/migrate.h~demote-with-migrate_pages include/linux/migrate.h
--- a/include/linux/migrate.h~demote-with-migrate_pages	2021-01-25 16:23:14.591866696 -0800
+++ b/include/linux/migrate.h	2021-01-25 16:23:14.599866696 -0800
@@ -27,6 +27,7 @@  enum migrate_reason {
 	MR_MEMPOLICY_MBIND,
 	MR_NUMA_MISPLACED,
 	MR_CONTIG_RANGE,
+	MR_DEMOTION,
 	MR_TYPES
 };
 
@@ -196,6 +197,14 @@  struct migrate_vma {
 int migrate_vma_setup(struct migrate_vma *args);
 void migrate_vma_pages(struct migrate_vma *migrate);
 void migrate_vma_finalize(struct migrate_vma *migrate);
+int next_demotion_node(int node);
+
+#else /* CONFIG_MIGRATION disabled: */
+
+static inline int next_demotion_node(int node)
+{
+	return NUMA_NO_NODE;
+}
 
 #endif /* CONFIG_MIGRATION */
 
diff -puN include/trace/events/migrate.h~demote-with-migrate_pages include/trace/events/migrate.h
--- a/include/trace/events/migrate.h~demote-with-migrate_pages	2021-01-25 16:23:14.593866696 -0800
+++ b/include/trace/events/migrate.h	2021-01-25 16:23:14.599866696 -0800
@@ -20,7 +20,8 @@ 
 	EM( MR_SYSCALL,		"syscall_or_cpuset")		\
 	EM( MR_MEMPOLICY_MBIND,	"mempolicy_mbind")		\
 	EM( MR_NUMA_MISPLACED,	"numa_misplaced")		\
-	EMe(MR_CONTIG_RANGE,	"contig_range")
+	EM( MR_CONTIG_RANGE,	"contig_range")			\
+	EMe(MR_DEMOTION,	"demotion")
 
 /*
  * First define the enums in the above macros to be exported to userspace
diff -puN mm/vmscan.c~demote-with-migrate_pages mm/vmscan.c
--- a/mm/vmscan.c~demote-with-migrate_pages	2021-01-25 16:23:14.595866696 -0800
+++ b/mm/vmscan.c	2021-01-25 16:23:14.601866696 -0800
@@ -43,6 +43,7 @@ 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/memcontrol.h>
+#include <linux/migrate.h>
 #include <linux/delayacct.h>
 #include <linux/sysctl.h>
 #include <linux/oom.h>
@@ -1036,6 +1037,24 @@  static enum page_references page_check_r
 	return PAGEREF_RECLAIM;
 }
 
+static bool migrate_demote_page_ok(struct page *page)
+{
+	int next_nid = next_demotion_node(page_to_nid(page));
+
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+	VM_BUG_ON_PAGE(PageHuge(page), page);
+	VM_BUG_ON_PAGE(PageLRU(page), page);
+
+	if (next_nid == NUMA_NO_NODE)
+		return false;
+	if (PageTransHuge(page) && !thp_migration_supported())
+		return false;
+
+	// FIXME: actually enable this later in the series
+	return false;
+}
+
+
 /* Check if a page is dirty or under writeback */
 static void page_check_dirty_writeback(struct page *page,
 				       bool *dirty, bool *writeback)
@@ -1066,6 +1085,44 @@  static void page_check_dirty_writeback(s
 		mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
 }
 
+static struct page *alloc_demote_page(struct page *page, unsigned long node)
+{
+        struct migration_target_control mtc = {
+		/*
+		 * Fail quickly and quietly.  Page will likely
+		 * just be discarded instead of migrated.
+		 */
+		.gfp_mask = GFP_HIGHUSER | __GFP_NORETRY | __GFP_NOWARN,
+		.nid = node
+	};
+
+        return alloc_migration_target(page, (unsigned long)&mtc);
+}
+
+/*
+ * Take pages on @demote_list and attempt to demote them to
+ * another node.  Pages which are not demoted are left on
+ * @demote_pages.
+ */
+static unsigned int demote_page_list(struct list_head *demote_pages,
+				     struct pglist_data *pgdat,
+				     struct scan_control *sc)
+{
+	int target_nid = next_demotion_node(pgdat->node_id);
+	unsigned int nr_succeeded = 0;
+	int err;
+
+	if (list_empty(demote_pages))
+		return 0;
+
+	/* Demotion ignores all cpuset and mempolicy settings */
+	err = migrate_pages(demote_pages, alloc_demote_page, NULL,
+			    target_nid, MIGRATE_ASYNC, MR_DEMOTION,
+			    &nr_succeeded);
+
+	return nr_succeeded;
+}
+
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -1078,12 +1135,15 @@  static unsigned int shrink_page_list(str
 {
 	LIST_HEAD(ret_pages);
 	LIST_HEAD(free_pages);
+	LIST_HEAD(demote_pages);
 	unsigned int nr_reclaimed = 0;
 	unsigned int pgactivate = 0;
+	bool do_demote_pass = true;
 
 	memset(stat, 0, sizeof(*stat));
 	cond_resched();
 
+retry:
 	while (!list_empty(page_list)) {
 		struct address_space *mapping;
 		struct page *page;
@@ -1233,6 +1293,16 @@  static unsigned int shrink_page_list(str
 		}
 
 		/*
+		 * Before reclaiming the page, try to relocate
+		 * its contents to another node.
+		 */
+		if (do_demote_pass && migrate_demote_page_ok(page)) {
+			list_add(&page->lru, &demote_pages);
+			unlock_page(page);
+			continue;
+		}
+
+		/*
 		 * Anonymous process memory has backing store?
 		 * Try to allocate it some swap space here.
 		 * Lazyfree page could be freed directly
@@ -1479,6 +1549,17 @@  keep:
 		list_add(&page->lru, &ret_pages);
 		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
 	}
+	/* 'page_list' is always empty here */
+
+	/* Migrate pages selected for demotion */
+	nr_reclaimed += demote_page_list(&demote_pages, pgdat, sc);
+	/* Pages that could not be demoted are still in @demote_pages */
+	if (!list_empty(&demote_pages)) {
+		/* Pages which failed to demoted go back on on @page_list for retry: */
+		list_splice_init(&demote_pages, page_list);
+		do_demote_pass = false;
+		goto retry;
+	}
 
 	pgactivate = stat->nr_activate[0] + stat->nr_activate[1];