
[v2,01/11] mm/vmstat: remove remote node draining

Message ID 20230209153204.656996515@redhat.com (mailing list archive)
State New
Series fold per-CPU vmstats remotely

Commit Message

Marcelo Tosatti Feb. 9, 2023, 3:01 p.m. UTC
Draining of pages from the local pcp for a remote zone was necessary
since:

"Note that remote node draining is a somewhat esoteric feature that is
required on large NUMA systems because otherwise significant portions
of system memory can become trapped in pcp queues. The number of pcp is
determined by the number of processors and nodes in a system. A system
with 4 processors and 2 nodes has 8 pcps which is okay. But a system
with 1024 processors and 512 nodes has 512k pcps with a high potential
for large amount of memory being caught in them."

Since commit 443c2accd1b6679a1320167f8f56eed6536b806e
("mm/page_alloc: remotely drain per-cpu lists"), drain_all_pages() is able 
to remotely free those pages when necessary.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Comments

David Hildenbrand Feb. 28, 2023, 3:53 p.m. UTC | #1
On 09.02.23 16:01, Marcelo Tosatti wrote:
> Draining of pages from the local pcp for a remote zone was necessary
> since:
> 
> "Note that remote node draining is a somewhat esoteric feature that is
> required on large NUMA systems because otherwise significant portions
> of system memory can become trapped in pcp queues. The number of pcp is
> determined by the number of processors and nodes in a system. A system
> with 4 processors and 2 nodes has 8 pcps which is okay. But a system
> with 1024 processors and 512 nodes has 512k pcps with a high potential
> for large amount of memory being caught in them."
> 
> Since commit 443c2accd1b6679a1320167f8f56eed6536b806e
> ("mm/page_alloc: remotely drain per-cpu lists"), drain_all_pages() is able
> to remotely free those pages when necessary.
> 

I'm a bit new to that piece of code, so sorry for the dummy questions. 
I'm staring at linux master,

(1) I think you're removing the single user of drain_zone_pages(). So we
     should remove drain_zone_pages() as well.

(2) drain_zone_pages() documents that we're draining the PCP
     (bulk-freeing them) of the current CPU on remote nodes. That bulk-
     freeing will properly adjust free memory counters. What exactly is
     the impact when no longer doing that? Won't the "snapshot" of some
     counters eventually be wrong? Do we care?

Describing the difference between instructed refresh of vmstat and 
"remotely drain per-cpu lists" in order to move free memory from the pcp 
to the buddy would be great.

Because removing this code here looks nice, but I am not 100% sure about 
the implications. CCing Mel as well.


> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> Index: linux-vmstat-remote/include/linux/mmzone.h
> ===================================================================
> --- linux-vmstat-remote.orig/include/linux/mmzone.h
> +++ linux-vmstat-remote/include/linux/mmzone.h
> @@ -577,9 +577,6 @@ struct per_cpu_pages {
>   	int high;		/* high watermark, emptying needed */
>   	int batch;		/* chunk size for buddy add/remove */
>   	short free_factor;	/* batch scaling factor during free */
> -#ifdef CONFIG_NUMA
> -	short expire;		/* When 0, remote pagesets are drained */
> -#endif
>   
>   	/* Lists of pages, one per migrate type stored on the pcp-lists */
>   	struct list_head lists[NR_PCP_LISTS];
> Index: linux-vmstat-remote/mm/vmstat.c
> ===================================================================
> --- linux-vmstat-remote.orig/mm/vmstat.c
> +++ linux-vmstat-remote/mm/vmstat.c
> @@ -803,7 +803,7 @@ static int fold_diff(int *zone_diff, int
>    *
>    * The function returns the number of global counters updated.
>    */
> -static int refresh_cpu_vm_stats(bool do_pagesets)
> +static int refresh_cpu_vm_stats(void)
>   {
>   	struct pglist_data *pgdat;
>   	struct zone *zone;
> @@ -814,9 +814,6 @@ static int refresh_cpu_vm_stats(bool do_
>   
>   	for_each_populated_zone(zone) {
>   		struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
> -#ifdef CONFIG_NUMA
> -		struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
> -#endif
>   
>   		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
>   			int v;
> @@ -826,44 +823,8 @@ static int refresh_cpu_vm_stats(bool do_
>   
>   				atomic_long_add(v, &zone->vm_stat[i]);
>   				global_zone_diff[i] += v;
> -#ifdef CONFIG_NUMA
> -				/* 3 seconds idle till flush */
> -				__this_cpu_write(pcp->expire, 3);
> -#endif
>   			}
>   		}
> -#ifdef CONFIG_NUMA
> -
> -		if (do_pagesets) {
> -			cond_resched();
> -			/*
> -			 * Deal with draining the remote pageset of this
> -			 * processor
> -			 *
> -			 * Check if there are pages remaining in this pageset
> -			 * if not then there is nothing to expire.
> -			 */
> -			if (!__this_cpu_read(pcp->expire) ||
> -			       !__this_cpu_read(pcp->count))
> -				continue;
> -
> -			/*
> -			 * We never drain zones local to this processor.
> -			 */
> -			if (zone_to_nid(zone) == numa_node_id()) {
> -				__this_cpu_write(pcp->expire, 0);
> -				continue;
> -			}
> -
> -			if (__this_cpu_dec_return(pcp->expire))
> -				continue;
> -
> -			if (__this_cpu_read(pcp->count)) {
> -				drain_zone_pages(zone, this_cpu_ptr(pcp));
> -				changes++;
> -			}
> -		}
> -#endif
>   	}

I think you can then also get rid of the "changes" local variable and do 
a "return fold_diff(global_zone_diff, global_node_diff);" directly.
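
For illustration, the tail of refresh_cpu_vm_stats() could then reduce to
something like this (a sketch of the suggestion, not the actual posted
code):

	for_each_online_pgdat(pgdat) {
		/* ... fold per-node deltas as before ... */
	}

	/* With the NUMA drain gone, "changes" would only accumulate
	 * the fold_diff() result, so the local variable can be dropped
	 * and the result returned directly: */
	return fold_diff(global_zone_diff, global_node_diff);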
Marcelo Tosatti Feb. 28, 2023, 7:36 p.m. UTC | #2
On Tue, Feb 28, 2023 at 04:53:47PM +0100, David Hildenbrand wrote:
> On 09.02.23 16:01, Marcelo Tosatti wrote:
> > Draining of pages from the local pcp for a remote zone was necessary
> > since:
> > 
> > "Note that remote node draining is a somewhat esoteric feature that is
> > required on large NUMA systems because otherwise significant portions
> > of system memory can become trapped in pcp queues. The number of pcp is
> > determined by the number of processors and nodes in a system. A system
> > with 4 processors and 2 nodes has 8 pcps which is okay. But a system
> > with 1024 processors and 512 nodes has 512k pcps with a high potential
> > for large amount of memory being caught in them."
> > 
> > Since commit 443c2accd1b6679a1320167f8f56eed6536b806e
> > ("mm/page_alloc: remotely drain per-cpu lists"), drain_all_pages() is able
> > to remotely free those pages when necessary.
> > 
> 
> I'm a bit new to that piece of code, so sorry for the dummy questions. I'm
> staring at linux master,
> 
> (1) I think you're removing the single user of drain_zone_pages(). So we
>     should remove drain_zone_pages() as well.

Done.

> (2) drain_zone_pages() documents that we're draining the PCP
>     (bulk-freeing them) of the current CPU on remote nodes. That bulk-
>     freeing will properly adjust free memory counters. What exactly is
>     the impact when no longer doing that? Won't the "snapshot" of some
>     counters eventually be wrong? Do we care?

Don't see why the snapshot of counters will be wrong.

Instead of freeing pages on pcp list of remote nodes after they are
considered idle ("3 seconds idle till flush"), what will happen is that
drain_all_pages() will free those pcps, for example after an allocation
fails on direct reclaim:

        page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);

        /*
         * If an allocation failed after direct reclaim, it could be because
         * pages are pinned on the per-cpu lists or in high alloc reserves.
         * Shrink them and try again
         */
        if (!page && !drained) {
                unreserve_highatomic_pageblock(ac, false);
                drain_all_pages(NULL);
                drained = true;
                goto retry;
        }

In both cases the pages are freed (and counters maintained) here:

static inline void __free_one_page(struct page *page,
                unsigned long pfn,
                struct zone *zone, unsigned int order,
                int migratetype, fpi_t fpi_flags)
{
        struct capture_control *capc = task_capc(zone);
        unsigned long buddy_pfn = 0;
        unsigned long combined_pfn;
        struct page *buddy;
        bool to_tail;

        VM_BUG_ON(!zone_is_initialized(zone));
        VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);

        VM_BUG_ON(migratetype == -1);
        if (likely(!is_migrate_isolate(migratetype)))
                __mod_zone_freepage_state(zone, 1 << order, migratetype);

        VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
        VM_BUG_ON_PAGE(bad_range(zone, page), page);

        while (order < MAX_ORDER - 1) {
                if (compaction_capture(capc, page, order, migratetype)) {
                        __mod_zone_freepage_state(zone, -(1 << order),
                                                                migratetype);
                        return;
                }

> Describing the difference between instructed refresh of vmstat and "remotely
> drain per-cpu lists" in order to move free memory from the pcp to the buddy
> would be great.

The difference is that now remote PCPs will be drained on demand, either via
kcompactd or direct reclaim (through drain_all_pages), when memory is
low.

For example, with the following test:

dd if=/dev/zero of=file bs=1M count=32000 on a tmpfs filesystem:

      kcompactd0-116     [005] ...1 228232.042873: drain_all_pages <-kcompactd_do_work
      kcompactd0-116     [005] ...1 228232.042873: __drain_all_pages <-kcompactd_do_work
              dd-479485  [003] ...1 228232.455130: __drain_all_pages <-__alloc_pages_slowpath.constprop.0
              dd-479485  [011] ...1 228232.721994: __drain_all_pages <-__alloc_pages_slowpath.constprop.0
     gnome-shell-3750    [015] ...1 228232.723729: __drain_all_pages <-__alloc_pages_slowpath.constprop.0

The commit message was indeed incorrect. Updated one:

"mm/vmstat: remove remote node draining

Draining of pages from the local pcp for a remote zone should not be
necessary, since once the system is low on memory (or compaction on a
zone is in effect), drain_all_pages should be called freeing any unused
pcps."

Thanks!

> Because removing this code here looks nice, but I am not 100% sure about the
> implications. CCing Mel as well.
> 
> 
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > 
> > Index: linux-vmstat-remote/include/linux/mmzone.h
> > ===================================================================
> > --- linux-vmstat-remote.orig/include/linux/mmzone.h
> > +++ linux-vmstat-remote/include/linux/mmzone.h
> > @@ -577,9 +577,6 @@ struct per_cpu_pages {
> >   	int high;		/* high watermark, emptying needed */
> >   	int batch;		/* chunk size for buddy add/remove */
> >   	short free_factor;	/* batch scaling factor during free */
> > -#ifdef CONFIG_NUMA
> > -	short expire;		/* When 0, remote pagesets are drained */
> > -#endif
> >   	/* Lists of pages, one per migrate type stored on the pcp-lists */
> >   	struct list_head lists[NR_PCP_LISTS];
> > Index: linux-vmstat-remote/mm/vmstat.c
> > ===================================================================
> > --- linux-vmstat-remote.orig/mm/vmstat.c
> > +++ linux-vmstat-remote/mm/vmstat.c
> > @@ -803,7 +803,7 @@ static int fold_diff(int *zone_diff, int
> >    *
> >    * The function returns the number of global counters updated.
> >    */
> > -static int refresh_cpu_vm_stats(bool do_pagesets)
> > +static int refresh_cpu_vm_stats(void)
> >   {
> >   	struct pglist_data *pgdat;
> >   	struct zone *zone;
> > @@ -814,9 +814,6 @@ static int refresh_cpu_vm_stats(bool do_
> >   	for_each_populated_zone(zone) {
> >   		struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
> > -#ifdef CONFIG_NUMA
> > -		struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
> > -#endif
> >   		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
> >   			int v;
> > @@ -826,44 +823,8 @@ static int refresh_cpu_vm_stats(bool do_
> >   				atomic_long_add(v, &zone->vm_stat[i]);
> >   				global_zone_diff[i] += v;
> > -#ifdef CONFIG_NUMA
> > -				/* 3 seconds idle till flush */
> > -				__this_cpu_write(pcp->expire, 3);
> > -#endif
> >   			}
> >   		}
> > -#ifdef CONFIG_NUMA
> > -
> > -		if (do_pagesets) {
> > -			cond_resched();
> > -			/*
> > -			 * Deal with draining the remote pageset of this
> > -			 * processor
> > -			 *
> > -			 * Check if there are pages remaining in this pageset
> > -			 * if not then there is nothing to expire.
> > -			 */
> > -			if (!__this_cpu_read(pcp->expire) ||
> > -			       !__this_cpu_read(pcp->count))
> > -				continue;
> > -
> > -			/*
> > -			 * We never drain zones local to this processor.
> > -			 */
> > -			if (zone_to_nid(zone) == numa_node_id()) {
> > -				__this_cpu_write(pcp->expire, 0);
> > -				continue;
> > -			}
> > -
> > -			if (__this_cpu_dec_return(pcp->expire))
> > -				continue;
> > -
> > -			if (__this_cpu_read(pcp->count)) {
> > -				drain_zone_pages(zone, this_cpu_ptr(pcp));
> > -				changes++;
> > -			}
> > -		}
> > -#endif
> >   	}
> 
> I think you can then also get rid of the "changes" local variable and do a
> "return fold_diff(global_zone_diff, global_node_diff);" directly.

Done.
David Hildenbrand March 2, 2023, 10:10 a.m. UTC | #3
[...]

> 
>> (2) drain_zone_pages() documents that we're draining the PCP
>>      (bulk-freeing them) of the current CPU on remote nodes. That bulk-
>>      freeing will properly adjust free memory counters. What exactly is
>>      the impact when no longer doing that? Won't the "snapshot" of some
>>      counters eventually be wrong? Do we care?
> 
> Don't see why the snapshot of counters will be wrong.
> 
> Instead of freeing pages on pcp list of remote nodes after they are
> considered idle ("3 seconds idle till flush"), what will happen is that
> drain_all_pages() will free those pcps, for example after an allocation
> fails on direct reclaim:
> 
>          page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
> 
>          /*
>           * If an allocation failed after direct reclaim, it could be because
>           * pages are pinned on the per-cpu lists or in high alloc reserves.
>           * Shrink them and try again
>           */
>          if (!page && !drained) {
>                  unreserve_highatomic_pageblock(ac, false);
>                  drain_all_pages(NULL);
>                  drained = true;
>                  goto retry;
>          }
> 
> In both cases the pages are freed (and counters maintained) here:
> 
> static inline void __free_one_page(struct page *page,
>                  unsigned long pfn,
>                  struct zone *zone, unsigned int order,
>                  int migratetype, fpi_t fpi_flags)
> {
>          struct capture_control *capc = task_capc(zone);
>          unsigned long buddy_pfn = 0;
>          unsigned long combined_pfn;
>          struct page *buddy;
>          bool to_tail;
> 
>          VM_BUG_ON(!zone_is_initialized(zone));
>          VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
> 
>          VM_BUG_ON(migratetype == -1);
>          if (likely(!is_migrate_isolate(migratetype)))
>                  __mod_zone_freepage_state(zone, 1 << order, migratetype);
> 
>          VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
>          VM_BUG_ON_PAGE(bad_range(zone, page), page);
> 
>          while (order < MAX_ORDER - 1) {
>                  if (compaction_capture(capc, page, order, migratetype)) {
>                          __mod_zone_freepage_state(zone, -(1 << order),
>                                                                  migratetype);
>                          return;
>                  }
> 
>> Describing the difference between instructed refresh of vmstat and "remotely
>> drain per-cpu lists" in order to move free memory from the pcp to the buddy
>> would be great.
> 
> The difference is that now remote PCPs will be drained on demand, either via
> kcompactd or direct reclaim (through drain_all_pages), when memory is
> low.
> 
> For example, with the following test:
> 
> dd if=/dev/zero of=file bs=1M count=32000 on a tmpfs filesystem:
> 
>        kcompactd0-116     [005] ...1 228232.042873: drain_all_pages <-kcompactd_do_work
>        kcompactd0-116     [005] ...1 228232.042873: __drain_all_pages <-kcompactd_do_work
>                dd-479485  [003] ...1 228232.455130: __drain_all_pages <-__alloc_pages_slowpath.constprop.0
>                dd-479485  [011] ...1 228232.721994: __drain_all_pages <-__alloc_pages_slowpath.constprop.0
>       gnome-shell-3750    [015] ...1 228232.723729: __drain_all_pages <-__alloc_pages_slowpath.constprop.0
> 
> The commit message was indeed incorrect. Updated one:
> 
> "mm/vmstat: remove remote node draining
> 
> Draining of pages from the local pcp for a remote zone should not be
> necessary, since once the system is low on memory (or compaction on a
> zone is in effect), drain_all_pages should be called freeing any unused
> pcps."
> 
> Thanks!

Thanks for the explanation, that makes sense to me. Feel free to add my

Acked-by: David Hildenbrand <david@redhat.com>

... hoping that some others (Mel, Vlastimil?) can have another look.
Peter Xu March 2, 2023, 5:21 p.m. UTC | #4
On Thu, Feb 09, 2023 at 12:01:51PM -0300, Marcelo Tosatti wrote:
> Draining of pages from the local pcp for a remote zone was necessary
> since:
> 
> "Note that remote node draining is a somewhat esoteric feature that is
> required on large NUMA systems because otherwise significant portions
> of system memory can become trapped in pcp queues. The number of pcp is
> determined by the number of processors and nodes in a system. A system
> with 4 processors and 2 nodes has 8 pcps which is okay. But a system
> with 1024 processors and 512 nodes has 512k pcps with a high potential
> for large amount of memory being caught in them."

How about mentioning more details on where this comes from?

afaict it's from commit 4037d45 since 2007.

So I dug that out mostly because I wanted to know why we flush the pcp at
all during vmstat updates.  It already sounds weird to me, but I could have
been missing important details.

The rationale I had here is that refresh_cpu_vm_stats(true) is mostly being
called by the shepherd afaict, while:

  (1) The frequency of that interval is defined as sysctl_stat_interval,
      which so far has nothing to do with pcp pages but only with stats,
      at least going by its name, and,

  (2) vmstat_work is only queued if need_update() here:

	for_each_online_cpu(cpu) {
		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);

		if (!delayed_work_pending(dw) && need_update(cpu))
			queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);

		cond_resched();
	}

      need_update() tells us "we should flush vmstats"; it says nothing
      about "we should flush the pcp list" (see the sketch below)..
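
For reference, here is a simplified sketch of the check need_update()
performs (paraphrased, not verbatim kernel source; the real function also
walks the per-node stats).  Note that it only inspects the vmstat diffs,
never pcp->count:

	/* Simplified: a CPU needs a vmstat flush only when some of its
	 * per-cpu counter deltas are nonzero.  Pages parked on the pcp
	 * lists are never consulted here. */
	static bool need_update(int cpu)
	{
		struct zone *zone;

		for_each_populated_zone(zone) {
			struct per_cpu_zonestat *pzstats =
				per_cpu_ptr(zone->per_cpu_zonestats, cpu);

			if (memchr_inv(pzstats->vm_stat_diff, 0,
				       sizeof(pzstats->vm_stat_diff)))
				return true;
		}
		return false;
	}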

I looked into the 2007 commit; besides what Marcelo quoted, I do see
there's a major benefit of reusing cache lines, quoting from the commit:

        Move the node draining so that it is done when the vm statistics
        are updated.  At that point we are already touching all the
        cachelines with the pagesets of a processor.

However, I didn't see why it's rational to flush the pcp list when vmstat
needs flushing either.  I also don't know whether that "cacheline locality"
holds true or not, because I saw that the pcp page list was split from the
vmstats in 2021:

    commit 28f836b6777b6f42dce068a40d83a891deaaca37
    Author: Mel Gorman <mgorman@techsingularity.net>
    Date:   Mon Jun 28 19:41:38 2021 -0700

    mm/page_alloc: split per cpu page lists and zone stats

I'm not even sure my A-b or R-b is worth anything at all here; I'm just
offering something I got from git archaeology, so it may be helpful to
readers and to the reasoning for this patch.  The correctness of the
archaeology needs help from others (Christoph and Mel?)..  I would just say
that if there's anything useful or correct, it may be worth collecting some
of it into the commit log.

So from what I can tell this patch makes sense.
Peter Xu March 2, 2023, 5:27 p.m. UTC | #5
On Thu, Mar 02, 2023 at 12:21:15PM -0500, Peter Xu wrote:
> On Thu, Feb 09, 2023 at 12:01:51PM -0300, Marcelo Tosatti wrote:
> > Draining of pages from the local pcp for a remote zone was necessary
> > since:
> > 
> > "Note that remote node draining is a somewhat esoteric feature that is
> > required on large NUMA systems because otherwise significant portions
> > of system memory can become trapped in pcp queues. The number of pcp is
> > determined by the number of processors and nodes in a system. A system
> > with 4 processors and 2 nodes has 8 pcps which is okay. But a system
> > with 1024 processors and 512 nodes has 512k pcps with a high potential
> > for large amount of memory being caught in them."
> 
> How about mentioning more details on where this comes from?
> 
> afaict it's from commit 4037d45 since 2007.
> 
> So I dug that out mostly because I wanted to know why we flush the pcp at
> all during vmstat updates.  It already sounds weird to me, but I could have
> been missing important details.
> 
> The rationale I had here is that refresh_cpu_vm_stats(true) is mostly being
> called by the shepherd afaict, while:
> 
>   (1) The frequency of that interval is defined as sysctl_stat_interval,
>       which so far has nothing to do with pcp pages but only with stats,
>       at least going by its name, and,
> 
>   (2) vmstat_work is only queued if need_update() here:
> 
> 	for_each_online_cpu(cpu) {
> 		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
> 
> 		if (!delayed_work_pending(dw) && need_update(cpu))
> 			queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
> 
> 		cond_resched();
> 	}
> 
>       need_update() tells us "we should flush vmstats"; it says nothing
>       about "we should flush the pcp list"..
> 
> I looked into the 2007 commit; besides what Marcelo quoted, I do see
> there's a major benefit of reusing cache lines, quoting from the commit:
> 
>         Move the node draining so that it is done when the vm statistics
>         are updated.  At that point we are already touching all the
>         cachelines with the pagesets of a processor.
> 
> However, I didn't see why it's rational to flush the pcp list when vmstat
> needs flushing either.  I also don't know whether that "cacheline locality"
> holds true or not, because I saw that the pcp page list was split from the
> vmstats in 2021:
> 
>     commit 28f836b6777b6f42dce068a40d83a891deaaca37
>     Author: Mel Gorman <mgorman@techsingularity.net>
>     Date:   Mon Jun 28 19:41:38 2021 -0700
> 
>     mm/page_alloc: split per cpu page lists and zone stats
> 
> I'm not even sure my A-b or R-b is worth anything at all here; I'm just
> offering something I got from git archaeology, so it may be helpful to
> readers and to the reasoning for this patch.  The correctness of the
> archaeology needs help from others (Christoph and Mel?)..  I would just say
> that if there's anything useful or correct, it may be worth collecting some
> of it into the commit log.
> 
> So from what I can tell this patch makes sense.

One thing I forgot to mention, which may be a slight ABI change, is that I
think the pcp page drain is also triggered by /proc/sys/vm/stat_refresh
(even though, again, I don't see why flushing the pcp is strictly needed).
It's just that I don't know whether there's a user app that could rely on
this.

The worst case is that we drain the pcp lists explicitly in the
stat_refresh handler, but I'm not sure whether it'll be worthwhile either,
probably just to be safe.
Marcelo Tosatti March 2, 2023, 6:56 p.m. UTC | #6
On Thu, Mar 02, 2023 at 12:21:15PM -0500, Peter Xu wrote:
> On Thu, Feb 09, 2023 at 12:01:51PM -0300, Marcelo Tosatti wrote:
> > Draining of pages from the local pcp for a remote zone was necessary
> > since:
> > 
> > "Note that remote node draining is a somewhat esoteric feature that is
> > required on large NUMA systems because otherwise significant portions
> > of system memory can become trapped in pcp queues. The number of pcp is
> > determined by the number of processors and nodes in a system. A system
> > with 4 processors and 2 nodes has 8 pcps which is okay. But a system
> > with 1024 processors and 512 nodes has 512k pcps with a high potential
> > for large amount of memory being caught in them."
> 
> How about mentioning more details on where this comes from?
> 
> afaict it's from commit 4037d45 since 2007.
> 
> So I dug that out mostly because I wanted to know why we flush the pcp at
> all during vmstat updates.  It already sounds weird to me, but I could have
> been missing important details.
> 
> The rationale I had here is that refresh_cpu_vm_stats(true) is mostly being
> called by the shepherd afaict, while:
> 
>   (1) The frequency of that interval is defined as sysctl_stat_interval,
>       which so far has nothing to do with pcp pages but only with stats,
>       at least going by its name, and,
> 
>   (2) vmstat_work is only queued if need_update() here:
> 
> 	for_each_online_cpu(cpu) {
> 		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
> 
> 		if (!delayed_work_pending(dw) && need_update(cpu))
> 			queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
> 
> 		cond_resched();
> 	}
> 
>       need_update() tells us "we should flush vmstats"; it says nothing
>       about "we should flush the pcp list"..
> 
> I looked into the 2007 commit; besides what Marcelo quoted, I do see
> there's a major benefit of reusing cache lines, quoting from the commit:
> 
>         Move the node draining so that it is done when the vm statistics
>         are updated.  At that point we are already touching all the
>         cachelines with the pagesets of a processor.
> 
> However, I didn't see why it's rational to flush the pcp list when vmstat
> needs flushing either.  I also don't know whether that "cacheline locality"
> holds true or not, because I saw that the pcp page list was split from the
> vmstats in 2021:
> 
>     commit 28f836b6777b6f42dce068a40d83a891deaaca37
>     Author: Mel Gorman <mgorman@techsingularity.net>
>     Date:   Mon Jun 28 19:41:38 2021 -0700
> 
>     mm/page_alloc: split per cpu page lists and zone stats
> 
> I'm not even sure my A-b or R-b is worth anything at all here; 

"A-b or R-b" ? 

I think your points are valid.

Also, the fact that sysctl_stat_interval is large (a second or more)
means that any cache locality concern would be limited to that
time span.

> I'm just offering
> something I got from git archaeology, so it may be helpful to readers and
> to the reasoning for this patch.  

> The correctness of the archaeology needs help from
> others (Christoph and Mel?)..  I would just say that if there's anything
> useful or correct, it may be worth collecting some of it into the commit log.

Agreed, I forgot to include the commit id in the changelog.

> So from what I can tell this patch makes sense.

Thanks!
Marcelo Tosatti March 2, 2023, 7:17 p.m. UTC | #7
On Thu, Mar 02, 2023 at 12:27:11PM -0500, Peter Xu wrote:
> On Thu, Mar 02, 2023 at 12:21:15PM -0500, Peter Xu wrote:
> > On Thu, Feb 09, 2023 at 12:01:51PM -0300, Marcelo Tosatti wrote:
> > > Draining of pages from the local pcp for a remote zone was necessary
> > > since:
> > > 
> > > "Note that remote node draining is a somewhat esoteric feature that is
> > > required on large NUMA systems because otherwise significant portions
> > > of system memory can become trapped in pcp queues. The number of pcp is
> > > determined by the number of processors and nodes in a system. A system
> > > with 4 processors and 2 nodes has 8 pcps which is okay. But a system
> > > with 1024 processors and 512 nodes has 512k pcps with a high potential
> > > for large amount of memory being caught in them."
> > 
> > How about mentioning more details on where this comes from?
> > 
> > afaict it's from commit 4037d45 since 2007.
> > 
> > So I dug that out mostly because I wanted to know why we flush the pcp at
> > all during vmstat updates.  It already sounds weird to me, but I could have
> > been missing important details.
> > 
> > The rationale I had here is that refresh_cpu_vm_stats(true) is mostly being
> > called by the shepherd afaict, while:
> > 
> >   (1) The frequency of that interval is defined as sysctl_stat_interval,
> >       which so far has nothing to do with pcp pages but only with stats,
> >       at least going by its name, and,
> > 
> >   (2) vmstat_work is only queued if need_update() here:
> > 
> > 	for_each_online_cpu(cpu) {
> > 		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
> > 
> > 		if (!delayed_work_pending(dw) && need_update(cpu))
> > 			queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
> > 
> > 		cond_resched();
> > 	}
> > 
> >       need_update() tells us "we should flush vmstats"; it says nothing
> >       about "we should flush the pcp list"..
> > 
> > I looked into the 2007 commit; besides what Marcelo quoted, I do see
> > there's a major benefit of reusing cache lines, quoting from the commit:
> > 
> >         Move the node draining so that it is done when the vm statistics
> >         are updated.  At that point we are already touching all the
> >         cachelines with the pagesets of a processor.
> > 
> > However, I didn't see why it's rational to flush the pcp list when vmstat
> > needs flushing either.  I also don't know whether that "cacheline locality"
> > holds true or not, because I saw that the pcp page list was split from the
> > vmstats in 2021:
> > 
> >     commit 28f836b6777b6f42dce068a40d83a891deaaca37
> >     Author: Mel Gorman <mgorman@techsingularity.net>
> >     Date:   Mon Jun 28 19:41:38 2021 -0700
> > 
> >     mm/page_alloc: split per cpu page lists and zone stats
> > 
> > I'm not even sure my A-b or R-b is worth anything at all here; I'm just
> > offering something I got from git archaeology, so it may be helpful to
> > readers and to the reasoning for this patch.  The correctness of the
> > archaeology needs help from others (Christoph and Mel?)..  I would just say
> > that if there's anything useful or correct, it may be worth collecting some
> > of it into the commit log.
> > 
> > So from what I can tell this patch makes sense.
> 
> One thing I forgot to mention, which may be a slight ABI change, is that I
> think the pcp page drain is also triggered by /proc/sys/vm/stat_refresh
> (even though, again, I don't see why flushing the pcp is strictly needed).
> It's just that I don't know whether there's a user app that could rely on
> this.
> 
> The worst case is that we drain the pcp lists explicitly in the
> stat_refresh handler, but I'm not sure whether it'll be worthwhile either,
> probably just to be safe.

This is a good point.

Documentation/admin-guide/sysctl/vm.rst:

stat_refresh
============

Any read or write (by root only) flushes all the per-cpu vm statistics
into their global totals, for more accurate reports when testing
e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo

As a side-effect, it also checks for negative totals (elsewhere reported
as 0) and "fails" with EINVAL if any are found, with a warning in dmesg.
(At time of writing, a few stats are known sometimes to be found negative,
with no ill effects: errors and warnings on these stats are suppressed.)


Will add a "drain_all_pages(NULL)" call to the start of the stat_refresh
handler.
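
Something like the following (a hypothetical sketch with the existing
body abbreviated -- not the actual posted change):

	int vmstat_refresh(struct ctl_table *table, int write,
			   void *buffer, size_t *lenp, loff_t *ppos)
	{
		int err;

		/*
		 * Hypothetical addition: preserve the old side effect of
		 * stat_refresh by explicitly draining the pcp lists
		 * before folding the counters.
		 */
		drain_all_pages(NULL);

		err = schedule_on_each_cpu(refresh_vm_stats);
		if (err)
			return err;
		/* ... existing negative-counter checks unchanged ... */
	}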

Thanks.
Mel Gorman March 21, 2023, 3:20 p.m. UTC | #8
On Thu, Mar 02, 2023 at 11:10:03AM +0100, David Hildenbrand wrote:
> [...]
> 
> > 
> > > (2) drain_zone_pages() documents that we're draining the PCP
> > >      (bulk-freeing them) of the current CPU on remote nodes. That bulk-
> > >      freeing will properly adjust free memory counters. What exactly is
> > >      the impact when no longer doing that? Won't the "snapshot" of some
> > >      counters eventually be wrong? Do we care?
> > 
> > Don't see why the snapshot of counters will be wrong.
> > 
> > Instead of freeing pages on pcp list of remote nodes after they are
> > considered idle ("3 seconds idle till flush"), what will happen is that
> > drain_all_pages() will free those pcps, for example after an allocation
> > fails on direct reclaim:
> > 
> >          page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
> > 
> >          /*
> >           * If an allocation failed after direct reclaim, it could be because
> >           * pages are pinned on the per-cpu lists or in high alloc reserves.
> >           * Shrink them and try again
> >           */
> >          if (!page && !drained) {
> >                  unreserve_highatomic_pageblock(ac, false);
> >                  drain_all_pages(NULL);
> >                  drained = true;
> >                  goto retry;
> >          }
> > 
> > In both cases the pages are freed (and counters maintained) here:
> > 
> > static inline void __free_one_page(struct page *page,
> >                  unsigned long pfn,
> >                  struct zone *zone, unsigned int order,
> >                  int migratetype, fpi_t fpi_flags)
> > {
> >          struct capture_control *capc = task_capc(zone);
> >          unsigned long buddy_pfn = 0;
> >          unsigned long combined_pfn;
> >          struct page *buddy;
> >          bool to_tail;
> > 
> >          VM_BUG_ON(!zone_is_initialized(zone));
> >          VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
> > 
> >          VM_BUG_ON(migratetype == -1);
> >          if (likely(!is_migrate_isolate(migratetype)))
> >                  __mod_zone_freepage_state(zone, 1 << order, migratetype);
> > 
> >          VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
> >          VM_BUG_ON_PAGE(bad_range(zone, page), page);
> > 
> >          while (order < MAX_ORDER - 1) {
> >                  if (compaction_capture(capc, page, order, migratetype)) {
> >                          __mod_zone_freepage_state(zone, -(1 << order),
> >                                                                  migratetype);
> >                          return;
> >                  }
> > 
> > > Describing the difference between instructed refresh of vmstat and "remotely
> > > drain per-cpu lists" in order to move free memory from the pcp to the buddy
> > > would be great.
> > 
> > The difference is that now remote PCPs will be drained on demand, either via
> > kcompactd or direct reclaim (through drain_all_pages), when memory is
> > low.
> > 
> > For example, with the following test:
> > 
> > dd if=/dev/zero of=file bs=1M count=32000 on a tmpfs filesystem:
> > 
> >        kcompactd0-116     [005] ...1 228232.042873: drain_all_pages <-kcompactd_do_work
> >        kcompactd0-116     [005] ...1 228232.042873: __drain_all_pages <-kcompactd_do_work
> >                dd-479485  [003] ...1 228232.455130: __drain_all_pages <-__alloc_pages_slowpath.constprop.0
> >                dd-479485  [011] ...1 228232.721994: __drain_all_pages <-__alloc_pages_slowpath.constprop.0
> >       gnome-shell-3750    [015] ...1 228232.723729: __drain_all_pages <-__alloc_pages_slowpath.constprop.0
> > 
> > The commit message was indeed incorrect. Updated one:
> > 
> > "mm/vmstat: remove remote node draining
> > 
> > Draining of pages from the local pcp for a remote zone should not be
> > necessary, since once the system is low on memory (or compaction on a
> > zone is in effect), drain_all_pages should be called freeing any unused
> > pcps."
> > 
> > Thanks!
> 
> Thanks for the explanation, that makes sense to me. Feel free to add my
> 
> Acked-by: David Hildenbrand <david@redhat.com>
> 
> ... hoping that some others (Mel, Vlastimil?) can have another look.
> 

I was on extended leave and am still in the process of triaging a few
thousand mails so I'm working off memory here instead of the code. This
is a straight-forward enough question to answer quickly in case I forget
later.

Short answer: I'm not a fan of the patch in concept and I do not think it
should be merged.

I agree that drain_all_pages() would free the PCP pages on demand in
direct reclaim context but it happens after reclaim has already
happened. Hence, the reclaim may be unnecessary and may cause overreclaim
in some situations due to remote CPUs pinning memory in PCP lists.

Similarly, kswapd may trigger early because PCP pages do not contribute
to NR_FREE_PAGES so watermark checks can fail even though pages are
free, just inaccessible.
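
To illustrate (a simplified sketch, not kernel source): pages freed into a
pcp list are only added back to NR_FREE_PAGES once __free_one_page() moves
them to the buddy lists (via __mod_zone_freepage_state(), as quoted earlier
in this thread), so a watermark-style check undercounts free memory by
whatever sits in the pcp lists:

	/* Sketch: how watermark checks see free memory. */
	static bool watermark_ok_sketch(struct zone *zone, unsigned long mark)
	{
		/* Pages parked on pcp lists are "free" but invisible
		 * here, which is why leaving them parked longer can
		 * wake kswapd prematurely. */
		return zone_page_state(zone, NR_FREE_PAGES) > mark;
	}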

Finally, remote pages expire because ideally CPUs allocate local memory
assuming memory policies are not forcing use of remote nodes. The expiry
means that remote pages get freed back to the buddy lists after a short
period. By removing the expiry, it's possible that a local allocation will
fail and spill over to a remote node prematurely because free pages were
pinned on the PCP lists.

As this patch has the possibility of reclaiming early in both direct and
kswapd context and increases the risk of remote node fallback, I think it
needs much stronger justification and a warning about the side-effects. For
this version unless I'm very wrong -- NAK :(
Marcelo Tosatti March 21, 2023, 5:31 p.m. UTC | #9
On Tue, Mar 21, 2023 at 03:20:31PM +0000, Mel Gorman wrote:
> On Thu, Mar 02, 2023 at 11:10:03AM +0100, David Hildenbrand wrote:
> > [...]
> > 
> > > 
> > > > (2) drain_zone_pages() documents that we're draining the PCP
> > > >      (bulk-freeing them) of the current CPU on remote nodes. That bulk-
> > > >      freeing will properly adjust free memory counters. What exactly is
> > > >      the impact when no longer doing that? Won't the "snapshot" of some
> > > >      counters eventually be wrong? Do we care?
> > > 
> > > Don't see why the snapshot of counters will be wrong.
> > > 
> > > Instead of freeing pages on pcp list of remote nodes after they are
> > > considered idle ("3 seconds idle till flush"), what will happen is that
> > > drain_all_pages() will free those pcps, for example after an allocation
> > > fails on direct reclaim:
> > > 
> > >          page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
> > > 
> > >          /*
> > >           * If an allocation failed after direct reclaim, it could be because
> > >           * pages are pinned on the per-cpu lists or in high alloc reserves.
> > >           * Shrink them and try again
> > >           */
> > >          if (!page && !drained) {
> > >                  unreserve_highatomic_pageblock(ac, false);
> > >                  drain_all_pages(NULL);
> > >                  drained = true;
> > >                  goto retry;
> > >          }
> > > 
> > > In both cases the pages are freed (and counters maintained) here:
> > > 
> > > static inline void __free_one_page(struct page *page,
> > >                  unsigned long pfn,
> > >                  struct zone *zone, unsigned int order,
> > >                  int migratetype, fpi_t fpi_flags)
> > > {
> > >          struct capture_control *capc = task_capc(zone);
> > >          unsigned long buddy_pfn = 0;
> > >          unsigned long combined_pfn;
> > >          struct page *buddy;
> > >          bool to_tail;
> > > 
> > >          VM_BUG_ON(!zone_is_initialized(zone));
> > >          VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
> > > 
> > >          VM_BUG_ON(migratetype == -1);
> > >          if (likely(!is_migrate_isolate(migratetype)))
> > >                  __mod_zone_freepage_state(zone, 1 << order, migratetype);
> > > 
> > >          VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
> > >          VM_BUG_ON_PAGE(bad_range(zone, page), page);
> > > 
> > >          while (order < MAX_ORDER - 1) {
> > >                  if (compaction_capture(capc, page, order, migratetype)) {
> > >                          __mod_zone_freepage_state(zone, -(1 << order),
> > >                                                                  migratetype);
> > >                          return;
> > >                  }
> > > 
> > > > Describing the difference between instructed refresh of vmstat and "remotely
> > > > drain per-cpu lists" in order to move free memory from the pcp to the buddy
> > > > would be great.
> > > 
> > > The difference is that now remote PCPs will be drained on demand, either via
> > > kcompactd or direct reclaim (through drain_all_pages), when memory is
> > > low.
> > > 
> > > For example, with the following test:
> > > 
> > > dd if=/dev/zero of=file bs=1M count=32000 on a tmpfs filesystem:
> > > 
> > >        kcompactd0-116     [005] ...1 228232.042873: drain_all_pages <-kcompactd_do_work
> > >        kcompactd0-116     [005] ...1 228232.042873: __drain_all_pages <-kcompactd_do_work
> > >                dd-479485  [003] ...1 228232.455130: __drain_all_pages <-__alloc_pages_slowpath.constprop.0
> > >                dd-479485  [011] ...1 228232.721994: __drain_all_pages <-__alloc_pages_slowpath.constprop.0
> > >       gnome-shell-3750    [015] ...1 228232.723729: __drain_all_pages <-__alloc_pages_slowpath.constprop.0
> > > 
> > > The commit message was indeed incorrect. Updated one:
> > > 
> > > "mm/vmstat: remove remote node draining
> > > 
> > > Draining of pages from the local pcp for a remote zone should not be
> > > necessary, since once the system is low on memory (or compaction on a
> > > zone is in effect), drain_all_pages should be called freeing any unused
> > > pcps."
> > > 
> > > Thanks!
> > 
> > Thanks for the explanation, that makes sense to me. Feel free to add my
> > 
> > Acked-by: David Hildenbrand <david@redhat.com>
> > 
> > ... hoping that some others (Mel, Vlastimil?) can have another look.
> > 
> 
> I was on extended leave and am still in the process of triaging a few
> thousand mails so I'm working off memory here instead of the code. This
> is a straight-forward enough question to answer quickly in case I forget
> later.
> 
> Short answer: I'm not a fan of the patch in concept and I do not think it
> should be merged.
> 
> I agree that drain_all_pages() would free the PCP pages on demand in
> direct reclaim context but it happens after reclaim has already
> happened. Hence, the reclaim may be unnecessary and may cause overreclaim
> in some situations due to remote CPUs pinning memory in PCP lists.
> 
> Similarly, kswapd may trigger early because PCP pages do not contribute
> to NR_FREE_PAGES so watermark checks can fail even though pages are
> free, just inaccessible.
> 
> Finally, remote pages expire because ideally CPUs allocate local memory
> assuming memory policies are not forcing use of remote nodes. The expiry
> means that remote pages get freed back to the buddy lists after a short
> period. By removing the expiry, it's possible that a local allocation will
> fail and spill over to a remote node prematurely because free pages were
> pinned on the PCP lists.
> 
> As this patch has the possibility of reclaiming early in both direct and
> kswapd context and increases the risk of remote node fallback, I think it
> needs much stronger justification and a warning about the side-effects. For
> this version unless I'm very wrong -- NAK :(

Mel,

Agreed. -v7 of the series dropped this patch and instead implements
draining of non-local NUMA node memory from the pcp caches in
cpu_vm_stats_fold (which vmstat_shepherd might execute remotely).
If you can take a look at -v7, that would be awesome.

Subject: [PATCH v7 13/13] vmstat: add pcp remote node draining via cpu_vm_stats_fold

Large NUMA systems might have significant portions
of system memory trapped in pcp queues. The number of pcp is
determined by the number of processors and nodes in a system. A system
with 4 processors and 2 nodes has 8 pcps which is okay. But a system
with 1024 processors and 512 nodes has 512k pcps with a high potential
for large amount of memory being caught in them.

Enable remote node draining for the CONFIG_HAVE_CMPXCHG_LOCAL case,
where vmstat_shepherd will perform the aging and draining via
cpu_vm_stats_fold.

Suggested-by: Vlastimil Babka <vbabka@suse.cz>
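
Roughly, the idea is that the shepherd, while folding a remote CPU's
stats, also ages and drains that CPU's pcps for zones on other nodes.
A concept-only sketch (hypothetical helper, not the actual v7 code):

	/* Hypothetical: age and drain the remote-zone pcps of @cpu while
	 * folding its vmstats from vmstat_shepherd. */
	static void fold_and_drain_remote(int cpu)
	{
		struct zone *zone;

		for_each_populated_zone(zone) {
			struct per_cpu_pages *pcp =
				per_cpu_ptr(zone->per_cpu_pageset, cpu);

			/* Never drain zones local to @cpu. */
			if (zone_to_nid(zone) == cpu_to_node(cpu))
				continue;

			if (!pcp->count)
				continue;

			/* Aging: "3 seconds idle till flush". */
			if (pcp->expire > 0 && --pcp->expire == 0)
				drain_zone_pages(zone, pcp);
		}
	}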





Patch

Index: linux-vmstat-remote/include/linux/mmzone.h
===================================================================
--- linux-vmstat-remote.orig/include/linux/mmzone.h
+++ linux-vmstat-remote/include/linux/mmzone.h
@@ -577,9 +577,6 @@  struct per_cpu_pages {
 	int high;		/* high watermark, emptying needed */
 	int batch;		/* chunk size for buddy add/remove */
 	short free_factor;	/* batch scaling factor during free */
-#ifdef CONFIG_NUMA
-	short expire;		/* When 0, remote pagesets are drained */
-#endif
 
 	/* Lists of pages, one per migrate type stored on the pcp-lists */
 	struct list_head lists[NR_PCP_LISTS];
Index: linux-vmstat-remote/mm/vmstat.c
===================================================================
--- linux-vmstat-remote.orig/mm/vmstat.c
+++ linux-vmstat-remote/mm/vmstat.c
@@ -803,7 +803,7 @@  static int fold_diff(int *zone_diff, int
  *
  * The function returns the number of global counters updated.
  */
-static int refresh_cpu_vm_stats(bool do_pagesets)
+static int refresh_cpu_vm_stats(void)
 {
 	struct pglist_data *pgdat;
 	struct zone *zone;
@@ -814,9 +814,6 @@  static int refresh_cpu_vm_stats(bool do_
 
 	for_each_populated_zone(zone) {
 		struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
-#ifdef CONFIG_NUMA
-		struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
-#endif
 
 		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
 			int v;
@@ -826,44 +823,8 @@  static int refresh_cpu_vm_stats(bool do_
 
 				atomic_long_add(v, &zone->vm_stat[i]);
 				global_zone_diff[i] += v;
-#ifdef CONFIG_NUMA
-				/* 3 seconds idle till flush */
-				__this_cpu_write(pcp->expire, 3);
-#endif
 			}
 		}
-#ifdef CONFIG_NUMA
-
-		if (do_pagesets) {
-			cond_resched();
-			/*
-			 * Deal with draining the remote pageset of this
-			 * processor
-			 *
-			 * Check if there are pages remaining in this pageset
-			 * if not then there is nothing to expire.
-			 */
-			if (!__this_cpu_read(pcp->expire) ||
-			       !__this_cpu_read(pcp->count))
-				continue;
-
-			/*
-			 * We never drain zones local to this processor.
-			 */
-			if (zone_to_nid(zone) == numa_node_id()) {
-				__this_cpu_write(pcp->expire, 0);
-				continue;
-			}
-
-			if (__this_cpu_dec_return(pcp->expire))
-				continue;
-
-			if (__this_cpu_read(pcp->count)) {
-				drain_zone_pages(zone, this_cpu_ptr(pcp));
-				changes++;
-			}
-		}
-#endif
 	}
 
 	for_each_online_pgdat(pgdat) {
@@ -1864,7 +1825,7 @@  int sysctl_stat_interval __read_mostly =
 #ifdef CONFIG_PROC_FS
 static void refresh_vm_stats(struct work_struct *work)
 {
-	refresh_cpu_vm_stats(true);
+	refresh_cpu_vm_stats();
 }
 
 int vmstat_refresh(struct ctl_table *table, int write,
@@ -1928,7 +1889,7 @@  int vmstat_refresh(struct ctl_table *tab
 
 static void vmstat_update(struct work_struct *w)
 {
-	if (refresh_cpu_vm_stats(true)) {
+	if (refresh_cpu_vm_stats()) {
 		/*
 		 * Counters were updated so we expect more updates
 		 * to occur in the future. Keep on running the
@@ -1991,7 +1952,7 @@  void quiet_vmstat(void)
 	 * it would be too expensive from this path.
 	 * vmstat_shepherd will take care about that for us.
 	 */
-	refresh_cpu_vm_stats(false);
+	refresh_cpu_vm_stats();
 }
 
 /*