diff mbox series

[2/5] mm/page_alloc: Add a bulk page allocator

Message ID 20210310104618.22750-3-mgorman@techsingularity.net (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series Introduce a bulk order-0 page allocator with two in-tree users | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Guessed tree name to be net-next
netdev/subject_prefix success Link
netdev/cc_maintainers success CCed 2 of 2 maintainers
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit fail Errors and warnings before: 23512 this patch: 23514
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning CHECK: Alignment should match open parenthesis CHECK: extern prototypes should be avoided in .h files WARNING: line length of 103 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 96 exceeds 80 columns
netdev/build_allmodconfig_warn fail Errors and warnings before: 23141 this patch: 23143
netdev/header_inline success Link

Commit Message

Mel Gorman March 10, 2021, 10:46 a.m. UTC
This patch adds a new page allocator interface via alloc_pages_bulk,
and __alloc_pages_bulk_nodemask. A caller requests a number of pages
to be allocated and added to a list. They can be freed in bulk using
free_pages_bulk().

The API is not guaranteed to return the requested number of pages and
may fail if the preferred allocation zone has limited free memory, the
cpuset changes during the allocation or page debugging decides to fail
an allocation. It's up to the caller to request more pages in batch
if necessary.

Note that this implementation is not very efficient and could be improved
but it would require refactoring. The intent is to make it available early
to determine what semantics are required by different callers. Once the
full semantics are nailed down, it can be refactored.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 include/linux/gfp.h |  13 +++++
 mm/page_alloc.c     | 113 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 124 insertions(+), 2 deletions(-)

Comments

Andrew Morton March 10, 2021, 11:46 p.m. UTC | #1
On Wed, 10 Mar 2021 10:46:15 +0000 Mel Gorman <mgorman@techsingularity.net> wrote:

> This patch adds a new page allocator interface via alloc_pages_bulk,
> and __alloc_pages_bulk_nodemask. A caller requests a number of pages
> to be allocated and added to a list. They can be freed in bulk using
> free_pages_bulk().

Why am I surprised we don't already have this.

> The API is not guaranteed to return the requested number of pages and
> may fail if the preferred allocation zone has limited free memory, the
> cpuset changes during the allocation or page debugging decides to fail
> an allocation. It's up to the caller to request more pages in batch
> if necessary.
> 
> Note that this implementation is not very efficient and could be improved
> but it would require refactoring. The intent is to make it available early
> to determine what semantics are required by different callers. Once the
> full semantics are nailed down, it can be refactored.
> 
> ...
>
> +/* Drop reference counts and free order-0 pages from a list. */
> +void free_pages_bulk(struct list_head *list)
> +{
> +	struct page *page, *next;
> +
> +	list_for_each_entry_safe(page, next, list, lru) {
> +		trace_mm_page_free_batched(page);
> +		if (put_page_testzero(page)) {
> +			list_del(&page->lru);
> +			__free_pages_ok(page, 0, FPI_NONE);
> +		}
> +	}
> +}
> +EXPORT_SYMBOL_GPL(free_pages_bulk);

I expect that batching games are planned in here as well?

>  static inline unsigned int
>  gfp_to_alloc_flags(gfp_t gfp_mask)
>  {
> @@ -4919,6 +4934,9 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
>  		struct alloc_context *ac, gfp_t *alloc_mask,
>  		unsigned int *alloc_flags)
>  {
> +	gfp_mask &= gfp_allowed_mask;
> +	*alloc_mask = gfp_mask;
> +
>  	ac->highest_zoneidx = gfp_zone(gfp_mask);
>  	ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
>  	ac->nodemask = nodemask;
> @@ -4960,6 +4978,99 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
>  	return true;
>  }
>  
> +/*
> + * This is a batched version of the page allocator that attempts to
> + * allocate nr_pages quickly from the preferred zone and add them to list.
> + */

Documentation is rather lame.  Returns number of pages allocated...

> +int __alloc_pages_bulk_nodemask(gfp_t gfp_mask, int preferred_nid,
> +			nodemask_t *nodemask, int nr_pages,
> +			struct list_head *alloc_list)
> +{
> +	struct page *page;
> +	unsigned long flags;
> +	struct zone *zone;
> +	struct zoneref *z;
> +	struct per_cpu_pages *pcp;
> +	struct list_head *pcp_list;
> +	struct alloc_context ac;
> +	gfp_t alloc_mask;
> +	unsigned int alloc_flags;
> +	int alloced = 0;
> +
> +	if (nr_pages == 1)
> +		goto failed;
> +
> +	/* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
> +	if (!prepare_alloc_pages(gfp_mask, 0, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
> +		return 0;
> +	gfp_mask = alloc_mask;
> +
> +	/* Find an allowed local zone that meets the high watermark. */
> +	for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
> +		unsigned long mark;
> +
> +		if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
> +		    !__cpuset_zone_allowed(zone, gfp_mask)) {
> +			continue;
> +		}
> +
> +		if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone &&
> +		    zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) {
> +			goto failed;
> +		}
> +
> +		mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
> +		if (zone_watermark_fast(zone, 0,  mark,
> +				zonelist_zone_idx(ac.preferred_zoneref),
> +				alloc_flags, gfp_mask)) {
> +			break;
> +		}
> +	}

I suspect the above was stolen from elsewhere and that some code
commonification is planned.


> +	if (!zone)
> +		return 0;
> +
> +	/* Attempt the batch allocation */
> +	local_irq_save(flags);
> +	pcp = &this_cpu_ptr(zone->pageset)->pcp;
> +	pcp_list = &pcp->lists[ac.migratetype];
> +
> +	while (alloced < nr_pages) {
> +		page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags,
> +								pcp, pcp_list);
> +		if (!page)
> +			break;
> +
> +		prep_new_page(page, 0, gfp_mask, 0);

I wonder if it would be worth running prep_new_page() in a second pass,
after reenabling interrupts.

Speaking of which, will the realtime people get upset about the
irqs-off latency?  How many pages are we talking about here?

> +		list_add(&page->lru, alloc_list);
> +		alloced++;
> +	}
> +
> +	if (!alloced)
> +		goto failed_irq;
> +
> +	if (alloced) {
> +		__count_zid_vm_events(PGALLOC, zone_idx(zone), alloced);
> +		zone_statistics(zone, zone);
> +	}
> +
> +	local_irq_restore(flags);
> +
> +	return alloced;
> +
> +failed_irq:
> +	local_irq_restore(flags);
> +
> +failed:

Might we need some counter to show how often this path happens?

> +	page = __alloc_pages_nodemask(gfp_mask, 0, preferred_nid, nodemask);
> +	if (page) {
> +		alloced++;
> +		list_add(&page->lru, alloc_list);
> +	}
> +
> +	return alloced;
> +}
> +EXPORT_SYMBOL_GPL(__alloc_pages_bulk_nodemask);
> +
Mel Gorman March 11, 2021, 8:42 a.m. UTC | #2
On Wed, Mar 10, 2021 at 03:46:50PM -0800, Andrew Morton wrote:
> On Wed, 10 Mar 2021 10:46:15 +0000 Mel Gorman <mgorman@techsingularity.net> wrote:
> 
> > This patch adds a new page allocator interface via alloc_pages_bulk,
> > and __alloc_pages_bulk_nodemask. A caller requests a number of pages
> > to be allocated and added to a list. They can be freed in bulk using
> > free_pages_bulk().
> 
> Why am I surprised we don't already have this.
> 

It was prototyped a few years ago and discussed at LSF/MM so it's in
the back of your memory somewhere. It never got merged because it lacked
users and I didn't think carrying dead untested code was appropriate.

> > The API is not guaranteed to return the requested number of pages and
> > may fail if the preferred allocation zone has limited free memory, the
> > cpuset changes during the allocation or page debugging decides to fail
> > an allocation. It's up to the caller to request more pages in batch
> > if necessary.
> > 
> > Note that this implementation is not very efficient and could be improved
> > but it would require refactoring. The intent is to make it available early
> > to determine what semantics are required by different callers. Once the
> > full semantics are nailed down, it can be refactored.
> > 
> > ...
> >
> > +/* Drop reference counts and free order-0 pages from a list. */
> > +void free_pages_bulk(struct list_head *list)
> > +{
> > +	struct page *page, *next;
> > +
> > +	list_for_each_entry_safe(page, next, list, lru) {
> > +		trace_mm_page_free_batched(page);
> > +		if (put_page_testzero(page)) {
> > +			list_del(&page->lru);
> > +			__free_pages_ok(page, 0, FPI_NONE);
> > +		}
> > +	}
> > +}
> > +EXPORT_SYMBOL_GPL(free_pages_bulk);
> 
> I expect that batching games are planned in here as well?
> 

Potentially it could be done but the page allocator would need to be
fundamentally aware of batching to make it tidy or the per-cpu allocator
would need knowledge of how to handle batches in the free path.  Batch
freeing to the buddy allocator is problematic as buddy merging has to
happen. Batch freeing to per-cpu hits pcp->high limitations.

There are a couple of ways it *could* be done. Per-cpu lists could be
allowed to temporarily exceed the high limits and reduce them out-of-band
like what happens with counter updates or remote pcp freeing. Care
would need to be taken when memory is low to avoid premature OOM
and to guarantee draining happens in a timely fashion. There would be
additional benefits to this. For example, release_pages() can hammer the
zone lock when freeing very large batches and would benefit from either
large batching or "plugging" the per-cpu list. I prototyped a series to
allow the batch limits to be temporarily exceeded but it did not actually
improve performance because of errors in the implementation and it needs
a lot of work.

> >  static inline unsigned int
> >  gfp_to_alloc_flags(gfp_t gfp_mask)
> >  {
> > @@ -4919,6 +4934,9 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
> >  		struct alloc_context *ac, gfp_t *alloc_mask,
> >  		unsigned int *alloc_flags)
> >  {
> > +	gfp_mask &= gfp_allowed_mask;
> > +	*alloc_mask = gfp_mask;
> > +
> >  	ac->highest_zoneidx = gfp_zone(gfp_mask);
> >  	ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
> >  	ac->nodemask = nodemask;
> > @@ -4960,6 +4978,99 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
> >  	return true;
> >  }
> >  
> > +/*
> > + * This is a batched version of the page allocator that attempts to
> > + * allocate nr_pages quickly from the preferred zone and add them to list.
> > + */
> 
> Documentation is rather lame.  Returns number of pages allocated...
> 

I added a note on the return value. The documentation is lame because at
this point, we do not know what the required semantics for future users
are. We have two examples at the moment in this series but I think it
would be better to add kerneldoc documentation when there is a reasonable
expectation that the API will not change. For example, SLUB could use
this API when it fails to allocate a high-order page and instead allocate
batches of order-0 pages but I did not investigate how feasible that
is. Similarly, it's possible that we really need to deal with high-order
batch allocations in which case, the per-cpu list should be high-order
aware or the core buddy allocator needs to be batch-allocation aware.

> > +int __alloc_pages_bulk_nodemask(gfp_t gfp_mask, int preferred_nid,
> > +			nodemask_t *nodemask, int nr_pages,
> > +			struct list_head *alloc_list)
> > +{
> > +	struct page *page;
> > +	unsigned long flags;
> > +	struct zone *zone;
> > +	struct zoneref *z;
> > +	struct per_cpu_pages *pcp;
> > +	struct list_head *pcp_list;
> > +	struct alloc_context ac;
> > +	gfp_t alloc_mask;
> > +	unsigned int alloc_flags;
> > +	int alloced = 0;
> > +
> > +	if (nr_pages == 1)
> > +		goto failed;
> > +
> > +	/* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
> > +	if (!prepare_alloc_pages(gfp_mask, 0, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
> > +		return 0;
> > +	gfp_mask = alloc_mask;
> > +
> > +	/* Find an allowed local zone that meets the high watermark. */
> > +	for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
> > +		unsigned long mark;
> > +
> > +		if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
> > +		    !__cpuset_zone_allowed(zone, gfp_mask)) {
> > +			continue;
> > +		}
> > +
> > +		if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone &&
> > +		    zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) {
> > +			goto failed;
> > +		}
> > +
> > +		mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
> > +		if (zone_watermark_fast(zone, 0,  mark,
> > +				zonelist_zone_idx(ac.preferred_zoneref),
> > +				alloc_flags, gfp_mask)) {
> > +			break;
> > +		}
> > +	}
> 
> I suspect the above was stolen from elsewhere and that some code
> commonification is planned.
> 

It's based on get_page_from_freelist. It would be messy to have them share
common code at this point with a risk that the fast path for the common
path (single page requests) would be impaired. The issue is that the
fast path and slow paths have zonelist iteration, kswapd wakeup, cpuset
enforcement and reclaim actions all mixed together at various different
points. The locking is also mixed up with per-cpu list locking, statistic
locking and buddy locking all having inappropriate overlaps (e.g. IRQ
disabling protects per-cpu list locking, partially and unnecessarily
protects statistics depending on architecture and overlaps with the
IRQ-safe zone lock.

Ironing this out risks hurting the single page allocation path. It would
need to be done incrementally with ultimately the core of the allocator
dealing with batches to avoid false bisections.

> 
> > +	if (!zone)
> > +		return 0;
> > +
> > +	/* Attempt the batch allocation */
> > +	local_irq_save(flags);
> > +	pcp = &this_cpu_ptr(zone->pageset)->pcp;
> > +	pcp_list = &pcp->lists[ac.migratetype];
> > +
> > +	while (alloced < nr_pages) {
> > +		page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags,
> > +								pcp, pcp_list);
> > +		if (!page)
> > +			break;
> > +
> > +		prep_new_page(page, 0, gfp_mask, 0);
> 
> I wonder if it would be worth running prep_new_page() in a second pass,
> after reenabling interrupts.
> 

Possibly, I could add another patch on top that does this because it's
trading the time that IRQs are disabled for a list iteration.

> Speaking of which, will the realtime people get upset about the
> irqs-off latency?  How many pages are we talking about here?
> 

At the moment, it looks like batches of up to a few hundred at worst. I
don't think realtime sensitive applications are likely to be using the
bulk allocator API at this point.

The realtime people have a worse problem in that the per-cpu list does
not use local_lock and disable IRQs more than it needs to on x86 in
particular. I've a prototype series for this as well which splits the
locking for the per-cpu list and statistic handling and then converts the
per-cpu list to local_lock but I'm getting this off the table first because
I don't want multiple page allocator series in flight at the same time.
Thomas, Peter and Ingo would need to be cc'd on that series to review
the local_lock aspects.

Even with local_lock, it's not clear to me why per-cpu lists need to be
locked at all because potentially it could use a lock-free llist with some
struct page overloading. That one is harder to predict when batches are
taken into account as splicing a batch of free pages with llist would be
unsafe so batch free might exchange IRQ disabling overhead with multiple
atomics. I'd need to recheck things like whether NMI handlers ever call
the page allocator (they shouldn't but it should be checked).  It would
need a lot of review and testing.

> > +		list_add(&page->lru, alloc_list);
> > +		alloced++;
> > +	}
> > +
> > +	if (!alloced)
> > +		goto failed_irq;
> > +
> > +	if (alloced) {
> > +		__count_zid_vm_events(PGALLOC, zone_idx(zone), alloced);
> > +		zone_statistics(zone, zone);
> > +	}
> > +
> > +	local_irq_restore(flags);
> > +
> > +	return alloced;
> > +
> > +failed_irq:
> > +	local_irq_restore(flags);
> > +
> > +failed:
> 
> Might we need some counter to show how often this path happens?
> 

I think that would be overkill at this point. It only gives useful
information to a developer using the API for the first time and that can
be done with a debugging patch (or probes if you're feeling creative).
I'm already unhappy with the counter overhead in the page allocator.
zone_statistics in particular has no business being an accurate statistic.
It should have been a best-effort counter like vm_events that does not need
IRQs to be disabled. If that was a simply counter as opposed to an accurate
statistic then a failure counter at failed_irq would be very cheap to add.
Jesper Dangaard Brouer March 12, 2021, 11:46 a.m. UTC | #3
On Thu, 11 Mar 2021 08:42:00 +0000
Mel Gorman <mgorman@techsingularity.net> wrote:

> On Wed, Mar 10, 2021 at 03:46:50PM -0800, Andrew Morton wrote:
> > On Wed, 10 Mar 2021 10:46:15 +0000 Mel Gorman <mgorman@techsingularity.net> wrote:
> >   
> > > This patch adds a new page allocator interface via alloc_pages_bulk,
> > > and __alloc_pages_bulk_nodemask. A caller requests a number of pages
> > > to be allocated and added to a list. They can be freed in bulk using
> > > free_pages_bulk().  
> > 
> > Why am I surprised we don't already have this.
> >   
> 
> It was prototyped a few years ago and discussed at LSF/MM so it's in
> the back of your memory somewhere. It never got merged because it lacked
> users and I didn't think carrying dead untested code was appropriate.

And I guess didn't push hard enough and showed the use-case in code.
Thus, I will also take part of the blame for this stalling out.


> > > The API is not guaranteed to return the requested number of pages and
> > > may fail if the preferred allocation zone has limited free memory, the
> > > cpuset changes during the allocation or page debugging decides to fail
> > > an allocation. It's up to the caller to request more pages in batch
> > > if necessary.
> > > 
> > > Note that this implementation is not very efficient and could be improved
> > > but it would require refactoring. The intent is to make it available early
> > > to determine what semantics are required by different callers. Once the
> > > full semantics are nailed down, it can be refactored.
> > > 
> > > ...
> > >
> > > +/* Drop reference counts and free order-0 pages from a list. */
> > > +void free_pages_bulk(struct list_head *list)
> > > +{
> > > +	struct page *page, *next;
> > > +
> > > +	list_for_each_entry_safe(page, next, list, lru) {
> > > +		trace_mm_page_free_batched(page);
> > > +		if (put_page_testzero(page)) {
> > > +			list_del(&page->lru);
> > > +			__free_pages_ok(page, 0, FPI_NONE);
> > > +		}
> > > +	}
> > > +}
> > > +EXPORT_SYMBOL_GPL(free_pages_bulk);  
> > 
> > I expect that batching games are planned in here as well?
> >   
> 
> Potentially it could be done but the page allocator would need to be
> fundamentally aware of batching to make it tidy or the per-cpu allocator
> would need knowledge of how to handle batches in the free path.  Batch
> freeing to the buddy allocator is problematic as buddy merging has to
> happen. Batch freeing to per-cpu hits pcp->high limitations.
> 
> There are a couple of ways it *could* be done. Per-cpu lists could be
> allowed to temporarily exceed the high limits and reduce them out-of-band
> like what happens with counter updates or remote pcp freeing. Care
> would need to be taken when memory is low to avoid premature OOM
> and to guarantee draining happens in a timely fashion. There would be
> additional benefits to this. For example, release_pages() can hammer the
> zone lock when freeing very large batches and would benefit from either
> large batching or "plugging" the per-cpu list. I prototyped a series to
> allow the batch limits to be temporarily exceeded but it did not actually
> improve performance because of errors in the implementation and it needs
> a lot of work.
> 
> > >  static inline unsigned int
> > >  gfp_to_alloc_flags(gfp_t gfp_mask)
> > >  {
> > > @@ -4919,6 +4934,9 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
> > >  		struct alloc_context *ac, gfp_t *alloc_mask,
> > >  		unsigned int *alloc_flags)
> > >  {
> > > +	gfp_mask &= gfp_allowed_mask;
> > > +	*alloc_mask = gfp_mask;
> > > +
> > >  	ac->highest_zoneidx = gfp_zone(gfp_mask);
> > >  	ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
> > >  	ac->nodemask = nodemask;
> > > @@ -4960,6 +4978,99 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
> > >  	return true;
> > >  }
> > >  
> > > +/*
> > > + * This is a batched version of the page allocator that attempts to
> > > + * allocate nr_pages quickly from the preferred zone and add them to list.
> > > + */  
> > 
> > Documentation is rather lame.  Returns number of pages allocated...
> >   
> 
> I added a note on the return value. The documentation is lame because at
> this point, we do not know what the required semantics for future users
> are. We have two examples at the moment in this series but I think it
> would be better to add kerneldoc documentation when there is a reasonable
> expectation that the API will not change. For example, SLUB could use
> this API when it fails to allocate a high-order page and instead allocate
> batches of order-0 pages but I did not investigate how feasible that
> is. Similarly, it's possible that we really need to deal with high-order
> batch allocations in which case, the per-cpu list should be high-order
> aware or the core buddy allocator needs to be batch-allocation aware.
> 
> > > +int __alloc_pages_bulk_nodemask(gfp_t gfp_mask, int preferred_nid,
> > > +			nodemask_t *nodemask, int nr_pages,
> > > +			struct list_head *alloc_list)
> > > +{
> > > +	struct page *page;
> > > +	unsigned long flags;
> > > +	struct zone *zone;
> > > +	struct zoneref *z;
> > > +	struct per_cpu_pages *pcp;
> > > +	struct list_head *pcp_list;
> > > +	struct alloc_context ac;
> > > +	gfp_t alloc_mask;
> > > +	unsigned int alloc_flags;
> > > +	int alloced = 0;
> > > +
> > > +	if (nr_pages == 1)
> > > +		goto failed;
> > > +
> > > +	/* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
> > > +	if (!prepare_alloc_pages(gfp_mask, 0, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
> > > +		return 0;
> > > +	gfp_mask = alloc_mask;
> > > +
> > > +	/* Find an allowed local zone that meets the high watermark. */
> > > +	for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
> > > +		unsigned long mark;
> > > +
> > > +		if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
> > > +		    !__cpuset_zone_allowed(zone, gfp_mask)) {
> > > +			continue;
> > > +		}
> > > +
> > > +		if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone &&
> > > +		    zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) {
> > > +			goto failed;
> > > +		}
> > > +
> > > +		mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
> > > +		if (zone_watermark_fast(zone, 0,  mark,
> > > +				zonelist_zone_idx(ac.preferred_zoneref),
> > > +				alloc_flags, gfp_mask)) {
> > > +			break;
> > > +		}
> > > +	}  
> > 
> > I suspect the above was stolen from elsewhere and that some code
> > commonification is planned.
> >   
> 
> It's based on get_page_from_freelist. It would be messy to have them share
> common code at this point with a risk that the fast path for the common
> path (single page requests) would be impaired. The issue is that the
> fast path and slow paths have zonelist iteration, kswapd wakeup, cpuset
> enforcement and reclaim actions all mixed together at various different
> points. The locking is also mixed up with per-cpu list locking, statistic
> locking and buddy locking all having inappropriate overlaps (e.g. IRQ
> disabling protects per-cpu list locking, partially and unnecessarily
> protects statistics depending on architecture and overlaps with the
> IRQ-safe zone lock.
> 
> Ironing this out risks hurting the single page allocation path. It would
> need to be done incrementally with ultimately the core of the allocator
> dealing with batches to avoid false bisections.
> 
> >   
> > > +	if (!zone)
> > > +		return 0;
> > > +
> > > +	/* Attempt the batch allocation */
> > > +	local_irq_save(flags);
> > > +	pcp = &this_cpu_ptr(zone->pageset)->pcp;
> > > +	pcp_list = &pcp->lists[ac.migratetype];
> > > +
> > > +	while (alloced < nr_pages) {
> > > +		page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags,
> > > +								pcp, pcp_list);
> > > +		if (!page)
> > > +			break;
> > > +
> > > +		prep_new_page(page, 0, gfp_mask, 0);  
> > 
> > I wonder if it would be worth running prep_new_page() in a second pass,
> > after reenabling interrupts.
> >   
> 
> Possibly, I could add another patch on top that does this because it's
> trading the time that IRQs are disabled for a list iteration.

I for one like this idea, of moving prep_new_page() to a second pass.
As per below realtime concern, to reduce the time that IRQs are
disabled.

> > Speaking of which, will the realtime people get upset about the
> > irqs-off latency?  How many pages are we talking about here?
> >   

In my page_pool patch I'm bulk allocating 64 pages. I wanted to ask if
this is too much? (PP_ALLOC_CACHE_REFILL=64).

The mlx5 driver have a while loop for allocation 64 pages, which it
used in this case, that is why 64 is chosen.  If we choose a lower
bulk number, then the bulk-alloc will just be called more times.


> At the moment, it looks like batches of up to a few hundred at worst. I
> don't think realtime sensitive applications are likely to be using the
> bulk allocator API at this point.
> 
> The realtime people have a worse problem in that the per-cpu list does
> not use local_lock and disable IRQs more than it needs to on x86 in
> particular. I've a prototype series for this as well which splits the
> locking for the per-cpu list and statistic handling and then converts the
> per-cpu list to local_lock but I'm getting this off the table first because
> I don't want multiple page allocator series in flight at the same time.
> Thomas, Peter and Ingo would need to be cc'd on that series to review
> the local_lock aspects.
> 
> Even with local_lock, it's not clear to me why per-cpu lists need to be
> locked at all because potentially it could use a lock-free llist with some
> struct page overloading. That one is harder to predict when batches are
> taken into account as splicing a batch of free pages with llist would be
> unsafe so batch free might exchange IRQ disabling overhead with multiple
> atomics. I'd need to recheck things like whether NMI handlers ever call
> the page allocator (they shouldn't but it should be checked).  It would
> need a lot of review and testing.

The result of the API is to deliver pages as a double-linked list via
LRU (page->lru member).  If you are planning to use llist, then how to
handle this API change later?

Have you notice that the two users store the struct-page pointers in an
array?  We could have the caller provide the array to store struct-page
pointers, like we do with kmem_cache_alloc_bulk API.

You likely have good reasons for returning the pages as a list (via
lru), as I can see/imagine that there are some potential for grabbing
the entire PCP-list.

 
> > > +		list_add(&page->lru, alloc_list);
> > > +		alloced++;
> > > +	}
> > > +
> > > +	if (!alloced)
> > > +		goto failed_irq;
> > > +
> > > +	if (alloced) {
> > > +		__count_zid_vm_events(PGALLOC, zone_idx(zone),
> > > alloced);
> > > +		zone_statistics(zone, zone);
> > > +	}
> > > +
> > > +	local_irq_restore(flags);
> > > +
> > > +	return alloced;
> > > +
> > > +failed_irq:
> > > +	local_irq_restore(flags);
> > > +
> > > +failed:  
> > 
> > Might we need some counter to show how often this path happens?
> >   
> 
> I think that would be overkill at this point. It only gives useful
> information to a developer using the API for the first time and that
> can be done with a debugging patch (or probes if you're feeling
> creative). I'm already unhappy with the counter overhead in the page
> allocator. zone_statistics in particular has no business being an
> accurate statistic. It should have been a best-effort counter like
> vm_events that does not need IRQs to be disabled. If that was a
> simply counter as opposed to an accurate statistic then a failure
> counter at failed_irq would be very cheap to add.
Matthew Wilcox March 12, 2021, 12:43 p.m. UTC | #4
On Wed, Mar 10, 2021 at 10:46:15AM +0000, Mel Gorman wrote:
> +int __alloc_pages_bulk_nodemask(gfp_t gfp_mask, int preferred_nid,
> +				nodemask_t *nodemask, int nr_pages,
> +				struct list_head *list);

For the next revision, can you ditch the '_nodemask' part of the name?
Andrew just took this patch from me:

    mm/page_alloc: combine __alloc_pages and __alloc_pages_nodemask
    
    There are only two callers of __alloc_pages() so prune the thicket of
    alloc_page variants by combining the two functions together.  Current
    callers of __alloc_pages() simply add an extra 'NULL' parameter and
    current callers of __alloc_pages_nodemask() call __alloc_pages() instead.

...

-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
-                                                       nodemask_t *nodemask);
-
-static inline struct page *
-__alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid)
-{
-       return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL);
-}
+struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
+               nodemask_t *nodemask);

So calling this function __alloc_pages_bulk() fits with the new naming
scheme.

> @@ -4919,6 +4934,9 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
>  		struct alloc_context *ac, gfp_t *alloc_mask,
>  		unsigned int *alloc_flags)
>  {
> +	gfp_mask &= gfp_allowed_mask;
> +	*alloc_mask = gfp_mask;

Also I renamed alloc_mask to alloc_gfp.
Mel Gorman March 12, 2021, 1:44 p.m. UTC | #5
On Fri, Mar 12, 2021 at 12:46:09PM +0100, Jesper Dangaard Brouer wrote:
> > > > <SNIP>
> > > > +	if (!zone)
> > > > +		return 0;
> > > > +
> > > > +	/* Attempt the batch allocation */
> > > > +	local_irq_save(flags);
> > > > +	pcp = &this_cpu_ptr(zone->pageset)->pcp;
> > > > +	pcp_list = &pcp->lists[ac.migratetype];
> > > > +
> > > > +	while (alloced < nr_pages) {
> > > > +		page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags,
> > > > +								pcp, pcp_list);
> > > > +		if (!page)
> > > > +			break;
> > > > +
> > > > +		prep_new_page(page, 0, gfp_mask, 0);  
> > > 
> > > I wonder if it would be worth running prep_new_page() in a second pass,
> > > after reenabling interrupts.
> > >   
> > 
> > Possibly, I could add another patch on top that does this because it's
> > trading the time that IRQs are disabled for a list iteration.
> 
> I for one like this idea, of moving prep_new_page() to a second pass.
> As per below realtime concern, to reduce the time that IRQs are
> disabled.
> 

Already done.

> > > Speaking of which, will the realtime people get upset about the
> > > irqs-off latency?  How many pages are we talking about here?
> > >   
> 
> In my page_pool patch I'm bulk allocating 64 pages. I wanted to ask if
> this is too much? (PP_ALLOC_CACHE_REFILL=64).
> 

I expect no, it's not too much. The refill path should be short.

> > At the moment, it looks like batches of up to a few hundred at worst. I
> > don't think realtime sensitive applications are likely to be using the
> > bulk allocator API at this point.
> > 
> > The realtime people have a worse problem in that the per-cpu list does
> > not use local_lock and disable IRQs more than it needs to on x86 in
> > particular. I've a prototype series for this as well which splits the
> > locking for the per-cpu list and statistic handling and then converts the
> > per-cpu list to local_lock but I'm getting this off the table first because
> > I don't want multiple page allocator series in flight at the same time.
> > Thomas, Peter and Ingo would need to be cc'd on that series to review
> > the local_lock aspects.
> > 
> > Even with local_lock, it's not clear to me why per-cpu lists need to be
> > locked at all because potentially it could use a lock-free llist with some
> > struct page overloading. That one is harder to predict when batches are
> > taken into account as splicing a batch of free pages with llist would be
> > unsafe so batch free might exchange IRQ disabling overhead with multiple
> > atomics. I'd need to recheck things like whether NMI handlers ever call
> > the page allocator (they shouldn't but it should be checked).  It would
> > need a lot of review and testing.
> 
> The result of the API is to deliver pages as a double-linked list via
> LRU (page->lru member).  If you are planning to use llist, then how to
> handle this API change later?
> 

I would not have to. The per-cpu list internally can use llist internally
while pages returned to the bulk allocator user can still be a doubly
linked list. An llist_node fits in less space than the list_head lru.

> Have you notice that the two users store the struct-page pointers in an
> array?  We could have the caller provide the array to store struct-page
> pointers, like we do with kmem_cache_alloc_bulk API.
> 

That is a possibility but it ties the caller into declaring an array,
either via kmalloc, within an existing struct or on-stack. They would
then need to ensure that nr_pages does not exceed the array size or pass
in the array size. It's more error prone and a harder API to use.

> You likely have good reasons for returning the pages as a list (via
> lru), as I can see/imagine that there are some potential for grabbing
> the entire PCP-list.
> 

I used a list so that user was only required to define a list_head on
the stack to use the API.
Mel Gorman March 12, 2021, 2:15 p.m. UTC | #6
On Fri, Mar 12, 2021 at 12:43:31PM +0000, Matthew Wilcox wrote:
> On Wed, Mar 10, 2021 at 10:46:15AM +0000, Mel Gorman wrote:
> > +int __alloc_pages_bulk_nodemask(gfp_t gfp_mask, int preferred_nid,
> > +				nodemask_t *nodemask, int nr_pages,
> > +				struct list_head *list);
> 
> For the next revision, can you ditch the '_nodemask' part of the name?
> Andrew just took this patch from me:
> 

Ok, the first three patches are needed from that series. For convenience,
I'm going to post the same series with the rest of the patches as a
pre-requisite to avoid people having to take patches out of mmotm to test.
For review purposes, they can be ignored.

> > <SNIP>
> >
> > @@ -4919,6 +4934,9 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
> >  		struct alloc_context *ac, gfp_t *alloc_mask,
> >  		unsigned int *alloc_flags)
> >  {
> > +	gfp_mask &= gfp_allowed_mask;
> > +	*alloc_mask = gfp_mask;
> 
> Also I renamed alloc_mask to alloc_gfp.
> 

It then becomes obvious that prepare_alloc_pages does not share the same
naming convention as __alloc_pages(). In an effort to keep the naming
convention consistent, I updated the patch to also rename gfp_mask to
gfp in prepare_alloc_pages.

As a complete aside, I don't actually like the gfp name and would have
preferred gfp_flags because GFP is just an acronym and the context of the
variable is that it's a set of GFP Flags. The mask naming was wrong I admit
because it's not a mask but I'm not interested in naming the bike shed :)

Thanks for pointing this out early because it would have been a merge
headache!
Matthew Wilcox March 12, 2021, 2:58 p.m. UTC | #7
On Fri, Mar 12, 2021 at 12:46:09PM +0100, Jesper Dangaard Brouer wrote:
> In my page_pool patch I'm bulk allocating 64 pages. I wanted to ask if
> this is too much? (PP_ALLOC_CACHE_REFILL=64).
> 
> The mlx5 driver have a while loop for allocation 64 pages, which it
> used in this case, that is why 64 is chosen.  If we choose a lower
> bulk number, then the bulk-alloc will just be called more times.

The thing about batching is that smaller batches are often better.
Let's suppose you need to allocate 100 pages for something, and the page
allocator takes up 90% of your latency budget.  Batching just ten pages
at a time is going to reduce the overhead to 9%.  Going to 64 pages
reduces the overhead from 9% to 2% -- maybe that's important, but
possibly not.

> The result of the API is to deliver pages as a double-linked list via
> LRU (page->lru member).  If you are planning to use llist, then how to
> handle this API change later?
> 
> Have you notice that the two users store the struct-page pointers in an
> array?  We could have the caller provide the array to store struct-page
> pointers, like we do with kmem_cache_alloc_bulk API.

My preference would be for a pagevec.  That does limit you to 15 pages
per call [1], but I do think that might be enough.  And the overhead of
manipulating a linked list isn't free.

[1] patches exist to increase this, because it turns out that 15 may
not be enough for all systems!  but it would limit to 255 as an absolute
hard cap.
Mel Gorman March 12, 2021, 4:03 p.m. UTC | #8
On Fri, Mar 12, 2021 at 02:58:14PM +0000, Matthew Wilcox wrote:
> On Fri, Mar 12, 2021 at 12:46:09PM +0100, Jesper Dangaard Brouer wrote:
> > In my page_pool patch I'm bulk allocating 64 pages. I wanted to ask if
> > this is too much? (PP_ALLOC_CACHE_REFILL=64).
> > 
> > The mlx5 driver have a while loop for allocation 64 pages, which it
> > used in this case, that is why 64 is chosen.  If we choose a lower
> > bulk number, then the bulk-alloc will just be called more times.
> 
> The thing about batching is that smaller batches are often better.
> Let's suppose you need to allocate 100 pages for something, and the page
> allocator takes up 90% of your latency budget.  Batching just ten pages
> at a time is going to reduce the overhead to 9%.  Going to 64 pages
> reduces the overhead from 9% to 2% -- maybe that's important, but
> possibly not.
> 

I do not think that something like that can be properly accessed in
advance. It heavily depends on whether the caller is willing to amortise
the cost of the batch allocation or if the timing of the bulk request is
critical every single time.

> > The result of the API is to deliver pages as a double-linked list via
> > LRU (page->lru member).  If you are planning to use llist, then how to
> > handle this API change later?
> > 
> > Have you notice that the two users store the struct-page pointers in an
> > array?  We could have the caller provide the array to store struct-page
> > pointers, like we do with kmem_cache_alloc_bulk API.
> 
> My preference would be for a pagevec.  That does limit you to 15 pages
> per call [1], but I do think that might be enough.  And the overhead of
> manipulating a linked list isn't free.
> 

I'm opposed to a pagevec because it unnecessarily limits the caller.  The
sunrpc user for example knows how many pages it needs at the time the bulk
allocator is called but it's not the same value every time.  When tracing,
I found it sometimes requested 1 page (most common request actually) and
other times requested 200+ pages. Forcing it to call the batch allocator
in chunks of 15 means the caller incurs the cost of multiple allocation
requests which is almost as bad as calling __alloc_pages in a loop.

I think the first version should have an easy API to start with. Optimise
the implementation if it is a bottleneck. Only make the API harder to
use if the callers are really willing to always allocate and size the
array in advance and it's shown that it really makes a big difference
performance-wise.
Matthew Wilcox March 12, 2021, 9:08 p.m. UTC | #9
On Fri, Mar 12, 2021 at 04:03:50PM +0000, Mel Gorman wrote:
> On Fri, Mar 12, 2021 at 02:58:14PM +0000, Matthew Wilcox wrote:
> > On Fri, Mar 12, 2021 at 12:46:09PM +0100, Jesper Dangaard Brouer wrote:
> > > In my page_pool patch I'm bulk allocating 64 pages. I wanted to ask if
> > > this is too much? (PP_ALLOC_CACHE_REFILL=64).
> > > 
> > > The mlx5 driver have a while loop for allocation 64 pages, which it
> > > used in this case, that is why 64 is chosen.  If we choose a lower
> > > bulk number, then the bulk-alloc will just be called more times.
> > 
> > The thing about batching is that smaller batches are often better.
> > Let's suppose you need to allocate 100 pages for something, and the page
> > allocator takes up 90% of your latency budget.  Batching just ten pages
> > at a time is going to reduce the overhead to 9%.  Going to 64 pages
> > reduces the overhead from 9% to 2% -- maybe that's important, but
> > possibly not.
> > 
> 
> I do not think that something like that can be properly accessed in
> advance. It heavily depends on whether the caller is willing to amortise
> the cost of the batch allocation or if the timing of the bulk request is
> critical every single time.
> 
> > > The result of the API is to deliver pages as a double-linked list via
> > > LRU (page->lru member).  If you are planning to use llist, then how to
> > > handle this API change later?
> > > 
> > > Have you notice that the two users store the struct-page pointers in an
> > > array?  We could have the caller provide the array to store struct-page
> > > pointers, like we do with kmem_cache_alloc_bulk API.
> > 
> > My preference would be for a pagevec.  That does limit you to 15 pages
> > per call [1], but I do think that might be enough.  And the overhead of
> > manipulating a linked list isn't free.
> > 
> 
> I'm opposed to a pagevec because it unnecessarily limits the caller.  The
> sunrpc user for example knows how many pages it needs at the time the bulk
> allocator is called but it's not the same value every time.  When tracing,
> I found it sometimes requested 1 page (most common request actually) and
> other times requested 200+ pages. Forcing it to call the batch allocator
> in chunks of 15 means the caller incurs the cost of multiple allocation
> requests which is almost as bad as calling __alloc_pages in a loop.

Well, no.  It reduces the cost by a factor of 15 -- or by 93%.  200 is
an interesting example because putting 200 pages on a list costs 200 *
64 bytes of dirty cachelines, or 12KiB.  That's larger than some CPU L1
caches (mine's 48KB, 12-way set associative), but I think it's safe to say
some of those 200 cache lines are going to force others out into L2 cache.
Compared to a smaller batch of 15 pages in a pagevec, it'll dirty two cache
lines (admittedly the 15 struct pages are also going to get dirtied by being
allocated and then by being set up for whatever use they're getting, but
they should stay in L1 cache while that's happening).

I'm not claiming the pagevec is definitely a win, but it's very
unclear which tradeoff is actually going to lead to better performance.
Hopefully Jesper or Chuck can do some tests and figure out what actually
works better with their hardware & usage patterns.

> I think the first version should have an easy API to start with. Optimise
> the implementation if it is a bottleneck. Only make the API harder to
> use if the callers are really willing to always allocate and size the
> array in advance and it's shown that it really makes a big difference
> performance-wise.

I'm not entirely sure that a pagevec is harder to use than a list_head.
Mel Gorman March 13, 2021, 1:16 p.m. UTC | #10
On Fri, Mar 12, 2021 at 09:08:23PM +0000, Matthew Wilcox wrote:
> > > > The result of the API is to deliver pages as a double-linked list via
> > > > LRU (page->lru member).  If you are planning to use llist, then how to
> > > > handle this API change later?
> > > > 
> > > > Have you notice that the two users store the struct-page pointers in an
> > > > array?  We could have the caller provide the array to store struct-page
> > > > pointers, like we do with kmem_cache_alloc_bulk API.
> > > 
> > > My preference would be for a pagevec.  That does limit you to 15 pages
> > > per call [1], but I do think that might be enough.  And the overhead of
> > > manipulating a linked list isn't free.
> > > 
> > 
> > I'm opposed to a pagevec because it unnecessarily limits the caller.  The
> > sunrpc user for example knows how many pages it needs at the time the bulk
> > allocator is called but it's not the same value every time.  When tracing,
> > I found it sometimes requested 1 page (most common request actually) and
> > other times requested 200+ pages. Forcing it to call the batch allocator
> > in chunks of 15 means the caller incurs the cost of multiple allocation
> > requests which is almost as bad as calling __alloc_pages in a loop.
> 
> Well, no.  It reduces the cost by a factor of 15 -- or by 93%.  200 is
> an interesting example because putting 200 pages on a list costs 200 *
> 64 bytes of dirty cachelines, or 12KiB. 

That's a somewhat limited view. Yes, the overall cost gets reduced by
some factor but forcing the caller to limit the batch sizes incurs an
unnecessary cost. The SUNRPC user is particularly relevant as it cannot
make progress until it gets all the pages it requests -- it sleeps if
it cannot get the pages it needs. The whole point of the bulk allocator
is to avoid multiple round-trips through the page allocator. Forcing a
limit in the API requiring multiple round trips is just weird.

> That's larger than some CPU L1
> caches (mine's 48KB, 12-way set associative), but I think it's safe to say
> some of those 200 cache lines are going to force others out into L2 cache.
> Compared to a smaller batch of 15 pages in a pagevec, it'll dirty two cache
> lines (admittedly the 15 struct pages are also going to get dirtied by being
> allocated and then by being set up for whatever use they're getting, but
> they should stay in L1 cache while that's happening).
> 

The cache footprint is irrelevant if the caller *requires* the pages. If
the caller has to zero the pages then the cache gets thrashed anyway.
Even if non-temporal zeroing was used, the cache is likely thrashed by the
data copies. The page allocator in general is a cache nightmare because
of the number of cache lines it potentially dirties, particularly if it
has to call into the buddy allocator to split/merge pages for allocations
and frees respectively.

> I'm not claiming the pagevec is definitely a win, but it's very
> unclear which tradeoff is actually going to lead to better performance.
> Hopefully Jesper or Chuck can do some tests and figure out what actually
> works better with their hardware & usage patterns.
> 

The NFS user is often going to need to make round trips to get the pages it
needs. The pagevec would have to be copied into the target array meaning
it's not much better than a list manipulation.

Pagevecs are a bad interface in general simply because it puts hard
constraints on how many pages can be bulk allocatoed. Pagevecs are
primarily there to avoid excessive LRU lock acquisition and they are
bad at the job. These days, the LRU lock protects such a massive amount
of data that the pagevec is barely a band aid. Increasing its size just
shifts the problem slightly. I see very little value in introducing a
fundamental limitation into the bulk allocator by mandating pagevecs.

Now, I can see a case where the API moves to using arrays when there is a
user that is such a common hot path and using arrays that it is justified
but we're not there yet. The two callers are somewhat of corner cases and
both of them are limited by wire speed of networking. Not all users may
require arrays -- SLUB using batched order-0 pages on a high-allocation
failure for example would not need an array. Such an intensively hot user
does not currently exist so it's premature to even consider it.

> > I think the first version should have an easy API to start with. Optimise
> > the implementation if it is a bottleneck. Only make the API harder to
> > use if the callers are really willing to always allocate and size the
> > array in advance and it's shown that it really makes a big difference
> > performance-wise.
> 
> I'm not entirely sure that a pagevec is harder to use than a list_head.

Leaving aside the limitations of pagevecs, arrays get messy if the caller
does not necessarily use all the pages returned by the allocator. The
arrays would need to be tracked and/or preserved for some time. The
order pages are taken out of the array matters potentially. With lists,
the remaining pages can be easily spliced on a private cache or simply
handed back to the free API without having to track exactly how many
pages are on the array or where they are located. With arrays, the
elements have to be copied one at a time.

I think it's easier overall for the callers to deal with a list in
the initial implementation and only switch to arrays when there is an
extremely hot user that benefits heavily if pages are inserted directly
into an array.
Matthew Wilcox March 13, 2021, 4:39 p.m. UTC | #11
On Sat, Mar 13, 2021 at 01:16:48PM +0000, Mel Gorman wrote:
> > I'm not claiming the pagevec is definitely a win, but it's very
> > unclear which tradeoff is actually going to lead to better performance.
> > Hopefully Jesper or Chuck can do some tests and figure out what actually
> > works better with their hardware & usage patterns.
> 
> The NFS user is often going to need to make round trips to get the pages it
> needs. The pagevec would have to be copied into the target array meaning
> it's not much better than a list manipulation.

I don't think you fully realise how bad CPUs are at list manipulation.
See the attached program (and run it on your own hardware).  On my
less-than-a-year-old core-i7:

$ gcc -W -Wall -O2 -g array-vs-list.c -o array-vs-list
$ ./array-vs-list 
walked sequential array in 0.001765s
walked sequential list in 0.002920s
walked sequential array in 0.001777s
walked shuffled list in 0.081955s
walked shuffled array in 0.007367s

If you happen to get the objects in-order, it's only 64% worse to walk
a list as an array.  If they're out of order, it's *11.1* times as bad.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

unsigned long count = 1000 * 1000;
unsigned int verbose;

struct object {
	struct object *next;
	struct object *prev;
	unsigned int value;
};

#define printv(level, fmt, ...) \
	if (level <= verbose) { printf(fmt, ##__VA_ARGS__); }

void check_total(unsigned long total)
{
	if (total * 2 != count * (count + 1))
		printf("Check your staging! (%lu %lu)\n", total, count);
}

void alloc_objs(struct object **array)
{
	unsigned long i;

	for (i = 0; i < count; i++) {
		struct object *obj = malloc(sizeof(*obj));

		obj->value = i + 1;
		/* Add to the array */
		array[i] = obj;
	}
}

void shuffle(struct object **array, unsigned long seed)
{
	unsigned long i;

	printv(1, "random seed %lu\n", seed);
	srand48(seed);

	/* Shuffle the array */
	for (i = 1; i < count; i++) {
		struct object *obj;
		unsigned long j = (unsigned int)mrand48() % (i + 1);

		if (i == j)
			continue;
		obj = array[j];
		array[j] = array[i];
		array[i] = obj;
	}
}

void create_list(struct object **array, struct object *list)
{
	unsigned long i;

	list->next = list;
	list->prev = list;

	for (i = 0; i < count; i++) {
		struct object *obj = array[i];
		/* Add to the tail of the list */
		obj->next = list;
		obj->prev = list->prev;
		list->prev->next = obj;
		list->prev = obj;
	}
}

void walk_list(struct object *list)
{
	unsigned long total = 0;
	struct object *obj;

	for (obj = list->next; obj != list; obj = obj->next) {
		total += obj->value;
	}

	check_total(total);
}

void walk_array(struct object **array)
{
	unsigned long total = 0;
	unsigned long i;

	for (i = 0; i < count; i++) {
		total += array[i]->value;
	}

	check_total(total);
}

/* time2 - time1 */
double diff_time(struct timespec *time1, struct timespec *time2)
{
	double result = time2->tv_nsec - time1->tv_nsec;

	return time2->tv_sec - time1->tv_sec + result / 1000 / 1000 / 1000;
}

int main(int argc, char **argv)
{
	int opt;
	unsigned long seed = time(NULL);
	struct object **array;
	struct object list;
	struct timespec time1, time2;

	while ((opt = getopt(argc, argv, "c:s:v")) != -1) {
		if (opt == 'c')
			count *= strtoul(optarg, NULL, 0);
		else if (opt == 's')
			seed = strtoul(optarg, NULL, 0);
		else if (opt == 'v')
			verbose++;
	}

	clock_gettime(CLOCK_MONOTONIC, &time1);
	array = calloc(count, sizeof(void *));
	alloc_objs(array);
	clock_gettime(CLOCK_MONOTONIC, &time2);
	printv(1, "allocated %lu items in %fs\n", count,
			diff_time(&time1, &time2));

	clock_gettime(CLOCK_MONOTONIC, &time1);
	walk_array(array);
	clock_gettime(CLOCK_MONOTONIC, &time2);
	printf("walked sequential array in %fs\n",
			diff_time(&time1, &time2));

	clock_gettime(CLOCK_MONOTONIC, &time1);
	create_list(array, &list);
	clock_gettime(CLOCK_MONOTONIC, &time2);
	printv(1, "created list in %fs\n",
			diff_time(&time1, &time2));

	clock_gettime(CLOCK_MONOTONIC, &time1);
	walk_list(&list);
	clock_gettime(CLOCK_MONOTONIC, &time2);
	printf("walked sequential list in %fs\n",
			diff_time(&time1, &time2));

	clock_gettime(CLOCK_MONOTONIC, &time1);
	walk_array(array);
	clock_gettime(CLOCK_MONOTONIC, &time2);
	printf("walked sequential array in %fs\n",
			diff_time(&time1, &time2));

	clock_gettime(CLOCK_MONOTONIC, &time1);
	shuffle(array, seed);
	clock_gettime(CLOCK_MONOTONIC, &time2);
	printv(1, "shuffled array in %fs\n",
			diff_time(&time1, &time2));

	clock_gettime(CLOCK_MONOTONIC, &time1);
	create_list(array, &list);
	clock_gettime(CLOCK_MONOTONIC, &time2);
	printv(1, "created list in %fs\n",
			diff_time(&time1, &time2));

	clock_gettime(CLOCK_MONOTONIC, &time1);
	walk_list(&list);
	clock_gettime(CLOCK_MONOTONIC, &time2);
	printf("walked shuffled list in %fs\n",
			diff_time(&time1, &time2));

	clock_gettime(CLOCK_MONOTONIC, &time1);
	walk_array(array);
	clock_gettime(CLOCK_MONOTONIC, &time2);
	printf("walked shuffled array in %fs\n",
			diff_time(&time1, &time2));

	return 0;
}
Chuck Lever III March 13, 2021, 4:56 p.m. UTC | #12
> On Mar 13, 2021, at 11:39 AM, Matthew Wilcox <willy@infradead.org> wrote:
> 
> On Sat, Mar 13, 2021 at 01:16:48PM +0000, Mel Gorman wrote:
>>> I'm not claiming the pagevec is definitely a win, but it's very
>>> unclear which tradeoff is actually going to lead to better performance.
>>> Hopefully Jesper or Chuck can do some tests and figure out what actually
>>> works better with their hardware & usage patterns.
>> 
>> The NFS user is often going to need to make round trips to get the pages it
>> needs. The pagevec would have to be copied into the target array meaning
>> it's not much better than a list manipulation.
> 
> I don't think you fully realise how bad CPUs are at list manipulation.
> See the attached program (and run it on your own hardware).  On my
> less-than-a-year-old core-i7:
> 
> $ gcc -W -Wall -O2 -g array-vs-list.c -o array-vs-list
> $ ./array-vs-list 
> walked sequential array in 0.001765s
> walked sequential list in 0.002920s
> walked sequential array in 0.001777s
> walked shuffled list in 0.081955s
> walked shuffled array in 0.007367s
> 
> If you happen to get the objects in-order, it's only 64% worse to walk
> a list as an array.  If they're out of order, it's *11.1* times as bad.
> <array-vs-list.c>

IME lists are indeed less CPU-efficient, but I wonder if that
expense is insignificant compared to serialization primitives like
disabling and re-enabling IRQs, which we are avoiding by using
bulk page allocation.

My initial experience with the current interface left me feeling
uneasy about re-using the lru list field. That seems to expose an
internal API feature to consumers of the page allocator. If we
continue with a list-centric bulk allocator API I hope there can
be some conveniently-placed documentation that explains when it is
safe to use that field. Or perhaps the field should be renamed.

I have a mild preference for an array-style interface because that's
more natural for the NFSD consumer, but I'm happy to have a bulk
allocator either way. Purely from a code-reuse point of view, I
wonder how many consumers of alloc_pages_bulk() will be like
svc_alloc_arg(), where they need to fill in pages in an array. Each
such consumer would need to repeat the logic to convert the returned
list into an array. We have, for instance, release_pages(), which is
an array-centric page allocator API. Maybe a helper function or two
might prevent duplication of the list conversion logic.

And I agree with Mel that passing a single large array seems more
useful then having to build code at each consumer call-site to
iterate over smaller page_vecs until that array is filled.


--
Chuck Lever
Matthew Wilcox March 13, 2021, 7:33 p.m. UTC | #13
On Sat, Mar 13, 2021 at 04:56:31PM +0000, Chuck Lever III wrote:
> IME lists are indeed less CPU-efficient, but I wonder if that
> expense is insignificant compared to serialization primitives like
> disabling and re-enabling IRQs, which we are avoiding by using
> bulk page allocation.

Cache misses are a worse problem than serialisation.  Paul McKenney had
a neat demonstration where he took a sheet of toilet paper to represent
an instruction, and then unrolled two rolls of toilet paper around the
lecture theatre to represent an L3 cache miss.  Obviously a serialising
instruction is worse than an add instruction, but i'm thinking maybe
50-100 sheets of paper, not an entire roll?

Anyway, I'm not arguing against a bulk allocator, nor even saying this
is a bad interface.  It just maybe could be better.

> My initial experience with the current interface left me feeling
> uneasy about re-using the lru list field. That seems to expose an
> internal API feature to consumers of the page allocator. If we
> continue with a list-centric bulk allocator API I hope there can
> be some conveniently-placed documentation that explains when it is
> safe to use that field. Or perhaps the field should be renamed.

Heh.  Spoken like a filesystem developer who's never been exposed to the
->readpages API (it's almost dead).  It's fairly common in the memory
management world to string pages together through the lru list_head.
Slab does it, as does put_pages_list() in mm/swap.c.  It's natural for
Mel to keep using this pattern ... and I dislike it intensely.

> I have a mild preference for an array-style interface because that's
> more natural for the NFSD consumer, but I'm happy to have a bulk
> allocator either way. Purely from a code-reuse point of view, I
> wonder how many consumers of alloc_pages_bulk() will be like
> svc_alloc_arg(), where they need to fill in pages in an array. Each
> such consumer would need to repeat the logic to convert the returned
> list into an array. We have, for instance, release_pages(), which is
> an array-centric page allocator API. Maybe a helper function or two
> might prevent duplication of the list conversion logic.
> 
> And I agree with Mel that passing a single large array seems more
> useful then having to build code at each consumer call-site to
> iterate over smaller page_vecs until that array is filled.

So how about this?

You provide the interface you'd _actually_ like to use (array-based) and
implement it on top of Mel's lru-list implementation.  If it's general
enough to be used by Jesper's use-case, we lift it to page_alloc.c.
If we go a year and there are no users of the lru-list interface, we
can just change the implementation.
Mel Gorman March 14, 2021, 12:52 p.m. UTC | #14
On Sat, Mar 13, 2021 at 07:33:43PM +0000, Matthew Wilcox wrote:
> On Sat, Mar 13, 2021 at 04:56:31PM +0000, Chuck Lever III wrote:
> > IME lists are indeed less CPU-efficient, but I wonder if that
> > expense is insignificant compared to serialization primitives like
> > disabling and re-enabling IRQs, which we are avoiding by using
> > bulk page allocation.
> 
> Cache misses are a worse problem than serialisation.  Paul McKenney had
> a neat demonstration where he took a sheet of toilet paper to represent
> an instruction, and then unrolled two rolls of toilet paper around the
> lecture theatre to represent an L3 cache miss.  Obviously a serialising
> instruction is worse than an add instruction, but i'm thinking maybe
> 50-100 sheets of paper, not an entire roll?
> 

I'm well array of the advantages of arrays over lists. The reality is that
the penalty is incurred unconditionally as the pages have to be removed
from the per-cpu or buddy lists and the cache footprint of the allocator
and the data copies are already large. It's also the case that bulk free
interfaces already exist that operate on lists (free_unref_page_list)
so there is existing precedent. The bulk free API in this series was not
used by the callers so I've deleted it.

Obviously the callers would need to be adjusted to use the array
interface. The sunrpc user has an array but it is coded in a way that
expects the array could be partially populated or has holes so the API has
to skip populated elements. The caller is responsible for making sure that
there are enough NULL elements available to store nr_pages or the buffer
overruns. nr_elements could be passed in to avoid the buffer overrun but
then further logic is needed to distinguish between a failed allocation
and a failure to have enough space in the array to store the pointer.
It also means that prep_new_page() should not be deferred outside of
the IRQ disabled section as it does not have the storage to track which
pages were freshly allocated and which ones were already on the array. It
could be tracked using the lower bit of the pointer but that is not free
either. Ideally the callers simply would ensure the array does not have
valid struct page pointers in it already so prepping the new page could
always be deferred.  Obviously the callers are also responsible for
ensuring protecting the array from parallel access if necessary while
calling into the allocator.

> Anyway, I'm not arguing against a bulk allocator, nor even saying this
> is a bad interface.  It just maybe could be better.
> 

I think it puts more responsibility on the caller to use the API correctly
but I also see no value in arguing about it further because there is no
supporting data either way (I don't have routine access to a sufficiently
fast network to generate the data). I can add the following patch and let
callers figure out which interface is preferred. If one of the interfaces
is dead in a year, it can be removed.

As there are a couple of ways the arrays could be used, I'm leaving it
up to Jesper and Chuck which interface they want to use. In particular,
it would be preferred if the array has no valid struct pages in it but
it's up to them to judge how practical that is.

Patch is only lightly tested with a poor conversion of the sunrpc code
to use the array interface.

---8<---
mm/page_alloc: Add an array-based interface to the bulk page allocator

The existing callers for the bulk allocator are storing the pages in
arrays. This patch adds an array-based interface to the API to avoid
multiple list iterations. The page list interface is preserved to
avoid requiring all users of the bulk API to allocate and manage
enough storage to store the pages.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 4a304fd39916..fb6234e1fe59 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -520,13 +520,20 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
 
 int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 				nodemask_t *nodemask, int nr_pages,
-				struct list_head *list);
+				struct list_head *page_list,
+				struct page **page_array);
 
 /* Bulk allocate order-0 pages */
 static inline unsigned long
-alloc_pages_bulk(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
+alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
 {
-	return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list);
+	return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list, NULL);
+}
+
+static inline unsigned long
+alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page **page_array)
+{
+	return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, page_array);
 }
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3e0c87c588d3..96590f0726c7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4965,13 +4965,20 @@ static inline bool prepare_alloc_pages(gfp_t gfp, unsigned int order,
 
 /*
  * This is a batched version of the page allocator that attempts to
- * allocate nr_pages quickly from the preferred zone and add them to list.
+ * allocate nr_pages quickly from the preferred zone. Pages are added
+ * to page_list if page_list is not NULL, otherwise it is assumed
+ * that the page_array is valid.
+ *
+ * If using page_array, only NULL elements are populated with pages.
+ * The caller must ensure that the array has enough NULL elements
+ * to store nr_pages or the buffer overruns.
  *
  * Returns the number of pages allocated.
  */
 int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 			nodemask_t *nodemask, int nr_pages,
-			struct list_head *alloc_list)
+			struct list_head *page_list,
+			struct page **page_array)
 {
 	struct page *page;
 	unsigned long flags;
@@ -4987,6 +4994,9 @@ int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 	if (WARN_ON_ONCE(nr_pages <= 0))
 		return 0;
 
+	if (WARN_ON_ONCE(!page_list && !page_array))
+		return 0;
+
 	if (nr_pages == 1)
 		goto failed;
 
@@ -5035,7 +5045,24 @@ int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 			break;
 		}
 
-		list_add(&page->lru, alloc_list);
+		if (page_list) {
+			/* New page prep is deferred */
+			list_add(&page->lru, page_list);
+		} else {
+			/* Skip populated elements */
+			while (*page_array)
+				page_array++;
+
+			/*
+			 * Array pages must be prepped immediately to
+			 * avoid tracking which pages are new and
+			 * which ones were already on the array.
+			 */
+			prep_new_page(page, 0, gfp, 0);
+			*page_array = page;
+			page_array++;
+		}
+
 		allocated++;
 	}
 
@@ -5044,9 +5071,12 @@ int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 
 	local_irq_restore(flags);
 
-	/* Prep page with IRQs enabled to reduce disabled times */
-	list_for_each_entry(page, alloc_list, lru)
-		prep_new_page(page, 0, gfp, 0);
+	/* Prep pages with IRQs enabled if using a list */
+	if (page_list) {
+		list_for_each_entry(page, page_list, lru) {
+			prep_new_page(page, 0, gfp, 0);
+		}
+	}
 
 	return allocated;
 
@@ -5056,7 +5086,10 @@ int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 failed:
 	page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
 	if (page) {
-		list_add(&page->lru, alloc_list);
+		if (page_list)
+			list_add(&page->lru, page_list);
+		else
+			*page_array = page;
 		allocated = 1;
 	}
Chuck Lever III March 14, 2021, 3:22 p.m. UTC | #15
> On Mar 14, 2021, at 8:52 AM, Mel Gorman <mgorman@techsingularity.net> wrote:
> 
> On Sat, Mar 13, 2021 at 07:33:43PM +0000, Matthew Wilcox wrote:
>> On Sat, Mar 13, 2021 at 04:56:31PM +0000, Chuck Lever III wrote:
>>> IME lists are indeed less CPU-efficient, but I wonder if that
>>> expense is insignificant compared to serialization primitives like
>>> disabling and re-enabling IRQs, which we are avoiding by using
>>> bulk page allocation.
>> 
>> Cache misses are a worse problem than serialisation.  Paul McKenney had
>> a neat demonstration where he took a sheet of toilet paper to represent
>> an instruction, and then unrolled two rolls of toilet paper around the
>> lecture theatre to represent an L3 cache miss.  Obviously a serialising
>> instruction is worse than an add instruction, but i'm thinking maybe
>> 50-100 sheets of paper, not an entire roll?
>> 
> 
> I'm well array of the advantages of arrays over lists. The reality is that
> the penalty is incurred unconditionally as the pages have to be removed
> from the per-cpu or buddy lists and the cache footprint of the allocator
> and the data copies are already large. It's also the case that bulk free
> interfaces already exist that operate on lists (free_unref_page_list)
> so there is existing precedent. The bulk free API in this series was not
> used by the callers so I've deleted it.
> 
> Obviously the callers would need to be adjusted to use the array
> interface. The sunrpc user has an array but it is coded in a way that
> expects the array could be partially populated or has holes so the API has
> to skip populated elements. The caller is responsible for making sure that
> there are enough NULL elements available to store nr_pages or the buffer
> overruns. nr_elements could be passed in to avoid the buffer overrun but
> then further logic is needed to distinguish between a failed allocation
> and a failure to have enough space in the array to store the pointer.
> It also means that prep_new_page() should not be deferred outside of
> the IRQ disabled section as it does not have the storage to track which
> pages were freshly allocated and which ones were already on the array. It
> could be tracked using the lower bit of the pointer but that is not free
> either. Ideally the callers simply would ensure the array does not have
> valid struct page pointers in it already so prepping the new page could
> always be deferred.  Obviously the callers are also responsible for
> ensuring protecting the array from parallel access if necessary while
> calling into the allocator.
> 
>> Anyway, I'm not arguing against a bulk allocator, nor even saying this
>> is a bad interface.  It just maybe could be better.
>> 
> 
> I think it puts more responsibility on the caller to use the API correctly
> but I also see no value in arguing about it further because there is no
> supporting data either way (I don't have routine access to a sufficiently
> fast network to generate the data). I can add the following patch and let
> callers figure out which interface is preferred. If one of the interfaces
> is dead in a year, it can be removed.
> 
> As there are a couple of ways the arrays could be used, I'm leaving it
> up to Jesper and Chuck which interface they want to use. In particular,
> it would be preferred if the array has no valid struct pages in it but
> it's up to them to judge how practical that is.

I'm interested to hear from Jesper.

My two cents (US):

If svc_alloc_arg() is the /only/ consumer that wants to fill
a partially populated array of page pointers, then there's no
code-duplication benefit to changing the synopsis of
alloc_pages_bulk() at this point.

Also, if the consumers still have to pass in the number of
pages the array needs, rather than having the bulk allocator
figure it out, then there's not much additional benefit, IMO.

Ideally (for SUNRPC) alloc_pages_bulk() would take a pointer
to a sparsely-populated array and the total number of elements
in that array, and fill in the NULL elements. The return value
would be "success -- all elements are populated" or "failure --
some elements remain NULL".

But again, if no other consumer finds that useful, or that API
design obscures the performance benefits of the bulk allocator,
I can be very happy with the list-centric API. My interest in
this part of the exercise is simply to reduce the overall amount
of new complexity across mm/ and consumers of the bulk allocator.


> Patch is only lightly tested with a poor conversion of the sunrpc code
> to use the array interface.
> 
> ---8<---
> mm/page_alloc: Add an array-based interface to the bulk page allocator
> 
> The existing callers for the bulk allocator are storing the pages in
> arrays. This patch adds an array-based interface to the API to avoid
> multiple list iterations. The page list interface is preserved to
> avoid requiring all users of the bulk API to allocate and manage
> enough storage to store the pages.
> 
> Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
> 
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 4a304fd39916..fb6234e1fe59 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -520,13 +520,20 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
> 
> int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
> 				nodemask_t *nodemask, int nr_pages,
> -				struct list_head *list);
> +				struct list_head *page_list,
> +				struct page **page_array);
> 
> /* Bulk allocate order-0 pages */
> static inline unsigned long
> -alloc_pages_bulk(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
> +alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
> {
> -	return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list);
> +	return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list, NULL);
> +}
> +
> +static inline unsigned long
> +alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page **page_array)
> +{
> +	return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, page_array);
> }
> 
> /*
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 3e0c87c588d3..96590f0726c7 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -4965,13 +4965,20 @@ static inline bool prepare_alloc_pages(gfp_t gfp, unsigned int order,
> 
> /*
>  * This is a batched version of the page allocator that attempts to
> - * allocate nr_pages quickly from the preferred zone and add them to list.
> + * allocate nr_pages quickly from the preferred zone. Pages are added
> + * to page_list if page_list is not NULL, otherwise it is assumed
> + * that the page_array is valid.
> + *
> + * If using page_array, only NULL elements are populated with pages.
> + * The caller must ensure that the array has enough NULL elements
> + * to store nr_pages or the buffer overruns.
>  *
>  * Returns the number of pages allocated.
>  */
> int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
> 			nodemask_t *nodemask, int nr_pages,
> -			struct list_head *alloc_list)
> +			struct list_head *page_list,
> +			struct page **page_array)
> {
> 	struct page *page;
> 	unsigned long flags;
> @@ -4987,6 +4994,9 @@ int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
> 	if (WARN_ON_ONCE(nr_pages <= 0))
> 		return 0;
> 
> +	if (WARN_ON_ONCE(!page_list && !page_array))
> +		return 0;
> +
> 	if (nr_pages == 1)
> 		goto failed;
> 
> @@ -5035,7 +5045,24 @@ int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
> 			break;
> 		}
> 
> -		list_add(&page->lru, alloc_list);
> +		if (page_list) {
> +			/* New page prep is deferred */
> +			list_add(&page->lru, page_list);
> +		} else {
> +			/* Skip populated elements */
> +			while (*page_array)
> +				page_array++;
> +
> +			/*
> +			 * Array pages must be prepped immediately to
> +			 * avoid tracking which pages are new and
> +			 * which ones were already on the array.
> +			 */
> +			prep_new_page(page, 0, gfp, 0);
> +			*page_array = page;
> +			page_array++;
> +		}
> +
> 		allocated++;
> 	}
> 
> @@ -5044,9 +5071,12 @@ int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
> 
> 	local_irq_restore(flags);
> 
> -	/* Prep page with IRQs enabled to reduce disabled times */
> -	list_for_each_entry(page, alloc_list, lru)
> -		prep_new_page(page, 0, gfp, 0);
> +	/* Prep pages with IRQs enabled if using a list */
> +	if (page_list) {
> +		list_for_each_entry(page, page_list, lru) {
> +			prep_new_page(page, 0, gfp, 0);
> +		}
> +	}
> 
> 	return allocated;
> 
> @@ -5056,7 +5086,10 @@ int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
> failed:
> 	page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
> 	if (page) {
> -		list_add(&page->lru, alloc_list);
> +		if (page_list)
> +			list_add(&page->lru, page_list);
> +		else
> +			*page_array = page;
> 		allocated = 1;
> 	}
> 

--
Chuck Lever
Mel Gorman March 15, 2021, 10:42 a.m. UTC | #16
On Sun, Mar 14, 2021 at 03:22:02PM +0000, Chuck Lever III wrote:
> >> Anyway, I'm not arguing against a bulk allocator, nor even saying this
> >> is a bad interface.  It just maybe could be better.
> >> 
> > 
> > I think it puts more responsibility on the caller to use the API correctly
> > but I also see no value in arguing about it further because there is no
> > supporting data either way (I don't have routine access to a sufficiently
> > fast network to generate the data). I can add the following patch and let
> > callers figure out which interface is preferred. If one of the interfaces
> > is dead in a year, it can be removed.
> > 
> > As there are a couple of ways the arrays could be used, I'm leaving it
> > up to Jesper and Chuck which interface they want to use. In particular,
> > it would be preferred if the array has no valid struct pages in it but
> > it's up to them to judge how practical that is.
> 
> I'm interested to hear from Jesper.
> 
> My two cents (US):
> 
> If svc_alloc_arg() is the /only/ consumer that wants to fill
> a partially populated array of page pointers, then there's no
> code-duplication benefit to changing the synopsis of
> alloc_pages_bulk() at this point.
> 
> Also, if the consumers still have to pass in the number of
> pages the array needs, rather than having the bulk allocator
> figure it out, then there's not much additional benefit, IMO.
> 
> Ideally (for SUNRPC) alloc_pages_bulk() would take a pointer
> to a sparsely-populated array and the total number of elements
> in that array, and fill in the NULL elements. The return value
> would be "success -- all elements are populated" or "failure --
> some elements remain NULL".
> 

If the array API interface was expected to handle sparse arrays, it would
make sense to define nr_pages are the number of pages that need to be
in the array instead of the number of pages to allocate. The preamble
would skip the first N number of allocated pages and decrement nr_pages
accordingly before the watermark check. The return value would then be the
last populated array element and the caller decides if that is enough to
proceed or if the API needs to be called again. There is a slight risk
that with a spare array that only needed 1 page in reality would fail
the watermark check but on low memory, allocations take more work anyway.
That definition of nr_pages would avoid the potential buffer overrun but
both you and Jesper would need to agree that it's an appropriate API.
Jesper Dangaard Brouer March 15, 2021, 4:42 p.m. UTC | #17
On Mon, 15 Mar 2021 10:42:05 +0000
Mel Gorman <mgorman@techsingularity.net> wrote:

> On Sun, Mar 14, 2021 at 03:22:02PM +0000, Chuck Lever III wrote:
> > >> Anyway, I'm not arguing against a bulk allocator, nor even saying this
> > >> is a bad interface.  It just maybe could be better.
> > >>   
> > > 
> > > I think it puts more responsibility on the caller to use the API correctly
> > > but I also see no value in arguing about it further because there is no
> > > supporting data either way (I don't have routine access to a sufficiently
> > > fast network to generate the data). I can add the following patch and let
> > > callers figure out which interface is preferred. If one of the interfaces
> > > is dead in a year, it can be removed.
> > > 
> > > As there are a couple of ways the arrays could be used, I'm leaving it
> > > up to Jesper and Chuck which interface they want to use. In particular,
> > > it would be preferred if the array has no valid struct pages in it but
> > > it's up to them to judge how practical that is.  
> > 
> > I'm interested to hear from Jesper.
> > 
> > My two cents (US):
> > 
> > If svc_alloc_arg() is the /only/ consumer that wants to fill
> > a partially populated array of page pointers, then there's no
> > code-duplication benefit to changing the synopsis of
> > alloc_pages_bulk() at this point.
> > 
> > Also, if the consumers still have to pass in the number of
> > pages the array needs, rather than having the bulk allocator
> > figure it out, then there's not much additional benefit, IMO.
> > 
> > Ideally (for SUNRPC) alloc_pages_bulk() would take a pointer
> > to a sparsely-populated array and the total number of elements
> > in that array, and fill in the NULL elements. The return value
> > would be "success -- all elements are populated" or "failure --
> > some elements remain NULL".
> >   
> 
> If the array API interface was expected to handle sparse arrays, it would
> make sense to define nr_pages are the number of pages that need to be
> in the array instead of the number of pages to allocate. The preamble
> would skip the first N number of allocated pages and decrement nr_pages
> accordingly before the watermark check. The return value would then be the
> last populated array element and the caller decides if that is enough to
> proceed or if the API needs to be called again. There is a slight risk
> that with a spare array that only needed 1 page in reality would fail
> the watermark check but on low memory, allocations take more work anyway.
> That definition of nr_pages would avoid the potential buffer overrun but
> both you and Jesper would need to agree that it's an appropriate API.

I actually like the idea of doing it this way.  Even-though the
page_pool fast-path (__page_pool_get_cached()) doesn't clear/mark the
"consumed" elements with NULL.  I'm ready to change page_pool to handle
this when calling this API, as I think it will be faster than walking
the linked list.

Even-though my page_pool use-case doesn't have a sparse array to
populate (like NFS/SUNRPC) then I can still use this API that Chuck is
suggesting. Thus, I'm fine with this :-)


(p.s. working on implementing Alexander Duyck's suggestions, but I
don't have it ready yet, I will try to send new patch tomorrow. And I
do realize that with this API change I have to reimplement it again,
but as long as we make forward progress then I'll happily do it).
Jesper Dangaard Brouer March 19, 2021, 5:10 p.m. UTC | #18
On Sun, 14 Mar 2021 12:52:32 +0000
Mel Gorman <mgorman@techsingularity.net> wrote:

> mm/page_alloc: Add an array-based interface to the bulk page allocator
> 
> The existing callers for the bulk allocator are storing the pages in
> arrays. This patch adds an array-based interface to the API to avoid
> multiple list iterations. The page list interface is preserved to
> avoid requiring all users of the bulk API to allocate and manage
> enough storage to store the pages.

I'm testing this patch, see results below and in commit[1].  The array
variant is clearly faster in these micro-benchmarks.
(Some comment inlined about code)

The change to my page_bench04_bulk is here[1]:
 [1] https://github.com/netoptimizer/prototype-kernel/commit/4c41fe0d4107f514

Notice these "per elem" measurements are alloc+free cost for order-0 pages.

BASELINE
 single_page alloc+put: 207 cycles(tsc) 57.773 ns

LIST variant: time_bulk_page_alloc_free_list: step=bulk size

 Per elem: 294 cycles(tsc) 81.866 ns (step:1)
 Per elem: 214 cycles(tsc) 59.477 ns (step:2)
 Per elem: 199 cycles(tsc) 55.504 ns (step:3)
 Per elem: 192 cycles(tsc) 53.489 ns (step:4)
 Per elem: 188 cycles(tsc) 52.456 ns (step:8)
 Per elem: 184 cycles(tsc) 51.346 ns (step:16)
 Per elem: 183 cycles(tsc) 50.883 ns (step:32)
 Per elem: 184 cycles(tsc) 51.236 ns (step:64)
 Per elem: 189 cycles(tsc) 52.620 ns (step:128)

ARRAY variant: time_bulk_page_alloc_free_array: step=bulk size

 Per elem: 195 cycles(tsc) 54.174 ns (step:1)
 Per elem: 123 cycles(tsc) 34.224 ns (step:2)
 Per elem: 113 cycles(tsc) 31.430 ns (step:3)
 Per elem: 108 cycles(tsc) 30.003 ns (step:4)
 Per elem: 102 cycles(tsc) 28.417 ns (step:8)
 Per elem:  98 cycles(tsc) 27.475 ns (step:16)
 Per elem:  96 cycles(tsc) 26.901 ns (step:32)
 Per elem:  95 cycles(tsc) 26.487 ns (step:64)
 Per elem:  94 cycles(tsc) 26.170 ns (step:128)

The array variant is clearly faster.

It it worth mentioning that bulk=1 result in fallback to normal
single page allocation via __alloc_pages().


> Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
> 
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 4a304fd39916..fb6234e1fe59 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -520,13 +520,20 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
>  
>  int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
>  				nodemask_t *nodemask, int nr_pages,
> -				struct list_head *list);
> +				struct list_head *page_list,
> +				struct page **page_array);
>  
>  /* Bulk allocate order-0 pages */
>  static inline unsigned long
> -alloc_pages_bulk(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
> +alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
>  {
> -	return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list);
> +	return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list, NULL);
> +}
> +
> +static inline unsigned long
> +alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page **page_array)
> +{
> +	return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, page_array);
>  }
>  
>  /*
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 3e0c87c588d3..96590f0726c7 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -4965,13 +4965,20 @@ static inline bool prepare_alloc_pages(gfp_t gfp, unsigned int order,
>  
>  /*
>   * This is a batched version of the page allocator that attempts to
> - * allocate nr_pages quickly from the preferred zone and add them to list.
> + * allocate nr_pages quickly from the preferred zone. Pages are added
> + * to page_list if page_list is not NULL, otherwise it is assumed
> + * that the page_array is valid.
> + *
> + * If using page_array, only NULL elements are populated with pages.
> + * The caller must ensure that the array has enough NULL elements
> + * to store nr_pages or the buffer overruns.
>   *
>   * Returns the number of pages allocated.
>   */
>  int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
>  			nodemask_t *nodemask, int nr_pages,
> -			struct list_head *alloc_list)
> +			struct list_head *page_list,
> +			struct page **page_array)
>  {
>  	struct page *page;
>  	unsigned long flags;
> @@ -4987,6 +4994,9 @@ int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
>  	if (WARN_ON_ONCE(nr_pages <= 0))
>  		return 0;
>  
> +	if (WARN_ON_ONCE(!page_list && !page_array))
> +		return 0;
> +
>  	if (nr_pages == 1)
>  		goto failed;
>  
> @@ -5035,7 +5045,24 @@ int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
>  			break;
>  		}
>  
> -		list_add(&page->lru, alloc_list);
> +		if (page_list) {
> +			/* New page prep is deferred */
> +			list_add(&page->lru, page_list);
> +		} else {
> +			/* Skip populated elements */
> +			while (*page_array)
> +				page_array++;

I don't like this approach as it is a dangerous construct, that can run
wild through the memory.  I have coded up another approach where I have
an nr_avail counter instead, that will "include" and count existing
elements in the array.

> +
> +			/*
> +			 * Array pages must be prepped immediately to
> +			 * avoid tracking which pages are new and
> +			 * which ones were already on the array.
> +			 */
> +			prep_new_page(page, 0, gfp, 0);
> +			*page_array = page;
> +			page_array++;
> +		}
> +
>  		allocated++;
>  	}
>  
> @@ -5044,9 +5071,12 @@ int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
>  
>  	local_irq_restore(flags);
>  
> -	/* Prep page with IRQs enabled to reduce disabled times */
> -	list_for_each_entry(page, alloc_list, lru)
> -		prep_new_page(page, 0, gfp, 0);
> +	/* Prep pages with IRQs enabled if using a list */
> +	if (page_list) {
> +		list_for_each_entry(page, page_list, lru) {
> +			prep_new_page(page, 0, gfp, 0);
> +		}
> +	}
>  
>  	return allocated;
>  
> @@ -5056,7 +5086,10 @@ int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
>  failed:
>  	page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
>  	if (page) {
> -		list_add(&page->lru, alloc_list);
> +		if (page_list)
> +			list_add(&page->lru, page_list);
> +		else
> +			*page_array = page;
>  		allocated = 1;
>  	}
>
diff mbox series

Patch

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 8572a1474e16..4903d1cc48dc 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -515,6 +515,10 @@  static inline int arch_make_page_accessible(struct page *page)
 }
 #endif
 
+int __alloc_pages_bulk_nodemask(gfp_t gfp_mask, int preferred_nid,
+				nodemask_t *nodemask, int nr_pages,
+				struct list_head *list);
+
 struct page *
 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
 							nodemask_t *nodemask);
@@ -525,6 +529,14 @@  __alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid)
 	return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL);
 }
 
+/* Bulk allocate order-0 pages */
+static inline unsigned long
+alloc_pages_bulk(gfp_t gfp_mask, unsigned long nr_pages, struct list_head *list)
+{
+	return __alloc_pages_bulk_nodemask(gfp_mask, numa_mem_id(), NULL,
+							nr_pages, list);
+}
+
 /*
  * Allocate pages, preferring the node given as nid. The node must be valid and
  * online. For more general interface, see alloc_pages_node().
@@ -594,6 +606,7 @@  void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
 
 extern void __free_pages(struct page *page, unsigned int order);
 extern void free_pages(unsigned long addr, unsigned int order);
+extern void free_pages_bulk(struct list_head *list);
 
 struct page_frag_cache;
 extern void __page_frag_cache_drain(struct page *page, unsigned int count);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3e4b29ee2b1e..ff1e55793786 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4436,6 +4436,21 @@  static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
 	}
 }
 
+/* Drop reference counts and free order-0 pages from a list. */
+void free_pages_bulk(struct list_head *list)
+{
+	struct page *page, *next;
+
+	list_for_each_entry_safe(page, next, list, lru) {
+		trace_mm_page_free_batched(page);
+		if (put_page_testzero(page)) {
+			list_del(&page->lru);
+			__free_pages_ok(page, 0, FPI_NONE);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(free_pages_bulk);
+
 static inline unsigned int
 gfp_to_alloc_flags(gfp_t gfp_mask)
 {
@@ -4919,6 +4934,9 @@  static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
 		struct alloc_context *ac, gfp_t *alloc_mask,
 		unsigned int *alloc_flags)
 {
+	gfp_mask &= gfp_allowed_mask;
+	*alloc_mask = gfp_mask;
+
 	ac->highest_zoneidx = gfp_zone(gfp_mask);
 	ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
 	ac->nodemask = nodemask;
@@ -4960,6 +4978,99 @@  static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
 	return true;
 }
 
+/*
+ * This is a batched version of the page allocator that attempts to
+ * allocate nr_pages quickly from the preferred zone and add them to list.
+ */
+int __alloc_pages_bulk_nodemask(gfp_t gfp_mask, int preferred_nid,
+			nodemask_t *nodemask, int nr_pages,
+			struct list_head *alloc_list)
+{
+	struct page *page;
+	unsigned long flags;
+	struct zone *zone;
+	struct zoneref *z;
+	struct per_cpu_pages *pcp;
+	struct list_head *pcp_list;
+	struct alloc_context ac;
+	gfp_t alloc_mask;
+	unsigned int alloc_flags;
+	int alloced = 0;
+
+	if (nr_pages == 1)
+		goto failed;
+
+	/* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
+	if (!prepare_alloc_pages(gfp_mask, 0, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
+		return 0;
+	gfp_mask = alloc_mask;
+
+	/* Find an allowed local zone that meets the high watermark. */
+	for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
+		unsigned long mark;
+
+		if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
+		    !__cpuset_zone_allowed(zone, gfp_mask)) {
+			continue;
+		}
+
+		if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone &&
+		    zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) {
+			goto failed;
+		}
+
+		mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
+		if (zone_watermark_fast(zone, 0,  mark,
+				zonelist_zone_idx(ac.preferred_zoneref),
+				alloc_flags, gfp_mask)) {
+			break;
+		}
+	}
+	if (!zone)
+		return 0;
+
+	/* Attempt the batch allocation */
+	local_irq_save(flags);
+	pcp = &this_cpu_ptr(zone->pageset)->pcp;
+	pcp_list = &pcp->lists[ac.migratetype];
+
+	while (alloced < nr_pages) {
+		page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags,
+								pcp, pcp_list);
+		if (!page)
+			break;
+
+		prep_new_page(page, 0, gfp_mask, 0);
+		list_add(&page->lru, alloc_list);
+		alloced++;
+	}
+
+	if (!alloced)
+		goto failed_irq;
+
+	if (alloced) {
+		__count_zid_vm_events(PGALLOC, zone_idx(zone), alloced);
+		zone_statistics(zone, zone);
+	}
+
+	local_irq_restore(flags);
+
+	return alloced;
+
+failed_irq:
+	local_irq_restore(flags);
+
+failed:
+	page = __alloc_pages_nodemask(gfp_mask, 0, preferred_nid, nodemask);
+	if (page) {
+		alloced++;
+		list_add(&page->lru, alloc_list);
+	}
+
+	return alloced;
+}
+EXPORT_SYMBOL_GPL(__alloc_pages_bulk_nodemask);
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
@@ -4981,8 +5092,6 @@  __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
 		return NULL;
 	}
 
-	gfp_mask &= gfp_allowed_mask;
-	alloc_mask = gfp_mask;
 	if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
 		return NULL;