
mm/page_alloc: call check_pcp_refill() while zone spinlock is not held

Message ID 20220313232547.3843690-1-eric.dumazet@gmail.com (mailing list archive)
State New
Series mm/page_alloc: call check_pcp_refill() while zone spinlock is not held

Commit Message

Eric Dumazet March 13, 2022, 11:25 p.m. UTC
From: Eric Dumazet <edumazet@google.com>

check_pcp_refill() is used from rmqueue_bulk() while zone spinlock
is held.

This used to be fine because check_pcp_refill() was testing only the
head page, whose 'struct page' was still very hot in the cpu caches.

With ("mm/page_alloc: check high-order pages for corruption during PCP
operations") check_pcp_refill() will add latencies for high order pages.

We can defer the calls to check_pcp_refill() until after the zone
spinlock has been released.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Wei Xu <weixugc@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Hugh Dickins <hughd@google.com>
---
 mm/page_alloc.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)
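
For readers skimming the thread, the following is a simplified reconstruction of what rmqueue_bulk() looks like with this patch applied, pieced together from the diff below; existing comments and unrelated details are trimmed, so treat it as an illustration rather than the exact resulting code.

static int rmqueue_bulk(struct zone *zone, unsigned int order,
			unsigned long count, struct list_head *list,
			int migratetype, unsigned int alloc_flags)
{
	struct page *page, *tmp;
	int i, allocated = 0;
	int free_cma_pages = 0;

	/* Caller holds the pcp local_lock with IRQs disabled. */
	spin_lock(&zone->lock);
	for (i = 0; i < count; ++i) {
		page = __rmqueue(zone, order, migratetype, alloc_flags);
		if (unlikely(page == NULL))
			break;

		/*
		 * Only list manipulation and counting remain under
		 * zone->lock; check_pcp_refill() is no longer called here.
		 */
		list_add_tail(&page->lru, list);
		allocated++;
	}

	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
	spin_unlock(&zone->lock);

	/*
	 * The potentially cache-cold validation is deferred to a second
	 * walk of the freshly filled list, after zone->lock is dropped.
	 */
	list_for_each_entry_safe(page, tmp, list, lru) {
		if (unlikely(check_pcp_refill(page))) {
			list_del(&page->lru);
			allocated--;
		} else if (is_migrate_cma(get_pcppage_migratetype(page))) {
			free_cma_pages++;
		}
	}
	if (free_cma_pages)
		__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
				      -(free_cma_pages << order));
	return allocated;
}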

Comments

Vlastimil Babka March 14, 2022, 9:34 a.m. UTC | #1
On 3/14/22 00:25, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> check_pcp_refill() is used from rmqueue_bulk() while zone spinlock
> is held.
> 
> This used to be fine because check_pcp_refill() was testing only the
> head page, while its 'struct page' was very hot in the cpu caches.
> 
> With ("mm/page_alloc: check high-order pages for corruption during PCP
> operations") check_pcp_refill() will add latencies for high order pages.
> 
> We can defer the calls to check_pcp_refill() after the zone
> spinlock has been released.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Mel Gorman <mgorman@techsingularity.net>
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: Shakeel Butt <shakeelb@google.com>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Vlastimil Babka <vbabka@suse.cz>
> Cc: Michal Hocko <mhocko@kernel.org>
> Cc: Wei Xu <weixugc@google.com>
> Cc: Greg Thelen <gthelen@google.com>
> Cc: Hugh Dickins <hughd@google.com>
> ---
>  mm/page_alloc.c | 21 +++++++++++++--------
>  1 file changed, 13 insertions(+), 8 deletions(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index c9ebf0635d592c6f58df9793ce9fa213371a9a7f..5f0531c11ad668b1c4426ebddc17821aca824783 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3024,7 +3024,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
>  			unsigned long count, struct list_head *list,
>  			int migratetype, unsigned int alloc_flags)
>  {
> +	struct page *page, *tmp;
>  	int i, allocated = 0;
> +	int free_cma_pages = 0;
>  
>  	/*
>  	 * local_lock_irq held so equivalent to spin_lock_irqsave for

Right so this should be AFAIU enough for
__mod_zone_page_state(NR_FREE_CMA_PAGES) at the end to be OK outside of
zone->lock.

> @@ -3032,14 +3034,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
>  	 */
>  	spin_lock(&zone->lock);
>  	for (i = 0; i < count; ++i) {
> -		struct page *page = __rmqueue(zone, order, migratetype,
> -								alloc_flags);
> +		page = __rmqueue(zone, order, migratetype, alloc_flags);
>  		if (unlikely(page == NULL))
>  			break;
>  
> -		if (unlikely(check_pcp_refill(page)))
> -			continue;
> -
>  		/*
>  		 * Split buddy pages returned by expand() are received here in
>  		 * physical page order. The page is added to the tail of
> @@ -3052,9 +3050,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
>  		 */
>  		list_add_tail(&page->lru, list);
>  		allocated++;
> -		if (is_migrate_cma(get_pcppage_migratetype(page)))
> -			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
> -					      -(1 << order));
>  	}
>  
>  	/*
> @@ -3065,6 +3060,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
>  	 */
>  	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
>  	spin_unlock(&zone->lock);

You could maybe even exchange those two lines as well?

> +	list_for_each_entry_safe(page, tmp, list, lru) {
> +		if (unlikely(check_pcp_refill(page))) {
> +			list_del(&page->lru);
> +			allocated--;
> +		} else if (is_migrate_cma(get_pcppage_migratetype(page))) {
> +			free_cma_pages++;

I think, in line with the (IMHO correct) decrementing of NR_FREE_CMA_PAGES
regardless of the check result, we should also count free_cma_pages here
unconditionally, before doing check_pcp_refill().

> +		}
> +	}
> +	if (free_cma_pages)
> +		__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(free_cma_pages << order));
>  	return allocated;
>  }
>
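
Putting Vlastimil's two suggestions together (also moving the NR_FREE_PAGES update outside zone->lock, and counting CMA pages before the outcome of check_pcp_refill() is known), the post-unlock part of rmqueue_bulk() might look roughly like the sketch below. This is only an illustration of the proposal, not code that was actually posted; as Vlastimil notes above, the local_lock held by the caller (with IRQs disabled) is what __mod_zone_page_state() actually needs, not zone->lock.

	spin_unlock(&zone->lock);
	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));

	list_for_each_entry_safe(page, tmp, list, lru) {
		/*
		 * Count CMA pages unconditionally: the page left the buddy
		 * free list either way, so NR_FREE_CMA_PAGES must drop even
		 * if the page is then rejected by check_pcp_refill().
		 */
		if (is_migrate_cma(get_pcppage_migratetype(page)))
			free_cma_pages++;
		if (unlikely(check_pcp_refill(page))) {
			list_del(&page->lru);
			allocated--;
		}
	}
	if (free_cma_pages)
		__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
				      -(free_cma_pages << order));
	return allocated;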
Mel Gorman March 14, 2022, 10 a.m. UTC | #2
On Sun, Mar 13, 2022 at 04:25:47PM -0700, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> check_pcp_refill() is used from rmqueue_bulk() while zone spinlock
> is held.
> 
> This used to be fine because check_pcp_refill() was testing only the
> head page, while its 'struct page' was very hot in the cpu caches.
> 
> With ("mm/page_alloc: check high-order pages for corruption during PCP
> operations") check_pcp_refill() will add latencies for high order pages.
> 
> We can defer the calls to check_pcp_refill() after the zone
> spinlock has been released.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>

I'm not a major fan. While this reduces the lock hold times, it adds
another list walk, which may make the overall operation more expensive;
that is probably a net loss given that the cold struct pages are still
accessed.

The lower lock hold times applies to high-order allocations which are
either THPs or SLAB refills. THP can be expensive anyway depending on
whether compaction had to be used and SLAB refills do not necessarily occur
for every SLAB allocation (although it is likely much more common for
network intensive workloads). This means that the patch may be helping
the uncommon case (high order alloc) at the cost of the common case
(order-0 alloc).

Because this incurs a second list walk cost to the common case, I think
the changelog would need justification that it does not hurt common paths
and that the lock hold reduction times make a meaningful difference.

Patch

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c9ebf0635d592c6f58df9793ce9fa213371a9a7f..5f0531c11ad668b1c4426ebddc17821aca824783 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3024,7 +3024,9 @@  static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			unsigned long count, struct list_head *list,
 			int migratetype, unsigned int alloc_flags)
 {
+	struct page *page, *tmp;
 	int i, allocated = 0;
+	int free_cma_pages = 0;
 
 	/*
 	 * local_lock_irq held so equivalent to spin_lock_irqsave for
@@ -3032,14 +3034,10 @@  static int rmqueue_bulk(struct zone *zone, unsigned int order,
 	 */
 	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
-		struct page *page = __rmqueue(zone, order, migratetype,
-								alloc_flags);
+		page = __rmqueue(zone, order, migratetype, alloc_flags);
 		if (unlikely(page == NULL))
 			break;
 
-		if (unlikely(check_pcp_refill(page)))
-			continue;
-
 		/*
 		 * Split buddy pages returned by expand() are received here in
 		 * physical page order. The page is added to the tail of
@@ -3052,9 +3050,6 @@  static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		 */
 		list_add_tail(&page->lru, list);
 		allocated++;
-		if (is_migrate_cma(get_pcppage_migratetype(page)))
-			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-					      -(1 << order));
 	}
 
 	/*
@@ -3065,6 +3060,16 @@  static int rmqueue_bulk(struct zone *zone, unsigned int order,
 	 */
 	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
 	spin_unlock(&zone->lock);
+	list_for_each_entry_safe(page, tmp, list, lru) {
+		if (unlikely(check_pcp_refill(page))) {
+			list_del(&page->lru);
+			allocated--;
+		} else if (is_migrate_cma(get_pcppage_migratetype(page))) {
+			free_cma_pages++;
+		}
+	}
+	if (free_cma_pages)
+		__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(free_cma_pages << order));
 	return allocated;
 }