[RFC,net-next/mm,V3,1/2] page_pool: Remove workqueue in new shutdown scheme

Message ID 168269857929.2191653.13267688321246766547.stgit@firesoul (mailing list archive)
State New
Series page_pool: new approach for leak detection and shutdown phase

Commit Message

Jesper Dangaard Brouer April 28, 2023, 4:16 p.m. UTC
This removes the workqueue scheme that periodically tests when
inflight reaches zero, such that page_pool memory can be freed.

This change adds code to fast-path free checking for a shutdown flags
bit after returning PP pages.

Performance is very important for PP, as the fast path is used for
XDP_DROP use-cases where NIC drivers recycle PP pages directly into PP
alloc cache.

This patch (since V3) shows zero impact on this fast path. Micro
benchmarked with [1] on an Intel CPU E5-1650 @3.60GHz. The slight code
reorg of likely() is deliberate.

[1] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/bench_page_pool_simple.c

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/net/page_pool.h |    9 +--
 net/core/page_pool.c    |  138 ++++++++++++++++++++++++++++++++++-------------
 2 files changed, 103 insertions(+), 44 deletions(-)

Comments

Toke Høiland-Jørgensen April 28, 2023, 9:38 p.m. UTC | #1
Jesper Dangaard Brouer <brouer@redhat.com> writes:

> This removes the workqueue scheme that periodically tests when
> inflight reach zero such that page_pool memory can be freed.
>
> This change adds code to fast-path free checking for a shutdown flags
> bit after returning PP pages.
>
> Performance is very important for PP, as the fast path is used for
> XDP_DROP use-cases where NIC drivers recycle PP pages directly into PP
> alloc cache.
>
> This patch (since V3) shows zero impact on this fast path. Micro
> benchmarked with [1] on Intel CPU E5-1650 @3.60GHz. The slight code
> reorg of likely() are deliberate.

Oh, you managed to get rid of the small difference you were seeing
before? Nice! :)

Just a few questions, see below:

> [1] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/bench_page_pool_simple.c
>
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> ---
>  include/net/page_pool.h |    9 +--
>  net/core/page_pool.c    |  138 ++++++++++++++++++++++++++++++++++-------------
>  2 files changed, 103 insertions(+), 44 deletions(-)
>
> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
> index c8ec2f34722b..a71c0f2695b0 100644
> --- a/include/net/page_pool.h
> +++ b/include/net/page_pool.h
> @@ -50,6 +50,9 @@
>  				 PP_FLAG_DMA_SYNC_DEV |\
>  				 PP_FLAG_PAGE_FRAG)
>  
> +/* Internal flag: PP in shutdown phase, waiting for inflight pages */
> +#define PP_FLAG_SHUTDOWN	BIT(8)
> +
>  /*
>   * Fast allocation side cache array/stack
>   *
> @@ -151,11 +154,6 @@ static inline u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
>  struct page_pool {
>  	struct page_pool_params p;
>  
> -	struct delayed_work release_dw;
> -	void (*disconnect)(void *);
> -	unsigned long defer_start;
> -	unsigned long defer_warn;
> -
>  	u32 pages_state_hold_cnt;
>  	unsigned int frag_offset;
>  	struct page *frag_page;
> @@ -165,6 +163,7 @@ struct page_pool {
>  	/* these stats are incremented while in softirq context */
>  	struct page_pool_alloc_stats alloc_stats;
>  #endif
> +	void (*disconnect)(void *);
>  	u32 xdp_mem_id;
>  
>  	/*
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index e212e9d7edcb..54bdd140b7bd 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -23,9 +23,6 @@
>  
>  #include <trace/events/page_pool.h>
>  
> -#define DEFER_TIME (msecs_to_jiffies(1000))
> -#define DEFER_WARN_INTERVAL (60 * HZ)
> -
>  #define BIAS_MAX	LONG_MAX
>  
>  #ifdef CONFIG_PAGE_POOL_STATS
> @@ -380,6 +377,10 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
>  	struct page *page;
>  	int i, nr_pages;
>  
> +	/* API usage BUG: PP in shutdown phase, cannot alloc new pages */
> +	if (WARN_ON(pool->p.flags & PP_FLAG_SHUTDOWN))
> +		return NULL;
> +
>  	/* Don't support bulk alloc for high-order pages */
>  	if (unlikely(pp_order))
>  		return __page_pool_alloc_page_order(pool, gfp);
> @@ -450,10 +451,9 @@ EXPORT_SYMBOL(page_pool_alloc_pages);
>   */
>  #define _distance(a, b)	(s32)((a) - (b))
>  
> -static s32 page_pool_inflight(struct page_pool *pool)
> +static s32 __page_pool_inflight(struct page_pool *pool,
> +				u32 hold_cnt, u32 release_cnt)
>  {
> -	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
> -	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
>  	s32 inflight;
>  
>  	inflight = _distance(hold_cnt, release_cnt);
> @@ -464,6 +464,17 @@ static s32 page_pool_inflight(struct page_pool *pool)
>  	return inflight;
>  }
>  
> +static s32 page_pool_inflight(struct page_pool *pool)
> +{
> +	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
> +	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
> +	return __page_pool_inflight(pool, hold_cnt, release_cnt);
> +}
> +
> +static int page_pool_free_attempt(struct page_pool *pool,
> +				  u32 hold_cnt, u32 release_cnt);
> +static u32 pp_read_hold_cnt(struct page_pool *pool);
> +
>  /* Disconnects a page (from a page_pool).  API users can have a need
>   * to disconnect a page (from a page_pool), to allow it to be used as
>   * a regular page (that will eventually be returned to the normal
> @@ -471,8 +482,10 @@ static s32 page_pool_inflight(struct page_pool *pool)
>   */
>  void page_pool_release_page(struct page_pool *pool, struct page *page)
>  {
> +	unsigned int flags = READ_ONCE(pool->p.flags);
>  	dma_addr_t dma;
> -	int count;
> +	u32 release_cnt;
> +	u32 hold_cnt;
>  
>  	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
>  		/* Always account for inflight pages, even if we didn't
> @@ -490,11 +503,15 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
>  skip_dma_unmap:
>  	page_pool_clear_pp_info(page);
>  
> -	/* This may be the last page returned, releasing the pool, so
> -	 * it is not safe to reference pool afterwards.
> -	 */
> -	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
> -	trace_page_pool_state_release(pool, page, count);
> +	if (flags & PP_FLAG_SHUTDOWN)
> +		hold_cnt = pp_read_hold_cnt(pool);
> +
> +	release_cnt = atomic_inc_return(&pool->pages_state_release_cnt);
> +	trace_page_pool_state_release(pool, page, release_cnt);
> +
> +	/* In shutdown phase, last page will free pool instance */
> +	if (flags & PP_FLAG_SHUTDOWN)
> +		page_pool_free_attempt(pool, hold_cnt, release_cnt);

I'm curious why you decided to keep the hold_cnt read separate from the
call to free attempt? Not a huge deal, and I'm fine with keeping it this
way, just curious if you have any functional reason that I missed, or if
you just prefer this style? :)

>  }
>  EXPORT_SYMBOL(page_pool_release_page);
>  
> @@ -535,7 +552,7 @@ static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
>  static bool page_pool_recycle_in_cache(struct page *page,
>  				       struct page_pool *pool)
>  {
> -	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
> +	if (pool->alloc.count == PP_ALLOC_CACHE_SIZE) {
>  		recycle_stat_inc(pool, cache_full);
>  		return false;
>  	}
> @@ -546,6 +563,8 @@ static bool page_pool_recycle_in_cache(struct page *page,
>  	return true;
>  }
>  
> +static void page_pool_empty_ring(struct page_pool *pool);
> +
>  /* If the page refcnt == 1, this will try to recycle the page.
>   * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
>   * the configured size min(dma_sync_size, pool->max_len).
> @@ -572,7 +591,8 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
>  			page_pool_dma_sync_for_device(pool, page,
>  						      dma_sync_size);
>  
> -		if (allow_direct && in_softirq() &&
> +		/* During PP shutdown, no direct recycle must occur */
> +		if (likely(allow_direct && in_softirq()) &&
>  		    page_pool_recycle_in_cache(page, pool))
>  			return NULL;
>  
> @@ -609,6 +629,8 @@ void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
>  		recycle_stat_inc(pool, ring_full);
>  		page_pool_return_page(pool, page);
>  	}
> +	if (page && pool->p.flags & PP_FLAG_SHUTDOWN)
> +		page_pool_empty_ring(pool);
>  }
>  EXPORT_SYMBOL(page_pool_put_defragged_page);
>  
> @@ -646,6 +668,9 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
>  	recycle_stat_add(pool, ring, i);
>  	page_pool_ring_unlock(pool);
>  
> +	if (pool->p.flags & PP_FLAG_SHUTDOWN)
> +		page_pool_empty_ring(pool);
> +
>  	/* Hopefully all pages was return into ptr_ring */
>  	if (likely(i == bulk_len))
>  		return;
> @@ -737,12 +762,18 @@ struct page *page_pool_alloc_frag(struct page_pool *pool,
>  }
>  EXPORT_SYMBOL(page_pool_alloc_frag);
>  
> +noinline
>  static void page_pool_empty_ring(struct page_pool *pool)
>  {
> -	struct page *page;
> +	struct page *page, *next;
> +
> +	next = ptr_ring_consume_bh(&pool->ring);
>  
>  	/* Empty recycle ring */
> -	while ((page = ptr_ring_consume_bh(&pool->ring))) {
> +	while (next) {
> +		page = next;
> +		next = ptr_ring_consume_bh(&pool->ring);
> +
>  		/* Verify the refcnt invariant of cached pages */
>  		if (!(page_ref_count(page) == 1))
>  			pr_crit("%s() page_pool refcnt %d violation\n",
> @@ -796,39 +827,36 @@ static void page_pool_scrub(struct page_pool *pool)
>  	page_pool_empty_ring(pool);
>  }
>  
> -static int page_pool_release(struct page_pool *pool)
> +/* Avoid inlining code to avoid speculative fetching cacheline */
> +noinline
> +static u32 pp_read_hold_cnt(struct page_pool *pool)
> +{
> +	return READ_ONCE(pool->pages_state_hold_cnt);
> +}
> +
> +noinline
> +static int page_pool_free_attempt(struct page_pool *pool,
> +				  u32 hold_cnt, u32 release_cnt)
>  {
>  	int inflight;
>  
> -	page_pool_scrub(pool);
> -	inflight = page_pool_inflight(pool);
> +	inflight = __page_pool_inflight(pool, hold_cnt, release_cnt);
>  	if (!inflight)
>  		page_pool_free(pool);
>  
>  	return inflight;
>  }
>  
> -static void page_pool_release_retry(struct work_struct *wq)
> +static int page_pool_release(struct page_pool *pool)
>  {
> -	struct delayed_work *dwq = to_delayed_work(wq);
> -	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
>  	int inflight;
>  
> -	inflight = page_pool_release(pool);
> +	page_pool_scrub(pool);
> +	inflight = page_pool_inflight(pool);
>  	if (!inflight)
> -		return;
> -
> -	/* Periodic warning */
> -	if (time_after_eq(jiffies, pool->defer_warn)) {
> -		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
> -
> -		pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
> -			__func__, inflight, sec);
> -		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
> -	}
> +		page_pool_free(pool);
>  
> -	/* Still not ready to be disconnected, retry later */
> -	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
> +	return inflight;
>  }
>  
>  void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
> @@ -856,6 +884,10 @@ EXPORT_SYMBOL(page_pool_unlink_napi);
>  
>  void page_pool_destroy(struct page_pool *pool)
>  {
> +	unsigned int flags;
> +	u32 release_cnt;
> +	u32 hold_cnt;
> +
>  	if (!pool)
>  		return;
>  
> @@ -868,11 +900,39 @@ void page_pool_destroy(struct page_pool *pool)
>  	if (!page_pool_release(pool))
>  		return;
>  
> -	pool->defer_start = jiffies;
> -	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
> +	/* PP have pages inflight, thus cannot immediately release memory.
> +	 * Enter into shutdown phase, depending on remaining in-flight PP
> +	 * pages to trigger shutdown process (on concurrent CPUs) and last
> +	 * page will free pool instance.
> +	 *
> +	 * There exist two race conditions here, we need to take into
> +	 * account in the following code.
> +	 *
> +	 * 1. Before setting PP_FLAG_SHUTDOWN another CPU released the last
> +	 *    pages into the ptr_ring.  Thus, it missed triggering shutdown
> +	 *    process, which can then be stalled forever.
> +	 *
> +	 * 2. After setting PP_FLAG_SHUTDOWN another CPU released the last
> +	 *    page, which triggered shutdown process and freed pool
> +	 *    instance. Thus, its not safe to dereference *pool afterwards.
> +	 *
> +	 * Handling races by holding a fake in-flight count, via
> +	 * artificially bumping pages_state_hold_cnt, which assures pool
> +	 * isn't freed under us.  For race(1) its safe to recheck ptr_ring
> +	 * (it will not free pool). Race(2) cannot happen, and we can
> +	 * release fake in-flight count as last step.
> +	 */
> +	hold_cnt = READ_ONCE(pool->pages_state_hold_cnt) + 1;
> +	smp_store_release(&pool->pages_state_hold_cnt, hold_cnt);
> +	barrier();
> +	flags = READ_ONCE(pool->p.flags) | PP_FLAG_SHUTDOWN;
> +	smp_store_release(&pool->p.flags, flags);

So in the memory barrier documentation, store_release() is usually
paired with read_acquire(), but the code reading the flag uses
READ_ONCE(). I'm not sure if those are equivalent? (As in, I am asking
more than I'm saying they're not; I find it difficult to keep these
things straight...)
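
(For reference, a minimal sketch of the pairing being asked about here,
not code from the patch: smp_store_release() on the writer side is
normally matched by smp_load_acquire() on the reader side, whereas a
plain READ_ONCE() only prevents load tearing/refetching and gives no
ordering against later loads of other variables.)

	/* Hypothetical writer (destroy path) */
	WRITE_ONCE(pool->pages_state_hold_cnt, hold_cnt);	/* A */
	smp_store_release(&pool->p.flags, flags);		/* publishes A */

	/* Hypothetical reader (page return path) */
	flags = smp_load_acquire(&pool->p.flags);	/* pairs with the release */
	if (flags & PP_FLAG_SHUTDOWN)
		hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);	/* sees A */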

-Toke
Jakub Kicinski May 3, 2023, 2:33 a.m. UTC | #2
On Fri, 28 Apr 2023 18:16:19 +0200 Jesper Dangaard Brouer wrote:
> This removes the workqueue scheme that periodically tests when
> inflight reach zero such that page_pool memory can be freed.
> 
> This change adds code to fast-path free checking for a shutdown flags
> bit after returning PP pages.

We can remove the warning without removing the entire delayed freeing
scheme. I definitely like the SHUTDOWN flag and patch 2 but I'm a bit
less clear on why the complexity of datapath freeing is justified.
Can you explain?

> Performance is very important for PP, as the fast path is used for
> XDP_DROP use-cases where NIC drivers recycle PP pages directly into PP
> alloc cache.
> 
> This patch (since V3) shows zero impact on this fast path. Micro
> benchmarked with [1] on Intel CPU E5-1650 @3.60GHz. The slight code
> reorg of likely() are deliberate.
Toke Høiland-Jørgensen May 3, 2023, 11:18 a.m. UTC | #3
Jakub Kicinski <kuba@kernel.org> writes:

> On Fri, 28 Apr 2023 18:16:19 +0200 Jesper Dangaard Brouer wrote:
>> This removes the workqueue scheme that periodically tests when
>> inflight reach zero such that page_pool memory can be freed.
>> 
>> This change adds code to fast-path free checking for a shutdown flags
>> bit after returning PP pages.
>
> We can remove the warning without removing the entire delayed freeing
> scheme. I definitely like the SHUTDOWN flag and patch 2 but I'm a bit
> less clear on why the complexity of datapath freeing is justified.
> Can you explain?

You mean just let the workqueue keep rescheduling itself every minute
for the (potentially) hours that skbs will stick around? Seems a bit
wasteful, doesn't it? :)

We did see an issue where creating and tearing down lots of page pools
in a short period of time caused significant slowdowns due to the
workqueue mechanism. Lots being "thousands per second". This is possible
using the live packet mode of bpf_prog_run() for XDP, which will set up
and destroy a page pool for each syscall...

-Toke
Jesper Dangaard Brouer May 3, 2023, 3:21 p.m. UTC | #4
On 28/04/2023 23.38, Toke Høiland-Jørgensen wrote:
> Jesper Dangaard Brouer <brouer@redhat.com> writes:
> 
>> This removes the workqueue scheme that periodically tests when
>> inflight reach zero such that page_pool memory can be freed.
>>
>> This change adds code to fast-path free checking for a shutdown flags
>> bit after returning PP pages.
>>
>> Performance is very important for PP, as the fast path is used for
>> XDP_DROP use-cases where NIC drivers recycle PP pages directly into PP
>> alloc cache.
>>
>> This patch (since V3) shows zero impact on this fast path. Micro
>> benchmarked with [1] on Intel CPU E5-1650 @3.60GHz. The slight code
>> reorg of likely() are deliberate.
> 
> Oh, you managed to get rid of the small difference you were seeing
> before? Nice! :)

Yes! - fast path is not affected at all!!! :-)
This is due to this small change in the V3.

   +	if (page && pool->p.flags & PP_FLAG_SHUTDOWN)
   +		page_pool_empty_ring(pool);

Due to the V3 change of moving the free_attempt to page_pool_release_page(),
the page_pool_empty_ring() call (in page_pool_put_defragged_page)
should only be made when (page != NULL).  This moves the ASM code for
the PP_FLAG_SHUTDOWN check out of the fast-path.
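
Concretely, with that hunk the tail of page_pool_put_defragged_page()
reads roughly as below; this is reconstructed from the diff context, so
treat it as a sketch rather than the exact tree:

	void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
					  unsigned int dma_sync_size, bool allow_direct)
	{
		page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
		if (page && !page_pool_recycle_in_ring(pool, page)) {
			/* Cache full, fallback to free pages */
			recycle_stat_inc(pool, ring_full);
			page_pool_return_page(pool, page);
		}
		/* page == NULL when it was recycled directly into the alloc
		 * cache (the XDP_DROP fast path) or released inside
		 * __page_pool_put_page(), so the SHUTDOWN test below stays
		 * off that fast path.
		 */
		if (page && pool->p.flags & PP_FLAG_SHUTDOWN)
			page_pool_empty_ring(pool);
	}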

I also measured the slightly slower path, recycling into the ptr_ring,
where I cannot really see any effect.  It is easier to understand
why/when this is important when looking at the measured 0.3 nanosec
overhead as a percentage: for the fast-path this would be approx 7%,
while for this recycle path it is approx 1.8%.


> 
> Just a few questions, see below:
> 
>> [1] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/bench_page_pool_simple.c
>>
>> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
>> ---
>>   include/net/page_pool.h |    9 +--
>>   net/core/page_pool.c    |  138 ++++++++++++++++++++++++++++++++++-------------
>>   2 files changed, 103 insertions(+), 44 deletions(-)
>>
>> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
>> index c8ec2f34722b..a71c0f2695b0 100644
>> --- a/include/net/page_pool.h
>> +++ b/include/net/page_pool.h
>> @@ -50,6 +50,9 @@
>>   				 PP_FLAG_DMA_SYNC_DEV |\
>>   				 PP_FLAG_PAGE_FRAG)
>>   
>> +/* Internal flag: PP in shutdown phase, waiting for inflight pages */
>> +#define PP_FLAG_SHUTDOWN	BIT(8)
>> +
>>   /*
>>    * Fast allocation side cache array/stack
>>    *
>> @@ -151,11 +154,6 @@ static inline u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
>>   struct page_pool {
>>   	struct page_pool_params p;
>>   
>> -	struct delayed_work release_dw;
>> -	void (*disconnect)(void *);
>> -	unsigned long defer_start;
>> -	unsigned long defer_warn;
>> -
>>   	u32 pages_state_hold_cnt;
>>   	unsigned int frag_offset;
>>   	struct page *frag_page;
>> @@ -165,6 +163,7 @@ struct page_pool {
>>   	/* these stats are incremented while in softirq context */
>>   	struct page_pool_alloc_stats alloc_stats;
>>   #endif
>> +	void (*disconnect)(void *);
>>   	u32 xdp_mem_id;
>>   
>>   	/*
>> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
>> index e212e9d7edcb..54bdd140b7bd 100644
>> --- a/net/core/page_pool.c
>> +++ b/net/core/page_pool.c
>> @@ -23,9 +23,6 @@
>>   
>>   #include <trace/events/page_pool.h>
>>   
>> -#define DEFER_TIME (msecs_to_jiffies(1000))
>> -#define DEFER_WARN_INTERVAL (60 * HZ)
>> -
>>   #define BIAS_MAX	LONG_MAX
>>   
>>   #ifdef CONFIG_PAGE_POOL_STATS
>> @@ -380,6 +377,10 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
>>   	struct page *page;
>>   	int i, nr_pages;
>>   
>> +	/* API usage BUG: PP in shutdown phase, cannot alloc new pages */
>> +	if (WARN_ON(pool->p.flags & PP_FLAG_SHUTDOWN))
>> +		return NULL;
>> +
>>   	/* Don't support bulk alloc for high-order pages */
>>   	if (unlikely(pp_order))
>>   		return __page_pool_alloc_page_order(pool, gfp);
>> @@ -450,10 +451,9 @@ EXPORT_SYMBOL(page_pool_alloc_pages);
>>    */
>>   #define _distance(a, b)	(s32)((a) - (b))
>>   
>> -static s32 page_pool_inflight(struct page_pool *pool)
>> +static s32 __page_pool_inflight(struct page_pool *pool,
>> +				u32 hold_cnt, u32 release_cnt)
>>   {
>> -	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
>> -	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
>>   	s32 inflight;
>>   
>>   	inflight = _distance(hold_cnt, release_cnt);
>> @@ -464,6 +464,17 @@ static s32 page_pool_inflight(struct page_pool *pool)
>>   	return inflight;
>>   }
>>   
>> +static s32 page_pool_inflight(struct page_pool *pool)
>> +{
>> +	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
>> +	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
>> +	return __page_pool_inflight(pool, hold_cnt, release_cnt);
>> +}
>> +
>> +static int page_pool_free_attempt(struct page_pool *pool,
>> +				  u32 hold_cnt, u32 release_cnt);
>> +static u32 pp_read_hold_cnt(struct page_pool *pool);
>> +
>>   /* Disconnects a page (from a page_pool).  API users can have a need
>>    * to disconnect a page (from a page_pool), to allow it to be used as
>>    * a regular page (that will eventually be returned to the normal
>> @@ -471,8 +482,10 @@ static s32 page_pool_inflight(struct page_pool *pool)
>>    */
>>   void page_pool_release_page(struct page_pool *pool, struct page *page)
>>   {
>> +	unsigned int flags = READ_ONCE(pool->p.flags);
>>   	dma_addr_t dma;
>> -	int count;
>> +	u32 release_cnt;
>> +	u32 hold_cnt;
>>   
>>   	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
>>   		/* Always account for inflight pages, even if we didn't
>> @@ -490,11 +503,15 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
>>   skip_dma_unmap:
>>   	page_pool_clear_pp_info(page);
>>   
>> -	/* This may be the last page returned, releasing the pool, so
>> -	 * it is not safe to reference pool afterwards.
>> -	 */
>> -	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
>> -	trace_page_pool_state_release(pool, page, count);
>> +	if (flags & PP_FLAG_SHUTDOWN)
>> +		hold_cnt = pp_read_hold_cnt(pool);
>> +
>> +	release_cnt = atomic_inc_return(&pool->pages_state_release_cnt);
>> +	trace_page_pool_state_release(pool, page, release_cnt);
>> +
>> +	/* In shutdown phase, last page will free pool instance */
>> +	if (flags & PP_FLAG_SHUTDOWN)
>> +		page_pool_free_attempt(pool, hold_cnt, release_cnt);
> 
> I'm curious why you decided to keep the hold_cnt read separate from the
> call to free attempt? Not a huge deal, and I'm fine with keeping it this
> way, just curious if you have any functional reason that I missed, or if
> you just prefer this style? :)

You seem to have missed my explanation in V2 reply:
 
https://lore.kernel.org/all/f671f5da-d9bc-a559-2120-10c3491e6f6d@redhat.com/

> 
>>   }
>>   EXPORT_SYMBOL(page_pool_release_page);
>>   
>> @@ -535,7 +552,7 @@ static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
>>   static bool page_pool_recycle_in_cache(struct page *page,
>>   				       struct page_pool *pool)
>>   {
>> -	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
>> +	if (pool->alloc.count == PP_ALLOC_CACHE_SIZE) {
>>   		recycle_stat_inc(pool, cache_full);
>>   		return false;
>>   	}
>> @@ -546,6 +563,8 @@ static bool page_pool_recycle_in_cache(struct page *page,
>>   	return true;
>>   }
>>   
>> +static void page_pool_empty_ring(struct page_pool *pool);
>> +
>>   /* If the page refcnt == 1, this will try to recycle the page.
>>    * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
>>    * the configured size min(dma_sync_size, pool->max_len).
>> @@ -572,7 +591,8 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
>>   			page_pool_dma_sync_for_device(pool, page,
>>   						      dma_sync_size);
>>   
>> -		if (allow_direct && in_softirq() &&
>> +		/* During PP shutdown, no direct recycle must occur */
>> +		if (likely(allow_direct && in_softirq()) &&
>>   		    page_pool_recycle_in_cache(page, pool))
>>   			return NULL;
>>   
>> @@ -609,6 +629,8 @@ void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
>>   		recycle_stat_inc(pool, ring_full);
>>   		page_pool_return_page(pool, page);
>>   	}
>> +	if (page && pool->p.flags & PP_FLAG_SHUTDOWN)
>> +		page_pool_empty_ring(pool);
>>   }
>>   EXPORT_SYMBOL(page_pool_put_defragged_page);
>>   
>> @@ -646,6 +668,9 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
>>   	recycle_stat_add(pool, ring, i);
>>   	page_pool_ring_unlock(pool);
>>   
>> +	if (pool->p.flags & PP_FLAG_SHUTDOWN)
>> +		page_pool_empty_ring(pool);
>> +
>>   	/* Hopefully all pages was return into ptr_ring */
>>   	if (likely(i == bulk_len))
>>   		return;
>> @@ -737,12 +762,18 @@ struct page *page_pool_alloc_frag(struct page_pool *pool,
>>   }
>>   EXPORT_SYMBOL(page_pool_alloc_frag);
>>   
>> +noinline
>>   static void page_pool_empty_ring(struct page_pool *pool)
>>   {
>> -	struct page *page;
>> +	struct page *page, *next;
>> +
>> +	next = ptr_ring_consume_bh(&pool->ring);
>>   
>>   	/* Empty recycle ring */
>> -	while ((page = ptr_ring_consume_bh(&pool->ring))) {
>> +	while (next) {
>> +		page = next;
>> +		next = ptr_ring_consume_bh(&pool->ring);
>> +
>>   		/* Verify the refcnt invariant of cached pages */
>>   		if (!(page_ref_count(page) == 1))
>>   			pr_crit("%s() page_pool refcnt %d violation\n",
>> @@ -796,39 +827,36 @@ static void page_pool_scrub(struct page_pool *pool)
>>   	page_pool_empty_ring(pool);
>>   }
>>   
>> -static int page_pool_release(struct page_pool *pool)
>> +/* Avoid inlining code to avoid speculative fetching cacheline */
>> +noinline
>> +static u32 pp_read_hold_cnt(struct page_pool *pool)
>> +{
>> +	return READ_ONCE(pool->pages_state_hold_cnt);
>> +}
>> +
>> +noinline
>> +static int page_pool_free_attempt(struct page_pool *pool,
>> +				  u32 hold_cnt, u32 release_cnt)
>>   {
>>   	int inflight;
>>   
>> -	page_pool_scrub(pool);
>> -	inflight = page_pool_inflight(pool);
>> +	inflight = __page_pool_inflight(pool, hold_cnt, release_cnt);
>>   	if (!inflight)
>>   		page_pool_free(pool);
>>   
>>   	return inflight;
>>   }
>>   
>> -static void page_pool_release_retry(struct work_struct *wq)
>> +static int page_pool_release(struct page_pool *pool)
>>   {
>> -	struct delayed_work *dwq = to_delayed_work(wq);
>> -	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
>>   	int inflight;
>>   
>> -	inflight = page_pool_release(pool);
>> +	page_pool_scrub(pool);
>> +	inflight = page_pool_inflight(pool);
>>   	if (!inflight)
>> -		return;
>> -
>> -	/* Periodic warning */
>> -	if (time_after_eq(jiffies, pool->defer_warn)) {
>> -		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
>> -
>> -		pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
>> -			__func__, inflight, sec);
>> -		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
>> -	}
>> +		page_pool_free(pool);
>>   
>> -	/* Still not ready to be disconnected, retry later */
>> -	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
>> +	return inflight;
>>   }
>>   
>>   void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
>> @@ -856,6 +884,10 @@ EXPORT_SYMBOL(page_pool_unlink_napi);
>>   
>>   void page_pool_destroy(struct page_pool *pool)
>>   {
>> +	unsigned int flags;
>> +	u32 release_cnt;
>> +	u32 hold_cnt;
>> +
>>   	if (!pool)
>>   		return;
>>   
>> @@ -868,11 +900,39 @@ void page_pool_destroy(struct page_pool *pool)
>>   	if (!page_pool_release(pool))
>>   		return;
>>   
>> -	pool->defer_start = jiffies;
>> -	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
>> +	/* PP have pages inflight, thus cannot immediately release memory.
>> +	 * Enter into shutdown phase, depending on remaining in-flight PP
>> +	 * pages to trigger shutdown process (on concurrent CPUs) and last
>> +	 * page will free pool instance.
>> +	 *
>> +	 * There exist two race conditions here, we need to take into
>> +	 * account in the following code.
>> +	 *
>> +	 * 1. Before setting PP_FLAG_SHUTDOWN another CPU released the last
>> +	 *    pages into the ptr_ring.  Thus, it missed triggering shutdown
>> +	 *    process, which can then be stalled forever.
>> +	 *
>> +	 * 2. After setting PP_FLAG_SHUTDOWN another CPU released the last
>> +	 *    page, which triggered shutdown process and freed pool
>> +	 *    instance. Thus, its not safe to dereference *pool afterwards.
>> +	 *
>> +	 * Handling races by holding a fake in-flight count, via
>> +	 * artificially bumping pages_state_hold_cnt, which assures pool
>> +	 * isn't freed under us.  For race(1) its safe to recheck ptr_ring
>> +	 * (it will not free pool). Race(2) cannot happen, and we can
>> +	 * release fake in-flight count as last step.
>> +	 */
>> +	hold_cnt = READ_ONCE(pool->pages_state_hold_cnt) + 1;
>> +	smp_store_release(&pool->pages_state_hold_cnt, hold_cnt);
>> +	barrier();
>> +	flags = READ_ONCE(pool->p.flags) | PP_FLAG_SHUTDOWN;
>> +	smp_store_release(&pool->p.flags, flags);
> 
> So in the memory barrier documentation, store_release() is usually
> paired with read_acquire(), but the code reading the flag uses
> READ_ONCE(). I'm not sure if those are equivalent? (As in, I am asking
> more than I'm saying they're not; I find it difficult to keep these
> things straight...)
> 

I was hoping someone could help us say what barrier to use here?

--Jesper
Jesper Dangaard Brouer May 3, 2023, 3:49 p.m. UTC | #5
On 03/05/2023 13.18, Toke Høiland-Jørgensen wrote:
> Jakub Kicinski <kuba@kernel.org> writes:
> 
>> On Fri, 28 Apr 2023 18:16:19 +0200 Jesper Dangaard Brouer wrote:
>>> This removes the workqueue scheme that periodically tests when
>>> inflight reach zero such that page_pool memory can be freed.
>>>
>>> This change adds code to fast-path free checking for a shutdown flags
>>> bit after returning PP pages.
>>
>> We can remove the warning without removing the entire delayed freeing
>> scheme. I definitely like the SHUTDOWN flag and patch 2 but I'm a bit
>> less clear on why the complexity of datapath freeing is justified.
>> Can you explain?
> 
> You mean just let the workqueue keep rescheduling itself every minute
> for the (potentially) hours that skbs will stick around? Seems a bit
> wasteful, doesn't it? :)

I agree that this workqueue that keeps rescheduling is wasteful.
It actually reschedules every second, even more wasteful.
NIC drivers will have many HW RX-queues, with separate PP instances,
each of which can start a workqueue that reschedules every second.

Eric has convinced me that SKBs can "stick around" for longer than the
assumptions in PP.  The old PP assumptions came from the XDP-return path.
It is time to clean up.

> 
> We did see an issue where creating and tearing down lots of page pools
> in a short period of time caused significant slowdowns due to the
> workqueue mechanism. Lots being "thousands per second". This is possible
> using the live packet mode of bpf_prog_run() for XDP, which will setup
> and destroy a page pool for each syscall...

Yes, the XDP live packet mode of bpf_prog_run is IMHO abusing the
page_pool API.  We should fix that somehow, at least the case where live
packet mode is only injecting a single packet, but still creates a PP
instance. The PP in live packet mode IMHO only makes sense when
repeatedly sending packets that get recycled and are pre-inited by PP.

This use of PP does exemplify why it is problematic to keep the workqueue.

I have considered (and could be convinced of) delaying the free via
call_rcu, but it also creates an unfortunate backlog of work in the case
of the live packet mode of bpf_prog_run.

--Jesper
Jakub Kicinski May 4, 2023, 1:47 a.m. UTC | #6
On Wed, 3 May 2023 17:49:34 +0200 Jesper Dangaard Brouer wrote:
> On 03/05/2023 13.18, Toke Høiland-Jørgensen wrote:
> > Jakub Kicinski <kuba@kernel.org> writes:
> >> We can remove the warning without removing the entire delayed freeing
> >> scheme. I definitely like the SHUTDOWN flag and patch 2 but I'm a bit
> >> less clear on why the complexity of datapath freeing is justified.
> >> Can you explain?  
> > 
> > You mean just let the workqueue keep rescheduling itself every minute
> > for the (potentially) hours that skbs will stick around? Seems a bit
> > wasteful, doesn't it? :)  
> 
> I agree that this workqueue that keeps rescheduling is wasteful.
> It actually reschedules every second, even more wasteful.
> NIC drivers will have many HW RX-queues, with separate PP instances, 
> that each can start a workqueue that resched every sec.

There's a lot of work items flying around on a working system.
I don't think the rare (and very cheap) PP check work is going 
to move the needle. You should see how many work items DIM schedules :(

I'd think that potentially having the extra memory pinned is much more
of an issue than a periodic check, and that does not go away by
changing the checking mechanism.

> Eric have convinced me that SKBs can "stick around" for longer than the
> assumptions in PP.  The old PP assumptions came from XDP-return path.
> It is time to cleanup.

I see. Regardless - should we have some count/statistic for the number
of "zombie" page pools sitting around in the system? Could be useful
for debug.

> > We did see an issue where creating and tearing down lots of page pools
> > in a short period of time caused significant slowdowns due to the
> > workqueue mechanism. Lots being "thousands per second". This is possible
> > using the live packet mode of bpf_prog_run() for XDP, which will setup
> > and destroy a page pool for each syscall...  
> 
> Yes, the XDP live packet mode of bpf_prog_run is IMHO abusing the
> page_pool API.  We should fix that somehow, at least the case where live
> packet mode is only injecting a single packet, but still creates a PP
> instance. The PP in live packet mode IMHO only makes sense when
> repeatedly sending packets that gets recycles and are pre-inited by PP.

+1, FWIW, I was surprised that we have an init_callback which sits in
the fast path and exists purely for testing.

> This use of PP does exemplify why is it problematic to keep the workqueue.
> 
> I have considered (and could be convinced) delaying the free via
> call_rcu, but it also create an unfortunate backlog of work in the case
> of live packet mode of bpf_prog_run.

Maybe let the pp used by BPF testing be reusable? No real driver will
create thousands of PPs a second, that's not sane.

Anyway, you'll choose what you'll choose. I just wanted to cast my vote
for the work queue rather than the tricky lockless release code.
Yunsheng Lin May 4, 2023, 2:42 a.m. UTC | #7
On 2023/4/29 0:16, Jesper Dangaard Brouer wrote:
>  void page_pool_release_page(struct page_pool *pool, struct page *page)
>  {
> +	unsigned int flags = READ_ONCE(pool->p.flags);
>  	dma_addr_t dma;
> -	int count;
> +	u32 release_cnt;
> +	u32 hold_cnt;
>  
>  	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
>  		/* Always account for inflight pages, even if we didn't
> @@ -490,11 +503,15 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
>  skip_dma_unmap:
>  	page_pool_clear_pp_info(page);
>  
> -	/* This may be the last page returned, releasing the pool, so
> -	 * it is not safe to reference pool afterwards.
> -	 */
> -	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
> -	trace_page_pool_state_release(pool, page, count);

There is a time window between "unsigned int flags = READ_ONCE(pool->p.flags)"
and the flags checking; if page_pool_destroy() is called concurrently during
that time window, it seems we will have a pp instance leaking problem here?

It seems it is very hard to avoid this kind of corner case when using both
flags & PP_FLAG_SHUTDOWN and release_cnt/hold_cnt checking to decide if the
pp instance can be freed.
Can we use something like biased reference counting, which is used by the
frag support in page pool? That way we only need to check one variable and
avoid cache bouncing as much as possible.
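
(For context, the biased-refcount pattern referred to is roughly: set one
large bias on the counter up front, let consumers only ever decrement it,
and subtract the unused part of the bias in a single step at the end, so
exactly one party sees the counter hit zero. A minimal generic sketch with
hypothetical names, not the actual page_pool frag code:)

	#define BIAS		(LONG_MAX / 2)

	static atomic_long_t cnt;

	static void producer_start(void)
	{
		atomic_long_set(&cnt, BIAS);		/* pre-charge the bias */
	}

	static void consumer_put(void)			/* one call per page */
	{
		if (atomic_long_dec_and_test(&cnt))
			free_instance();		/* hypothetical */
	}

	static void producer_finish(long pages_handed_out)
	{
		/* Drop the unused bias in one step; whoever brings cnt to
		 * zero (here or a late consumer_put()) frees the instance.
		 */
		if (atomic_long_sub_and_test(BIAS - pages_handed_out, &cnt))
			free_instance();		/* hypothetical */
	}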

> +	if (flags & PP_FLAG_SHUTDOWN)
> +		hold_cnt = pp_read_hold_cnt(pool);
> +
> +	release_cnt = atomic_inc_return(&pool->pages_state_release_cnt);
> +	trace_page_pool_state_release(pool, page, release_cnt);
> +
> +	/* In shutdown phase, last page will free pool instance */
> +	if (flags & PP_FLAG_SHUTDOWN)
> +		page_pool_free_attempt(pool, hold_cnt, release_cnt);
>  }
>  EXPORT_SYMBOL(page_pool_release_page);
> 

...

>  
>  void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
> @@ -856,6 +884,10 @@ EXPORT_SYMBOL(page_pool_unlink_napi);
>  
>  void page_pool_destroy(struct page_pool *pool)
>  {
> +	unsigned int flags;
> +	u32 release_cnt;
> +	u32 hold_cnt;
> +
>  	if (!pool)
>  		return;
>  
> @@ -868,11 +900,39 @@ void page_pool_destroy(struct page_pool *pool)
>  	if (!page_pool_release(pool))
>  		return;
>  
> -	pool->defer_start = jiffies;
> -	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
> +	/* PP have pages inflight, thus cannot immediately release memory.
> +	 * Enter into shutdown phase, depending on remaining in-flight PP
> +	 * pages to trigger shutdown process (on concurrent CPUs) and last
> +	 * page will free pool instance.
> +	 *
> +	 * There exist two race conditions here, we need to take into
> +	 * account in the following code.
> +	 *
> +	 * 1. Before setting PP_FLAG_SHUTDOWN another CPU released the last
> +	 *    pages into the ptr_ring.  Thus, it missed triggering shutdown
> +	 *    process, which can then be stalled forever.
> +	 *
> +	 * 2. After setting PP_FLAG_SHUTDOWN another CPU released the last
> +	 *    page, which triggered shutdown process and freed pool
> +	 *    instance. Thus, its not safe to dereference *pool afterwards.
> +	 *
> +	 * Handling races by holding a fake in-flight count, via
> +	 * artificially bumping pages_state_hold_cnt, which assures pool
> +	 * isn't freed under us.  For race(1) its safe to recheck ptr_ring
> +	 * (it will not free pool). Race(2) cannot happen, and we can
> +	 * release fake in-flight count as last step.
> +	 */
> +	hold_cnt = READ_ONCE(pool->pages_state_hold_cnt) + 1;
> +	smp_store_release(&pool->pages_state_hold_cnt, hold_cnt);

I assume the smp_store_release() is used to ensure the correct order
between the above store operations?
There is a data dependency between those two store operations, so do we
really need the smp_store_release() here?

> +	barrier();
> +	flags = READ_ONCE(pool->p.flags) | PP_FLAG_SHUTDOWN;

Do we need a stronger barrier like smp_rmb() to prevent cpu from
executing "flags = READ_ONCE(pool->p.flags) | PP_FLAG_SHUTDOWN"
before "smp_store_release(&pool->pages_state_hold_cnt, hold_cnt)"
even if there is a smp_store_release() barrier here?

> +	smp_store_release(&pool->p.flags, flags);
> +
> +	/* Concurrent CPUs could have returned last pages into ptr_ring */
> +	page_pool_empty_ring(pool);
>  
> -	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
> -	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
> +	release_cnt = atomic_inc_return(&pool->pages_state_release_cnt);
> +	page_pool_free_attempt(pool, hold_cnt, release_cnt);
>  }
>  EXPORT_SYMBOL(page_pool_destroy);
>  
> 
> 
> .
>
Jesper Dangaard Brouer May 4, 2023, 1:48 p.m. UTC | #8
On 04/05/2023 04.42, Yunsheng Lin wrote:
> On 2023/4/29 0:16, Jesper Dangaard Brouer wrote:
>>   void page_pool_release_page(struct page_pool *pool, struct page *page)
>>   {
>> +	unsigned int flags = READ_ONCE(pool->p.flags);
>>   	dma_addr_t dma;
>> -	int count;
>> +	u32 release_cnt;
>> +	u32 hold_cnt;
>>   
>>   	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
>>   		/* Always account for inflight pages, even if we didn't
>> @@ -490,11 +503,15 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
>>   skip_dma_unmap:
>>   	page_pool_clear_pp_info(page);
>>   
>> -	/* This may be the last page returned, releasing the pool, so
>> -	 * it is not safe to reference pool afterwards.
>> -	 */
>> -	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
>> -	trace_page_pool_state_release(pool, page, count);
> 
> There is a time window between "unsigned int flags = READ_ONCE(pool->p.flags)"
> and flags checking, if page_pool_destroy() is called concurrently during that
> time window, it seems we will have a pp instance leaking problem here?
> 

Nope, that is resolved by the code changes in page_pool_destroy(), see 
below.

> It seems it is very hard to aovid this kind of corner case when using both
> flags & PP_FLAG_SHUTDOWN and release_cnt/hold_cnt checking to decide if pp
> instance can be freed.
> Can we use something like biased reference counting, which used by frag support
> in page pool? So that we only need to check only one variable and avoid cache
> bouncing as much as possible.
> 

See below; I believe we are doing an equivalent refcnt bias trick that
solves these corner cases in page_pool_destroy().
In short: hold_cnt is increased prior to setting PP_FLAG_SHUTDOWN.
Thus, if this code READ_ONCEs the flags without PP_FLAG_SHUTDOWN set, we
know it will not be the last to release pool->pages_state_release_cnt.
Below: Perhaps we should add an RCU grace period to make absolutely
sure that this code completes before the page_pool_destroy() call completes.


>> +	if (flags & PP_FLAG_SHUTDOWN)
>> +		hold_cnt = pp_read_hold_cnt(pool);
>> +

I would like to avoid the above code, and I'm considering using call_rcu(),
which I think will resolve the race[0] this code deals with.
As I explained here[0], this code deals with another kind of race.

  [0] 
https://lore.kernel.org/all/f671f5da-d9bc-a559-2120-10c3491e6f6d@redhat.com/

>> +	release_cnt = atomic_inc_return(&pool->pages_state_release_cnt);
>> +	trace_page_pool_state_release(pool, page, release_cnt);
>> +
>> +	/* In shutdown phase, last page will free pool instance */
>> +	if (flags & PP_FLAG_SHUTDOWN)
>> +		page_pool_free_attempt(pool, hold_cnt, release_cnt);
>>   }
>>   EXPORT_SYMBOL(page_pool_release_page);
>>
> 
> ...
> 
>>   
>>   void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
>> @@ -856,6 +884,10 @@ EXPORT_SYMBOL(page_pool_unlink_napi);
>>   
>>   void page_pool_destroy(struct page_pool *pool)
>>   {
>> +	unsigned int flags;
>> +	u32 release_cnt;
>> +	u32 hold_cnt;
>> +
>>   	if (!pool)
>>   		return;
>>   
>> @@ -868,11 +900,39 @@ void page_pool_destroy(struct page_pool *pool)
>>   	if (!page_pool_release(pool))
>>   		return;
>>   
>> -	pool->defer_start = jiffies;
>> -	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
>> +	/* PP have pages inflight, thus cannot immediately release memory.
>> +	 * Enter into shutdown phase, depending on remaining in-flight PP
>> +	 * pages to trigger shutdown process (on concurrent CPUs) and last
>> +	 * page will free pool instance.
>> +	 *
>> +	 * There exist two race conditions here, we need to take into
>> +	 * account in the following code.
>> +	 *
>> +	 * 1. Before setting PP_FLAG_SHUTDOWN another CPU released the last
>> +	 *    pages into the ptr_ring.  Thus, it missed triggering shutdown
>> +	 *    process, which can then be stalled forever.
>> +	 *
>> +	 * 2. After setting PP_FLAG_SHUTDOWN another CPU released the last
>> +	 *    page, which triggered shutdown process and freed pool
>> +	 *    instance. Thus, its not safe to dereference *pool afterwards.
>> +	 *
>> +	 * Handling races by holding a fake in-flight count, via
>> +	 * artificially bumping pages_state_hold_cnt, which assures pool
>> +	 * isn't freed under us.  For race(1) its safe to recheck ptr_ring
>> +	 * (it will not free pool). Race(2) cannot happen, and we can
>> +	 * release fake in-flight count as last step.
>> +	 */
>> +	hold_cnt = READ_ONCE(pool->pages_state_hold_cnt) + 1;
>> +	smp_store_release(&pool->pages_state_hold_cnt, hold_cnt);
> 
> I assume the smp_store_release() is used to ensure the correct order
> between the above store operations?
> There is data dependency between those two store operations, do we
> really need the smp_store_release() here?
> 
>> +	barrier();
>> +	flags = READ_ONCE(pool->p.flags) | PP_FLAG_SHUTDOWN;
> 
> Do we need a stronger barrier like smp_rmb() to prevent cpu from
> executing "flags = READ_ONCE(pool->p.flags) | PP_FLAG_SHUTDOWN"
> before "smp_store_release(&pool->pages_state_hold_cnt, hold_cnt)"
> even if there is a smp_store_release() barrier here?
> 
I do see your point and how it is related to your above comment for
page_pool_release_page().

I think we need to replace barrier() with synchronize_rcu().
Meaning we add an RCU grace period to "wait" for the above code (in
page_pool_release_page) that read the old flags value to complete.


>> +	smp_store_release(&pool->p.flags, flags);

When doing a synchronize_rcu(), I assume this smp_store_release() is
overkill, right?
Will a WRITE_ONCE() be sufficient?

Hmm, the synchronize_rcu(), shouldn't that be *after* storing the flags?
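
A sketch of the ordering being discussed (hypothetical, not the posted
patch; it assumes concurrent page_pool_release_page() callers run in
softirq/BH context or under rcu_read_lock(), so that synchronize_rcu()
waits for any caller that read the old flags value):

	hold_cnt = READ_ONCE(pool->pages_state_hold_cnt) + 1; /* fake in-flight */
	WRITE_ONCE(pool->pages_state_hold_cnt, hold_cnt);
	WRITE_ONCE(pool->p.flags, pool->p.flags | PP_FLAG_SHUTDOWN);

	synchronize_rcu();	/* readers of the old flags have completed */

	/* Concurrent CPUs could have returned last pages into ptr_ring */
	page_pool_empty_ring(pool);

	release_cnt = atomic_inc_return(&pool->pages_state_release_cnt);
	page_pool_free_attempt(pool, hold_cnt, release_cnt);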

>> +
>> +	/* Concurrent CPUs could have returned last pages into ptr_ring */
>> +	page_pool_empty_ring(pool);
>>   
>> -	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
>> -	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
>> +	release_cnt = atomic_inc_return(&pool->pages_state_release_cnt);
>> +	page_pool_free_attempt(pool, hold_cnt, release_cnt);
>>   }
>>   EXPORT_SYMBOL(page_pool_destroy);
Yunsheng Lin May 5, 2023, 12:54 a.m. UTC | #9
On 2023/5/4 21:48, Jesper Dangaard Brouer wrote:
> On 04/05/2023 04.42, Yunsheng Lin wrote:
>> On 2023/4/29 0:16, Jesper Dangaard Brouer wrote:
>>>   void page_pool_release_page(struct page_pool *pool, struct page *page)
>>>   {
>>> +    unsigned int flags = READ_ONCE(pool->p.flags);
>>>       dma_addr_t dma;
>>> -    int count;
>>> +    u32 release_cnt;
>>> +    u32 hold_cnt;
>>>         if (!(pool->p.flags & PP_FLAG_DMA_MAP))
>>>           /* Always account for inflight pages, even if we didn't
>>> @@ -490,11 +503,15 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
>>>   skip_dma_unmap:
>>>       page_pool_clear_pp_info(page);
>>>   -    /* This may be the last page returned, releasing the pool, so
>>> -     * it is not safe to reference pool afterwards.
>>> -     */
>>> -    count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
>>> -    trace_page_pool_state_release(pool, page, count);
>>
>> There is a time window between "unsigned int flags = READ_ONCE(pool->p.flags)"
>> and flags checking, if page_pool_destroy() is called concurrently during that
>> time window, it seems we will have a pp instance leaking problem here?
>>
> 
> Nope, that is resolved by the code changes in page_pool_destroy(), see below.

Maybe I did not describe the data race clearly enough.

         CPU 0                                               CPU1
	   .
	   .
unsigned int flags = READ_ONCE(pool->p.flags);
	   .
	   .                                         page_pool_destroy()
	   .
atomic_inc_return(&pool->pages_state_release_cnt)
           .
	   .
	   .
 if (flags & PP_FLAG_SHUTDOWN)
	page_pool_free_attempt();

The above data race may cause a pp instance leaking problem:
CPU0 is releasing the last page for a pp and it did not see the pool->p.flags
with the PP_FLAG_SHUTDOWN set because page_pool_destroy() is called after
reading pool->p.flags, so page_pool_free_attempt() is not called to free
pp.

CPU1 calling the page_pool_destroy() also did not free pp as CPU0 had not
done the atomic_inc_return() for pool->pages_state_release_cnt yet.

Or did I miss something obvious here?

> 
>> It seems it is very hard to aovid this kind of corner case when using both
>> flags & PP_FLAG_SHUTDOWN and release_cnt/hold_cnt checking to decide if pp
>> instance can be freed.
>> Can we use something like biased reference counting, which used by frag support
>> in page pool? So that we only need to check only one variable and avoid cache
>> bouncing as much as possible.
>>
> 
> See below, I believe we are doing an equivalent refcnt bias trick, that
> solves these corner cases in page_pool_destroy().
> In short: hold_cnt is increased, prior to setting PP_FLAG_SHUTDOWN.
> Thus, if this code READ_ONCE flags without PP_FLAG_SHUTDOWN, we know it
> will not be the last to release pool->pages_state_release_cnt.

It is not exactly the kind of refcnt bias trick I had in mind. I was
thinking about using pool->pages_state_hold_cnt as a refcnt bias and
merging it into pool->pages_state_release_cnt as needed; maybe I need to
try to implement that to see if it turns out to be what I want it to be.

> Below: Perhaps, we should add a RCU grace period to make absolutely
> sure, that this code completes before page_pool_destroy() call completes.
> 
> 
>>> +    if (flags & PP_FLAG_SHUTDOWN)
>>> +        hold_cnt = pp_read_hold_cnt(pool);
>>> +
> 
> I would like to avoid above code, and I'm considering using call_rcu(),
> which I think will resolve the race[0] this code deals with.
> As I explained here[0], this code deals with another kind of race.

Yes, I understand that. I even went to check if the below tracepoint
trace_page_pool_state_release() was causing a use-after-free problem
as it is passing 'pool':)

> 
>  [0] https://lore.kernel.org/all/f671f5da-d9bc-a559-2120-10c3491e6f6d@redhat.com/
> 
>>> +    release_cnt = atomic_inc_return(&pool->pages_state_release_cnt);
>>> +    trace_page_pool_state_release(pool, page, release_cnt);
>>> +
>>> +    /* In shutdown phase, last page will free pool instance */
>>> +    if (flags & PP_FLAG_SHUTDOWN)
>>> +        page_pool_free_attempt(pool, hold_cnt, release_cnt);
>>>   }
>>>   EXPORT_SYMBOL(page_pool_release_page);
>>>
>>
>> ...
>>
>>>     void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
>>> @@ -856,6 +884,10 @@ EXPORT_SYMBOL(page_pool_unlink_napi);
>>>     void page_pool_destroy(struct page_pool *pool)
>>>   {
>>> +    unsigned int flags;
>>> +    u32 release_cnt;
>>> +    u32 hold_cnt;
>>> +
>>>       if (!pool)
>>>           return;
>>>   @@ -868,11 +900,39 @@ void page_pool_destroy(struct page_pool *pool)
>>>       if (!page_pool_release(pool))
>>>           return;
>>>   -    pool->defer_start = jiffies;
>>> -    pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
>>> +    /* PP have pages inflight, thus cannot immediately release memory.
>>> +     * Enter into shutdown phase, depending on remaining in-flight PP
>>> +     * pages to trigger shutdown process (on concurrent CPUs) and last
>>> +     * page will free pool instance.
>>> +     *
>>> +     * There exist two race conditions here, we need to take into
>>> +     * account in the following code.
>>> +     *
>>> +     * 1. Before setting PP_FLAG_SHUTDOWN another CPU released the last
>>> +     *    pages into the ptr_ring.  Thus, it missed triggering shutdown
>>> +     *    process, which can then be stalled forever.
>>> +     *
>>> +     * 2. After setting PP_FLAG_SHUTDOWN another CPU released the last
>>> +     *    page, which triggered shutdown process and freed pool
>>> +     *    instance. Thus, its not safe to dereference *pool afterwards.
>>> +     *
>>> +     * Handling races by holding a fake in-flight count, via
>>> +     * artificially bumping pages_state_hold_cnt, which assures pool
>>> +     * isn't freed under us.  For race(1) its safe to recheck ptr_ring
>>> +     * (it will not free pool). Race(2) cannot happen, and we can
>>> +     * release fake in-flight count as last step.
>>> +     */
>>> +    hold_cnt = READ_ONCE(pool->pages_state_hold_cnt) + 1;
>>> +    smp_store_release(&pool->pages_state_hold_cnt, hold_cnt);
>>
>> I assume the smp_store_release() is used to ensure the correct order
>> between the above store operations?
>> There is data dependency between those two store operations, do we
>> really need the smp_store_release() here?
>>
>>> +    barrier();
>>> +    flags = READ_ONCE(pool->p.flags) | PP_FLAG_SHUTDOWN;
>>
>> Do we need a stronger barrier like smp_rmb() to prevent cpu from
>> executing "flags = READ_ONCE(pool->p.flags) | PP_FLAG_SHUTDOWN"
>> before "smp_store_release(&pool->pages_state_hold_cnt, hold_cnt)"
>> even if there is a smp_store_release() barrier here?
>>
> I do see you point and how it is related to your above comment for
> page_pool_release_page().
> 
> I think we need to replace barrier() with synchronize_rcu().
> Meaning we add a RCU grace period to "wait" for above code (in
> page_pool_release_page) that read the old flags value to complete.
> 
> 
>>> +    smp_store_release(&pool->p.flags, flags);
> 
> When doing a synchronize_rcu(), I assume this smp_store_release() is
> overkill, right?
> Will a WRITE_ONCE() be sufficient?
> 
> Hmm, the synchronize_rcu(), shouldn't that be *after* storing the flags?

Yes.
As I understand it, we probably do not need any of those *_ONCE() and
barriers when using RCU.

But I am not really convinced that we need to go for RCU yet.
Yunsheng Lin May 6, 2023, 1:11 p.m. UTC | #10
On 2023/5/5 8:54, Yunsheng Lin wrote:
> It is not exactly the kind of refcnt bias trick in my mind, I was thinking
> about using pool->pages_state_hold_cnt as refcnt bias and merge it to
> pool->pages_state_release_cnt as needed, maybe I need to try to implement
> that to see if it turn out to be what I want it to be.
> 

I did try implementing the above idea; not sure if there is anything
missing yet, as I have only done compile testing.
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index c8ec2f34722b..bd8dacc2adfd 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -50,6 +50,10 @@
 				 PP_FLAG_DMA_SYNC_DEV |\
 				 PP_FLAG_PAGE_FRAG)
 
+
+/* Internal flag: PP in shutdown phase, waiting for inflight pages */
+#define PP_FLAG_SHUTDOWN	BIT(8)
+
 /*
  * Fast allocation side cache array/stack
  *
@@ -151,11 +155,6 @@ static inline u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
 struct page_pool {
 	struct page_pool_params p;
 
-	struct delayed_work release_dw;
-	void (*disconnect)(void *);
-	unsigned long defer_start;
-	unsigned long defer_warn;
-
 	u32 pages_state_hold_cnt;
 	unsigned int frag_offset;
 	struct page *frag_page;
@@ -165,6 +164,7 @@ struct page_pool {
 	/* these stats are incremented while in softirq context */
 	struct page_pool_alloc_stats alloc_stats;
 #endif
+	void (*disconnect)(void *);
 	u32 xdp_mem_id;
 
 	/*
@@ -206,8 +206,6 @@ struct page_pool {
 	 * refcnt serves purpose is to simplify drivers error handling.
 	 */
 	refcount_t user_cnt;
-
-	u64 destroy_cnt;
 };
 
 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
diff --git a/include/trace/events/page_pool.h b/include/trace/events/page_pool.h
index ca534501158b..200f539f1a70 100644
--- a/include/trace/events/page_pool.h
+++ b/include/trace/events/page_pool.h
@@ -23,7 +23,6 @@ TRACE_EVENT(page_pool_release,
 		__field(s32,	inflight)
 		__field(u32,	hold)
 		__field(u32,	release)
-		__field(u64,	cnt)
 	),
 
 	TP_fast_assign(
@@ -31,12 +30,11 @@ TRACE_EVENT(page_pool_release,
 		__entry->inflight	= inflight;
 		__entry->hold		= hold;
 		__entry->release	= release;
-		__entry->cnt		= pool->destroy_cnt;
 	),
 
-	TP_printk("page_pool=%p inflight=%d hold=%u release=%u cnt=%llu",
+	TP_printk("page_pool=%p inflight=%d hold=%u release=%u",
 		__entry->pool, __entry->inflight, __entry->hold,
-		__entry->release, __entry->cnt)
+		__entry->release)
 );
 
 TRACE_EVENT(page_pool_state_release,
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index e212e9d7edcb..6b3393f0c194 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -190,7 +190,8 @@ static int page_pool_init(struct page_pool *pool,
 	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
 		return -ENOMEM;
 
-	atomic_set(&pool->pages_state_release_cnt, 0);
+	atomic_set(&pool->pages_state_release_cnt, INT_MAX / 2);
+	pool->pages_state_hold_cnt = INT_MAX / 2;
 
 	/* Driver calling page_pool_create() also call page_pool_destroy() */
 	refcount_set(&pool->user_cnt, 1);
@@ -344,6 +345,15 @@ static void page_pool_clear_pp_info(struct page *page)
 	page->pp = NULL;
 }
 
+static void page_pool_hold_cnt_dec(struct page_pool *pool)
+{
+	if (likely(--pool->pages_state_hold_cnt))
+		return;
+
+	pool->pages_state_hold_cnt = INT_MAX / 2;
+	atomic_add(INT_MAX / 2, &pool->pages_state_release_cnt);
+}
+
 static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
 						 gfp_t gfp)
 {
@@ -364,7 +374,8 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
 	page_pool_set_pp_info(pool, page);
 
 	/* Track how many pages are held 'in-flight' */
-	pool->pages_state_hold_cnt++;
+	page_pool_hold_cnt_dec(pool);
+
 	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
 	return page;
 }
@@ -410,7 +421,7 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 		page_pool_set_pp_info(pool, page);
 		pool->alloc.cache[pool->alloc.count++] = page;
 		/* Track how many pages are held 'in-flight' */
-		pool->pages_state_hold_cnt++;
+		page_pool_hold_cnt_dec(pool);
 		trace_page_pool_state_hold(pool, page,
 					   pool->pages_state_hold_cnt);
 	}
@@ -445,24 +456,8 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
 }
 EXPORT_SYMBOL(page_pool_alloc_pages);
 
-/* Calculate distance between two u32 values, valid if distance is below 2^(31)
- *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
- */
-#define _distance(a, b)	(s32)((a) - (b))
-
-static s32 page_pool_inflight(struct page_pool *pool)
-{
-	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
-	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
-	s32 inflight;
-
-	inflight = _distance(hold_cnt, release_cnt);
 
-	trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
-	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
-
-	return inflight;
-}
+static void page_pool_free(struct page_pool *pool);
 
 /* Disconnects a page (from a page_pool).  API users can have a need
  * to disconnect a page (from a page_pool), to allow it to be used as
@@ -493,8 +488,12 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
 	/* This may be the last page returned, releasing the pool, so
 	 * it is not safe to reference pool afterwards.
 	 */
-	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
+	count = atomic_dec_return_relaxed(&pool->pages_state_release_cnt);
+
 	trace_page_pool_state_release(pool, page, count);
+
+	if (unlikely(!count))
+		page_pool_free(pool);
 }
 EXPORT_SYMBOL(page_pool_release_page);
 
@@ -513,11 +512,20 @@ static void page_pool_return_page(struct page_pool *pool, struct page *page)
 static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
 {
 	int ret;
-	/* BH protection not needed if current is softirq */
-	if (in_softirq())
-		ret = ptr_ring_produce(&pool->ring, page);
-	else
-		ret = ptr_ring_produce_bh(&pool->ring, page);
+
+	page_pool_ring_lock(pool);
+
+	/* Do the check under the ring spinlock, to pair with the flag
+	 * setting in page_pool_destroy().
+	 */
+	if (unlikely(pool->p.flags & PP_FLAG_SHUTDOWN)) {
+		page_pool_ring_unlock(pool);
+		return false;
+	}
+
+	ret = __ptr_ring_produce(&pool->ring, page);
+
+	page_pool_ring_unlock(pool);
 
 	if (!ret) {
 		recycle_stat_inc(pool, ring);
@@ -634,9 +642,20 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 	if (unlikely(!bulk_len))
 		return;
 
+	i = 0;
+
 	/* Bulk producer into ptr_ring page_pool cache */
 	page_pool_ring_lock(pool);
-	for (i = 0; i < bulk_len; i++) {
+
+	/* Do the check under the ring spinlock, to pair with the flag
+	 * setting in page_pool_destroy().
+	 */
+	if (unlikely(pool->p.flags & PP_FLAG_SHUTDOWN)) {
+		page_pool_ring_unlock(pool);
+		goto return_page;
+	}
+
+	for (; i < bulk_len; i++) {
 		if (__ptr_ring_produce(&pool->ring, data[i])) {
 			/* ring full */
 			recycle_stat_inc(pool, ring_full);
@@ -650,8 +669,10 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 	if (likely(i == bulk_len))
 		return;
 
-	/* ptr_ring cache full, free remaining pages outside producer lock
-	 * since put_page() with refcnt == 1 can be an expensive operation
+return_page:
+	/* ptr_ring cache full or in shutdown mode, free remaining pages
+	 * outside producer lock since put_page() with refcnt == 1 can
+	 * be an expensive operation
 	 */
 	for (; i < bulk_len; i++)
 		page_pool_return_page(pool, data[i]);
@@ -768,13 +789,10 @@ static void page_pool_free(struct page_pool *pool)
 	kfree(pool);
 }
 
-static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
+static void page_pool_scrub(struct page_pool *pool)
 {
 	struct page *page;
 
-	if (pool->destroy_cnt)
-		return;
-
 	/* Empty alloc cache, assume caller made sure this is
 	 * no-longer in use, and page_pool_alloc_pages() cannot be
 	 * call concurrently.
@@ -785,52 +803,6 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
 	}
 }
 
-static void page_pool_scrub(struct page_pool *pool)
-{
-	page_pool_empty_alloc_cache_once(pool);
-	pool->destroy_cnt++;
-
-	/* No more consumers should exist, but producers could still
-	 * be in-flight.
-	 */
-	page_pool_empty_ring(pool);
-}
-
-static int page_pool_release(struct page_pool *pool)
-{
-	int inflight;
-
-	page_pool_scrub(pool);
-	inflight = page_pool_inflight(pool);
-	if (!inflight)
-		page_pool_free(pool);
-
-	return inflight;
-}
-
-static void page_pool_release_retry(struct work_struct *wq)
-{
-	struct delayed_work *dwq = to_delayed_work(wq);
-	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
-	int inflight;
-
-	inflight = page_pool_release(pool);
-	if (!inflight)
-		return;
-
-	/* Periodic warning */
-	if (time_after_eq(jiffies, pool->defer_warn)) {
-		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
-
-		pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
-			__func__, inflight, sec);
-		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
-	}
-
-	/* Still not ready to be disconnected, retry later */
-	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
-}
-
 void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
 			   struct xdp_mem_info *mem)
 {
@@ -864,15 +836,25 @@ void page_pool_destroy(struct page_pool *pool)
 
 	page_pool_unlink_napi(pool);
 	page_pool_free_frag(pool);
+	page_pool_scrub(pool);
 
-	if (!page_pool_release(pool))
-		return;
+	/* When page_pool_destroy() is called, it is assumed that no
+	 * page will be recycled into pool->alloc cache; we only need
+	 * to make sure no page is recycled into pool->ring, by setting
+	 * PP_FLAG_SHUTDOWN under the ring lock to pair with the flag
+	 * check done under the same lock.
+	 */
+	page_pool_ring_lock(pool);
+	pool->p.flags |= PP_FLAG_SHUTDOWN;
+	page_pool_ring_unlock(pool);
 
-	pool->defer_start = jiffies;
-	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
+	page_pool_empty_ring(pool);
+
+	if (atomic_sub_return_relaxed(pool->pages_state_hold_cnt,
+				      &pool->pages_state_release_cnt))
+		return;
 
-	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
-	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
+	page_pool_free(pool);
 }
 EXPORT_SYMBOL(page_pool_destroy);
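
For readers following the counting trick in the diff above: both counters start at INT_MAX / 2, allocation counts the hold budget down (refilling it and crediting the release counter when it hits zero), every returned page decrements the release counter, and destroy subtracts the unused budget, so the release counter reaches zero exactly when the last in-flight page comes back. Below is a minimal, single-threaded user-space sketch of that arithmetic; the toy_* names and the lack of atomics/ordering are illustrative simplifications, not the kernel code.

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

static int hold_budget = INT_MAX / 2;	/* models pages_state_hold_cnt    */
static int release_cnt = INT_MAX / 2;	/* models pages_state_release_cnt */
static bool pool_freed;

static void toy_alloc_page(void)
{
	if (--hold_budget == 0) {
		/* Budget exhausted: refill it and credit the release side,
		 * keeping the pair balanced (cf. page_pool_hold_cnt_dec). */
		hold_budget = INT_MAX / 2;
		release_cnt += INT_MAX / 2;
	}
}

static void toy_release_page(void)
{
	if (--release_cnt == 0)
		pool_freed = true;	/* last in-flight page frees the pool */
}

static void toy_destroy(void)
{
	/* Hand back the unused hold budget, so only genuinely in-flight
	 * pages keep the pool alive (cf. the atomic_sub at destroy time). */
	release_cnt -= hold_budget;
	if (release_cnt == 0)
		pool_freed = true;
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		toy_alloc_page();	/* three pages handed out      */
	toy_release_page();		/* one returned before destroy */
	toy_destroy();			/* two still in flight         */
	toy_release_page();
	toy_release_page();		/* last return frees the pool  */
	printf("pool_freed=%d release_cnt=%d\n", pool_freed, release_cnt);
	return 0;
}
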

Patch

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index c8ec2f34722b..a71c0f2695b0 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -50,6 +50,9 @@ 
 				 PP_FLAG_DMA_SYNC_DEV |\
 				 PP_FLAG_PAGE_FRAG)
 
+/* Internal flag: PP in shutdown phase, waiting for inflight pages */
+#define PP_FLAG_SHUTDOWN	BIT(8)
+
 /*
  * Fast allocation side cache array/stack
  *
@@ -151,11 +154,6 @@  static inline u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
 struct page_pool {
 	struct page_pool_params p;
 
-	struct delayed_work release_dw;
-	void (*disconnect)(void *);
-	unsigned long defer_start;
-	unsigned long defer_warn;
-
 	u32 pages_state_hold_cnt;
 	unsigned int frag_offset;
 	struct page *frag_page;
@@ -165,6 +163,7 @@  struct page_pool {
 	/* these stats are incremented while in softirq context */
 	struct page_pool_alloc_stats alloc_stats;
 #endif
+	void (*disconnect)(void *);
 	u32 xdp_mem_id;
 
 	/*
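
Worth noting about the header change above: PP_FLAG_SHUTDOWN lives at BIT(8), well above the driver-settable bits that make up PP_FLAG_ALL, so the flag stays internal, assuming creation-time validation rejects flag bits outside PP_FLAG_ALL (as upstream page_pool_init() does). A hedged sketch of the bit layout; the toy_validate_flags() helper is illustrative only and merely mirrors that kind of check:

#include <stdio.h>

#define BIT(nr)			(1U << (nr))

/* Driver-settable flags, values as in include/net/page_pool.h */
#define PP_FLAG_DMA_MAP		BIT(0)
#define PP_FLAG_DMA_SYNC_DEV	BIT(1)
#define PP_FLAG_PAGE_FRAG	BIT(2)
#define PP_FLAG_ALL		(PP_FLAG_DMA_MAP | \
				 PP_FLAG_DMA_SYNC_DEV | \
				 PP_FLAG_PAGE_FRAG)

/* Internal flag added by this patch; deliberately not part of PP_FLAG_ALL */
#define PP_FLAG_SHUTDOWN	BIT(8)

static int toy_validate_flags(unsigned int flags)
{
	if (flags & ~PP_FLAG_ALL)
		return -22;	/* -EINVAL: unknown/internal bit requested */
	return 0;
}

int main(void)
{
	printf("driver flags accepted: %d\n", toy_validate_flags(PP_FLAG_DMA_MAP));
	printf("shutdown bit rejected: %d\n", toy_validate_flags(PP_FLAG_SHUTDOWN));
	return 0;
}
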
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index e212e9d7edcb..54bdd140b7bd 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -23,9 +23,6 @@ 
 
 #include <trace/events/page_pool.h>
 
-#define DEFER_TIME (msecs_to_jiffies(1000))
-#define DEFER_WARN_INTERVAL (60 * HZ)
-
 #define BIAS_MAX	LONG_MAX
 
 #ifdef CONFIG_PAGE_POOL_STATS
@@ -380,6 +377,10 @@  static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 	struct page *page;
 	int i, nr_pages;
 
+	/* API usage BUG: PP in shutdown phase, cannot alloc new pages */
+	if (WARN_ON(pool->p.flags & PP_FLAG_SHUTDOWN))
+		return NULL;
+
 	/* Don't support bulk alloc for high-order pages */
 	if (unlikely(pp_order))
 		return __page_pool_alloc_page_order(pool, gfp);
@@ -450,10 +451,9 @@  EXPORT_SYMBOL(page_pool_alloc_pages);
  */
 #define _distance(a, b)	(s32)((a) - (b))
 
-static s32 page_pool_inflight(struct page_pool *pool)
+static s32 __page_pool_inflight(struct page_pool *pool,
+				u32 hold_cnt, u32 release_cnt)
 {
-	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
-	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
 	s32 inflight;
 
 	inflight = _distance(hold_cnt, release_cnt);
@@ -464,6 +464,17 @@  static s32 page_pool_inflight(struct page_pool *pool)
 	return inflight;
 }
 
+static s32 page_pool_inflight(struct page_pool *pool)
+{
+	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
+	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
+	return __page_pool_inflight(pool, hold_cnt, release_cnt);
+}
+
+static int page_pool_free_attempt(struct page_pool *pool,
+				  u32 hold_cnt, u32 release_cnt);
+static u32 pp_read_hold_cnt(struct page_pool *pool);
+
 /* Disconnects a page (from a page_pool).  API users can have a need
  * to disconnect a page (from a page_pool), to allow it to be used as
  * a regular page (that will eventually be returned to the normal
@@ -471,8 +482,10 @@  static s32 page_pool_inflight(struct page_pool *pool)
  */
 void page_pool_release_page(struct page_pool *pool, struct page *page)
 {
+	unsigned int flags = READ_ONCE(pool->p.flags);
 	dma_addr_t dma;
-	int count;
+	u32 release_cnt;
+	u32 hold_cnt;
 
 	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
 		/* Always account for inflight pages, even if we didn't
@@ -490,11 +503,15 @@  void page_pool_release_page(struct page_pool *pool, struct page *page)
 skip_dma_unmap:
 	page_pool_clear_pp_info(page);
 
-	/* This may be the last page returned, releasing the pool, so
-	 * it is not safe to reference pool afterwards.
-	 */
-	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
-	trace_page_pool_state_release(pool, page, count);
+	if (flags & PP_FLAG_SHUTDOWN)
+		hold_cnt = pp_read_hold_cnt(pool);
+
+	release_cnt = atomic_inc_return(&pool->pages_state_release_cnt);
+	trace_page_pool_state_release(pool, page, release_cnt);
+
+	/* In shutdown phase, last page will free pool instance */
+	if (flags & PP_FLAG_SHUTDOWN)
+		page_pool_free_attempt(pool, hold_cnt, release_cnt);
 }
 EXPORT_SYMBOL(page_pool_release_page);
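
The inflight accounting kept above (and used by page_pool_free_attempt() during shutdown) relies on serial-number arithmetic: _distance() casts the u32 difference to s32, which stays correct across counter wrap-around as long as fewer than 2^31 pages are in flight. A small stand-alone demonstration of why the cast works (not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* Same idea as the _distance() macro in net/core/page_pool.c */
#define _distance(a, b)	((int32_t)((a) - (b)))

int main(void)
{
	/* Plain case: 100 pages held, 90 released -> 10 in flight */
	uint32_t hold = 100, release = 90;
	printf("inflight = %d\n", (int)_distance(hold, release));

	/* Wrap-around case: the hold counter has wrapped past zero while
	 * the release counter has not; the signed cast still yields 10. */
	hold = 5;
	release = UINT32_MAX - 4;
	printf("inflight = %d\n", (int)_distance(hold, release));
	return 0;
}
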
 
@@ -535,7 +552,7 @@  static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
 static bool page_pool_recycle_in_cache(struct page *page,
 				       struct page_pool *pool)
 {
-	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
+	if (pool->alloc.count == PP_ALLOC_CACHE_SIZE) {
 		recycle_stat_inc(pool, cache_full);
 		return false;
 	}
@@ -546,6 +563,8 @@  static bool page_pool_recycle_in_cache(struct page *page,
 	return true;
 }
 
+static void page_pool_empty_ring(struct page_pool *pool);
+
 /* If the page refcnt == 1, this will try to recycle the page.
  * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
  * the configured size min(dma_sync_size, pool->max_len).
@@ -572,7 +591,8 @@  __page_pool_put_page(struct page_pool *pool, struct page *page,
 			page_pool_dma_sync_for_device(pool, page,
 						      dma_sync_size);
 
-		if (allow_direct && in_softirq() &&
+		/* During PP shutdown, direct recycling must not occur */
+		if (likely(allow_direct && in_softirq()) &&
 		    page_pool_recycle_in_cache(page, pool))
 			return NULL;
 
@@ -609,6 +629,8 @@  void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
 		recycle_stat_inc(pool, ring_full);
 		page_pool_return_page(pool, page);
 	}
+	if (page && pool->p.flags & PP_FLAG_SHUTDOWN)
+		page_pool_empty_ring(pool);
 }
 EXPORT_SYMBOL(page_pool_put_defragged_page);
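
Several return paths in this patch end with the same pattern, most explicitly the bulk path in the next hunk: after handing pages toward the ptr_ring, re-check PP_FLAG_SHUTDOWN and, if it is set, drain the ring. That way a destroy racing with producers cannot leave pages (and the release counts they carry) stranded in the ring. A hedged, single-threaded sketch of that ordering, loosely modelled on the bulk-return path; the toy_* names, the integer "ring" and the absence of locking are simplifications:

#include <stdbool.h>
#include <stdio.h>

static bool shutdown_flag;	/* stands in for PP_FLAG_SHUTDOWN     */
static int  ring_pages;		/* pages currently in the ptr_ring    */
static int  released_pages;	/* pages handed back to the allocator */

static void toy_empty_ring(void)
{
	while (ring_pages > 0) {
		ring_pages--;
		released_pages++;	/* page_pool_return_page()-ish */
	}
}

/* Producer side: recycle into the ring first, then re-check the flag. */
static void toy_recycle_page(void)
{
	ring_pages++;
	if (shutdown_flag)
		toy_empty_ring();	/* don't strand pages after destroy */
}

/* Destroy side: publish the flag, then drain whatever is already queued. */
static void toy_destroy(void)
{
	shutdown_flag = true;
	toy_empty_ring();
}

int main(void)
{
	toy_recycle_page();	/* queued before destroy: drained by destroy */
	toy_destroy();
	toy_recycle_page();	/* queued after destroy: drained by producer */
	printf("ring=%d released=%d\n", ring_pages, released_pages);
	return 0;
}
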
 
@@ -646,6 +668,9 @@  void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 	recycle_stat_add(pool, ring, i);
 	page_pool_ring_unlock(pool);
 
+	if (pool->p.flags & PP_FLAG_SHUTDOWN)
+		page_pool_empty_ring(pool);
+
 	/* Hopefully all pages was return into ptr_ring */
 	if (likely(i == bulk_len))
 		return;
@@ -737,12 +762,18 @@  struct page *page_pool_alloc_frag(struct page_pool *pool,
 }
 EXPORT_SYMBOL(page_pool_alloc_frag);
 
+noinline
 static void page_pool_empty_ring(struct page_pool *pool)
 {
-	struct page *page;
+	struct page *page, *next;
+
+	next = ptr_ring_consume_bh(&pool->ring);
 
 	/* Empty recycle ring */
-	while ((page = ptr_ring_consume_bh(&pool->ring))) {
+	while (next) {
+		page = next;
+		next = ptr_ring_consume_bh(&pool->ring);
+
 		/* Verify the refcnt invariant of cached pages */
 		if (!(page_ref_count(page) == 1))
 			pr_crit("%s() page_pool refcnt %d violation\n",
@@ -796,39 +827,36 @@  static void page_pool_scrub(struct page_pool *pool)
 	page_pool_empty_ring(pool);
 }
 
-static int page_pool_release(struct page_pool *pool)
+/* Avoid inlining code to avoid speculative fetching cacheline */
+noinline
+static u32 pp_read_hold_cnt(struct page_pool *pool)
+{
+	return READ_ONCE(pool->pages_state_hold_cnt);
+}
+
+noinline
+static int page_pool_free_attempt(struct page_pool *pool,
+				  u32 hold_cnt, u32 release_cnt)
 {
 	int inflight;
 
-	page_pool_scrub(pool);
-	inflight = page_pool_inflight(pool);
+	inflight = __page_pool_inflight(pool, hold_cnt, release_cnt);
 	if (!inflight)
 		page_pool_free(pool);
 
 	return inflight;
 }
 
-static void page_pool_release_retry(struct work_struct *wq)
+static int page_pool_release(struct page_pool *pool)
 {
-	struct delayed_work *dwq = to_delayed_work(wq);
-	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
 	int inflight;
 
-	inflight = page_pool_release(pool);
+	page_pool_scrub(pool);
+	inflight = page_pool_inflight(pool);
 	if (!inflight)
-		return;
-
-	/* Periodic warning */
-	if (time_after_eq(jiffies, pool->defer_warn)) {
-		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
-
-		pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
-			__func__, inflight, sec);
-		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
-	}
+		page_pool_free(pool);
 
-	/* Still not ready to be disconnected, retry later */
-	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
+	return inflight;
 }
 
 void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
@@ -856,6 +884,10 @@  EXPORT_SYMBOL(page_pool_unlink_napi);
 
 void page_pool_destroy(struct page_pool *pool)
 {
+	unsigned int flags;
+	u32 release_cnt;
+	u32 hold_cnt;
+
 	if (!pool)
 		return;
 
@@ -868,11 +900,39 @@  void page_pool_destroy(struct page_pool *pool)
 	if (!page_pool_release(pool))
 		return;
 
-	pool->defer_start = jiffies;
-	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
+	/* PP has pages inflight, thus it cannot release memory immediately.
+	 * Enter the shutdown phase, depending on the remaining in-flight PP
+	 * pages to drive the shutdown process (on concurrent CPUs); the last
+	 * page returned will free the pool instance.
+	 *
+	 * Two race conditions exist here, which the following code needs
+	 * to take into account.
+	 *
+	 * 1. Before setting PP_FLAG_SHUTDOWN another CPU released the last
+	 *    pages into the ptr_ring.  Thus, it missed triggering the
+	 *    shutdown process, which could then stall forever.
+	 *
+	 * 2. After setting PP_FLAG_SHUTDOWN another CPU released the last
+	 *    page, which triggered the shutdown process and freed the pool
+	 *    instance. Thus, it is not safe to dereference *pool afterwards.
+	 *
+	 * Handle the races by holding a fake in-flight count, via
+	 * artificially bumping pages_state_hold_cnt, which assures the pool
+	 * isn't freed under us.  For race (1) it is safe to recheck the
+	 * ptr_ring (it will not free the pool). Race (2) cannot happen, so
+	 * the fake in-flight count can be released as the last step.
+	 */
+	hold_cnt = READ_ONCE(pool->pages_state_hold_cnt) + 1;
+	smp_store_release(&pool->pages_state_hold_cnt, hold_cnt);
+	barrier();
+	flags = READ_ONCE(pool->p.flags) | PP_FLAG_SHUTDOWN;
+	smp_store_release(&pool->p.flags, flags);
+
+	/* Concurrent CPUs could have returned last pages into ptr_ring */
+	page_pool_empty_ring(pool);
 
-	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
-	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
+	release_cnt = atomic_inc_return(&pool->pages_state_release_cnt);
+	page_pool_free_attempt(pool, hold_cnt, release_cnt);
 }
 EXPORT_SYMBOL(page_pool_destroy);
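
To summarise the shutdown hand-over encoded in page_pool_destroy() above: a fake in-flight reference is taken (hold_cnt + 1) before PP_FLAG_SHUTDOWN is published, the ring is drained once more, and only then is the fake reference dropped through the ordinary release counter, so the very last page return, wherever it happens, performs the final free. Below is a hedged, single-threaded sketch of that sequencing; the toy_* names are illustrative, and the real code depends on atomics and smp_store_release() ordering that a toy model cannot capture:

#include <stdbool.h>
#include <stdio.h>

static unsigned int hold_cnt;		/* pages_state_hold_cnt    */
static unsigned int release_cnt;	/* pages_state_release_cnt */
static bool shutdown_flag;		/* PP_FLAG_SHUTDOWN        */
static bool pool_freed;

static int toy_inflight(void)
{
	return (int)(hold_cnt - release_cnt);	/* _distance() */
}

static void toy_free_attempt(void)
{
	if (toy_inflight() == 0)
		pool_freed = true;	/* page_pool_free() */
}

static void toy_alloc_page(void)
{
	hold_cnt++;
}

static void toy_release_page(void)
{
	release_cnt++;
	if (shutdown_flag)
		toy_free_attempt();	/* in shutdown, last page frees pool */
}

static void toy_destroy(void)
{
	hold_cnt++;			/* fake in-flight count          */
	shutdown_flag = true;		/* publish the shutdown phase    */
	/* ... the real code drains the ptr_ring here ... */
	release_cnt++;			/* drop the fake in-flight count */
	toy_free_attempt();
}

int main(void)
{
	toy_alloc_page();
	toy_alloc_page();	/* two pages handed to a driver           */
	toy_destroy();		/* pool must survive: two still in flight */
	toy_release_page();
	toy_release_page();	/* final return frees the pool            */
	printf("pool_freed=%d inflight=%d\n", pool_freed, toy_inflight());
	return 0;
}
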