diff mbox series

[net-next,1/2] page_pool: check for PP direct cache locality later

Message ID 20240329165507.3240110-2-aleksander.lobakin@intel.com (mailing list archive)
State Accepted
Commit 4a96a4e807c390a9d91b450ebe04eeb2e0ecc076
Delegated to: Netdev Maintainers
Headers show
Series page_pool: allow direct bulk recycling | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 5798 this patch: 5798
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 5 maintainers not CCed: herbert@gondor.apana.org.au ilias.apalodimas@linaro.org dsahern@kernel.org hawk@kernel.org steffen.klassert@secunet.com
netdev/build_clang success Errors and warnings before: 1076 this patch: 1076
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 6082 this patch: 6082
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 293 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-03-31--03-00 (tests: 953)

Commit Message

Alexander Lobakin March 29, 2024, 4:55 p.m. UTC
Since we have pool->p.napi (Jakub) and pool->cpuid (Lorenzo) to check
whether it's safe to use direct recycling, we can use both globally for
each page instead of relying solely on @allow_direct argument.
Let's assume that @allow_direct means "I'm sure it's local, don't waste
time rechecking this" and when it's false, try the mentioned params to
still recycle the page directly. If neither is true, we'll lose some
CPU cycles, but then it surely won't be hotpath. On the other hand,
paths where it's possible to use direct cache, but not possible to
safely set @allow_direct, will benefit from this move.
The whole propagation of @napi_safe through a dozen of skb freeing
functions can now go away, which saves us some stack space.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
---
 include/linux/skbuff.h | 12 ++++----
 net/core/page_pool.c   | 31 +++++++++++++++++--
 net/core/skbuff.c      | 70 +++++++++++++-----------------------------
 net/ipv4/esp4.c        |  2 +-
 net/ipv6/esp6.c        |  2 +-
 5 files changed, 58 insertions(+), 59 deletions(-)

Comments

Mina Almasry March 29, 2024, 7:21 p.m. UTC | #1
On Fri, Mar 29, 2024 at 9:56 AM Alexander Lobakin
<aleksander.lobakin@intel.com> wrote:
>
> Since we have pool->p.napi (Jakub) and pool->cpuid (Lorenzo) to check
> whether it's safe to use direct recycling, we can use both globally for
> each page instead of relying solely on @allow_direct argument.
> Let's assume that @allow_direct means "I'm sure it's local, don't waste
> time rechecking this" and when it's false, try the mentioned params to
> still recycle the page directly. If neither is true, we'll lose some
> CPU cycles, but then it surely won't be hotpath. On the other hand,
> paths where it's possible to use direct cache, but not possible to
> safely set @allow_direct, will benefit from this move.
> The whole propagation of @napi_safe through a dozen of skb freeing
> functions can now go away, which saves us some stack space.
>
> Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
> ---
>  include/linux/skbuff.h | 12 ++++----
>  net/core/page_pool.c   | 31 +++++++++++++++++--
>  net/core/skbuff.c      | 70 +++++++++++++-----------------------------
>  net/ipv4/esp4.c        |  2 +-
>  net/ipv6/esp6.c        |  2 +-
>  5 files changed, 58 insertions(+), 59 deletions(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index dadd3f55d549..f7f6e42c6814 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -3515,25 +3515,25 @@ int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
>                     unsigned int headroom);
>  int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
>                          struct bpf_prog *prog);
> -bool napi_pp_put_page(struct page *page, bool napi_safe);
> +bool napi_pp_put_page(struct page *page);
>
>  static inline void
> -skb_page_unref(const struct sk_buff *skb, struct page *page, bool napi_safe)
> +skb_page_unref(const struct sk_buff *skb, struct page *page)
>  {
>  #ifdef CONFIG_PAGE_POOL
> -       if (skb->pp_recycle && napi_pp_put_page(page, napi_safe))
> +       if (skb->pp_recycle && napi_pp_put_page(page))
>                 return;
>  #endif
>         put_page(page);
>  }
>
>  static inline void
> -napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
> +napi_frag_unref(skb_frag_t *frag, bool recycle)
>  {
>         struct page *page = skb_frag_page(frag);
>
>  #ifdef CONFIG_PAGE_POOL
> -       if (recycle && napi_pp_put_page(page, napi_safe))
> +       if (recycle && napi_pp_put_page(page))
>                 return;
>  #endif
>         put_page(page);
> @@ -3549,7 +3549,7 @@ napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
>   */
>  static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
>  {
> -       napi_frag_unref(frag, recycle, false);
> +       napi_frag_unref(frag, recycle);
>  }
>
>  /**
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index dd364d738c00..9d56257e444b 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -690,8 +690,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
>                         page_pool_dma_sync_for_device(pool, page,
>                                                       dma_sync_size);
>
> -               if (allow_direct && in_softirq() &&
> -                   page_pool_recycle_in_cache(page, pool))
> +               if (allow_direct && page_pool_recycle_in_cache(page, pool))
>                         return NULL;
>
>                 /* Page found as candidate for recycling */
> @@ -716,9 +715,35 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
>         return NULL;
>  }
>
> +static bool page_pool_napi_local(const struct page_pool *pool)
> +{
> +       const struct napi_struct *napi;
> +       u32 cpuid;
> +
> +       if (unlikely(!in_softirq()))
> +               return false;
> +
> +       /* Allow direct recycle if we have reasons to believe that we are
> +        * in the same context as the consumer would run, so there's
> +        * no possible race.
> +        * __page_pool_put_page() makes sure we're not in hardirq context
> +        * and interrupts are enabled prior to accessing the cache.
> +        */
> +       cpuid = smp_processor_id();
> +       if (READ_ONCE(pool->cpuid) == cpuid)
> +               return true;
> +
> +       napi = READ_ONCE(pool->p.napi);
> +
> +       return napi && READ_ONCE(napi->list_owner) == cpuid;
> +}
> +
>  void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
>                                 unsigned int dma_sync_size, bool allow_direct)
>  {
> +       if (!allow_direct)
> +               allow_direct = page_pool_napi_local(pool);
> +
>         page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
>         if (page && !page_pool_recycle_in_ring(pool, page)) {
>                 /* Cache full, fallback to free pages */
> @@ -969,7 +994,7 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
>  static void page_pool_disable_direct_recycling(struct page_pool *pool)
>  {
>         /* Disable direct recycling based on pool->cpuid.
> -        * Paired with READ_ONCE() in napi_pp_put_page().
> +        * Paired with READ_ONCE() in page_pool_napi_local().
>          */
>         WRITE_ONCE(pool->cpuid, -1);
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index e0e172638c0a..e01e2a618621 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -1005,11 +1005,8 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
>  EXPORT_SYMBOL(skb_cow_data_for_xdp);
>
>  #if IS_ENABLED(CONFIG_PAGE_POOL)
> -bool napi_pp_put_page(struct page *page, bool napi_safe)
> +bool napi_pp_put_page(struct page *page)
>  {
> -       bool allow_direct = false;
> -       struct page_pool *pp;
> -
>         page = compound_head(page);
>
>         /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
> @@ -1022,39 +1019,18 @@ bool napi_pp_put_page(struct page *page, bool napi_safe)
>         if (unlikely(!is_pp_page(page)))
>                 return false;
>
> -       pp = page->pp;
> -
> -       /* Allow direct recycle if we have reasons to believe that we are
> -        * in the same context as the consumer would run, so there's
> -        * no possible race.
> -        * __page_pool_put_page() makes sure we're not in hardirq context
> -        * and interrupts are enabled prior to accessing the cache.
> -        */
> -       if (napi_safe || in_softirq()) {
> -               const struct napi_struct *napi = READ_ONCE(pp->p.napi);
> -               unsigned int cpuid = smp_processor_id();
> -
> -               allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid;
> -               allow_direct |= READ_ONCE(pp->cpuid) == cpuid;
> -       }
> -
> -       /* Driver set this to memory recycling info. Reset it on recycle.
> -        * This will *not* work for NIC using a split-page memory model.
> -        * The page will be returned to the pool here regardless of the
> -        * 'flipped' fragment being in use or not.
> -        */
> -       page_pool_put_full_page(pp, page, allow_direct);
> +       page_pool_put_full_page(page->pp, page, false);
>
>         return true;
>  }
>  EXPORT_SYMBOL(napi_pp_put_page);
>  #endif
>
> -static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
> +static bool skb_pp_recycle(struct sk_buff *skb, void *data)
>  {
>         if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
>                 return false;
> -       return napi_pp_put_page(virt_to_page(data), napi_safe);
> +       return napi_pp_put_page(virt_to_page(data));
>  }
>
>  /**
> @@ -1096,12 +1072,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset)
>                 kfree(head);
>  }
>
> -static void skb_free_head(struct sk_buff *skb, bool napi_safe)
> +static void skb_free_head(struct sk_buff *skb)
>  {
>         unsigned char *head = skb->head;
>
>         if (skb->head_frag) {
> -               if (skb_pp_recycle(skb, head, napi_safe))
> +               if (skb_pp_recycle(skb, head))
>                         return;
>                 skb_free_frag(head);
>         } else {
> @@ -1109,8 +1085,7 @@ static void skb_free_head(struct sk_buff *skb, bool napi_safe)
>         }
>  }
>
> -static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
> -                            bool napi_safe)
> +static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
>  {
>         struct skb_shared_info *shinfo = skb_shinfo(skb);
>         int i;
> @@ -1127,13 +1102,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,

skb_release_data has a napi_safe argument. Do you want to remove that
as well? To my eye it looks like dead code now that the value is not
passed to napi_frag_unref and skb_free_head.

After this change, __napi_kfree_skb() which previously set napi_safe
to true, now isn't able to. Is my understanding correct that this
should still work fine because we now check pool->p.napi in the
page_pool code?

>         }
>
>         for (i = 0; i < shinfo->nr_frags; i++)
> -               napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);
> +               napi_frag_unref(&shinfo->frags[i], skb->pp_recycle);
>
>  free_head:
>         if (shinfo->frag_list)
>                 kfree_skb_list_reason(shinfo->frag_list, reason);
>
> -       skb_free_head(skb, napi_safe);
> +       skb_free_head(skb);
>  exit:
>         /* When we clone an SKB we copy the reycling bit. The pp_recycle
>          * bit is only set on the head though, so in order to avoid races
> @@ -1194,12 +1169,11 @@ void skb_release_head_state(struct sk_buff *skb)
>  }
>
>  /* Free everything but the sk_buff shell. */
> -static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
> -                           bool napi_safe)
> +static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
>  {
>         skb_release_head_state(skb);
>         if (likely(skb->head))
> -               skb_release_data(skb, reason, napi_safe);
> +               skb_release_data(skb, reason);
>  }
>
>  /**
> @@ -1213,7 +1187,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
>
>  void __kfree_skb(struct sk_buff *skb)
>  {
> -       skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
> +       skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
>         kfree_skbmem(skb);
>  }
>  EXPORT_SYMBOL(__kfree_skb);
> @@ -1270,7 +1244,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb,
>                 return;
>         }
>
> -       skb_release_all(skb, reason, false);
> +       skb_release_all(skb, reason);
>         sa->skb_array[sa->skb_count++] = skb;
>
>         if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
> @@ -1444,7 +1418,7 @@ EXPORT_SYMBOL(consume_skb);
>  void __consume_stateless_skb(struct sk_buff *skb)
>  {
>         trace_consume_skb(skb, __builtin_return_address(0));
> -       skb_release_data(skb, SKB_CONSUMED, false);
> +       skb_release_data(skb, SKB_CONSUMED);
>         kfree_skbmem(skb);
>  }
>
> @@ -1471,7 +1445,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)
>
>  void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
>  {
> -       skb_release_all(skb, reason, true);
> +       skb_release_all(skb, reason);
>         napi_skb_cache_put(skb);
>  }
>
> @@ -1509,7 +1483,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
>                 return;
>         }
>
> -       skb_release_all(skb, SKB_CONSUMED, !!budget);
> +       skb_release_all(skb, SKB_CONSUMED);
>         napi_skb_cache_put(skb);
>  }
>  EXPORT_SYMBOL(napi_consume_skb);
> @@ -1640,7 +1614,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
>   */
>  struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
>  {
> -       skb_release_all(dst, SKB_CONSUMED, false);
> +       skb_release_all(dst, SKB_CONSUMED);
>         return __skb_clone(dst, src);
>  }
>  EXPORT_SYMBOL_GPL(skb_morph);
> @@ -2272,9 +2246,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
>                 if (skb_has_frag_list(skb))
>                         skb_clone_fraglist(skb);
>
> -               skb_release_data(skb, SKB_CONSUMED, false);
> +               skb_release_data(skb, SKB_CONSUMED);
>         } else {
> -               skb_free_head(skb, false);
> +               skb_free_head(skb);
>         }
>         off = (data + nhead) - skb->head;
>
> @@ -6575,12 +6549,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
>                         skb_frag_ref(skb, i);
>                 if (skb_has_frag_list(skb))
>                         skb_clone_fraglist(skb);
> -               skb_release_data(skb, SKB_CONSUMED, false);
> +               skb_release_data(skb, SKB_CONSUMED);
>         } else {
>                 /* we can reuse existing recount- all we did was
>                  * relocate values
>                  */
> -               skb_free_head(skb, false);
> +               skb_free_head(skb);
>         }
>
>         skb->head = data;
> @@ -6715,7 +6689,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
>                 skb_kfree_head(data, size);
>                 return -ENOMEM;
>         }
> -       skb_release_data(skb, SKB_CONSUMED, false);
> +       skb_release_data(skb, SKB_CONSUMED);
>
>         skb->head = data;
>         skb->head_frag = 0;
> diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
> index d33d12421814..3d647c9a7a21 100644
> --- a/net/ipv4/esp4.c
> +++ b/net/ipv4/esp4.c
> @@ -114,7 +114,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
>          */
>         if (req->src != req->dst)
>                 for (sg = sg_next(req->src); sg; sg = sg_next(sg))
> -                       skb_page_unref(skb, sg_page(sg), false);
> +                       skb_page_unref(skb, sg_page(sg));
>  }
>
>  #ifdef CONFIG_INET_ESPINTCP
> diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
> index 7371886d4f9f..fe8d53f5a5ee 100644
> --- a/net/ipv6/esp6.c
> +++ b/net/ipv6/esp6.c
> @@ -131,7 +131,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
>          */
>         if (req->src != req->dst)
>                 for (sg = sg_next(req->src); sg; sg = sg_next(sg))
> -                       skb_page_unref(skb, sg_page(sg), false);
> +                       skb_page_unref(skb, sg_page(sg));
>  }
>
>  #ifdef CONFIG_INET6_ESPINTCP
> --
> 2.44.0
>
>
Yunsheng Lin March 30, 2024, 12:41 p.m. UTC | #2
On 2024/3/30 0:55, Alexander Lobakin wrote:
> Since we have pool->p.napi (Jakub) and pool->cpuid (Lorenzo) to check
> whether it's safe to use direct recycling, we can use both globally for
> each page instead of relying solely on @allow_direct argument.
> Let's assume that @allow_direct means "I'm sure it's local, don't waste
> time rechecking this" and when it's false, try the mentioned params to
> still recycle the page directly. If neither is true, we'll lose some
> CPU cycles, but then it surely won't be hotpath. On the other hand,
> paths where it's possible to use direct cache, but not possible to
> safely set @allow_direct, will benefit from this move.
> The whole propagation of @napi_safe through a dozen of skb freeing
> functions can now go away, which saves us some stack space.
> 
> Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
> ---
>  include/linux/skbuff.h | 12 ++++----
>  net/core/page_pool.c   | 31 +++++++++++++++++--
>  net/core/skbuff.c      | 70 +++++++++++++-----------------------------
>  net/ipv4/esp4.c        |  2 +-
>  net/ipv6/esp6.c        |  2 +-
>  5 files changed, 58 insertions(+), 59 deletions(-)
> 
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index dadd3f55d549..f7f6e42c6814 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -3515,25 +3515,25 @@ int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
>  		    unsigned int headroom);
>  int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
>  			 struct bpf_prog *prog);
> -bool napi_pp_put_page(struct page *page, bool napi_safe);
> +bool napi_pp_put_page(struct page *page);
>  
>  static inline void
> -skb_page_unref(const struct sk_buff *skb, struct page *page, bool napi_safe)
> +skb_page_unref(const struct sk_buff *skb, struct page *page)
>  {
>  #ifdef CONFIG_PAGE_POOL
> -	if (skb->pp_recycle && napi_pp_put_page(page, napi_safe))
> +	if (skb->pp_recycle && napi_pp_put_page(page))
>  		return;
>  #endif
>  	put_page(page);
>  }
>  
>  static inline void
> -napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
> +napi_frag_unref(skb_frag_t *frag, bool recycle)
>  {
>  	struct page *page = skb_frag_page(frag);
>  
>  #ifdef CONFIG_PAGE_POOL
> -	if (recycle && napi_pp_put_page(page, napi_safe))
> +	if (recycle && napi_pp_put_page(page))
>  		return;
>  #endif
>  	put_page(page);
> @@ -3549,7 +3549,7 @@ napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
>   */
>  static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
>  {
> -	napi_frag_unref(frag, recycle, false);
> +	napi_frag_unref(frag, recycle);
>  }
>  
>  /**
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index dd364d738c00..9d56257e444b 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -690,8 +690,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
>  			page_pool_dma_sync_for_device(pool, page,
>  						      dma_sync_size);
>  
> -		if (allow_direct && in_softirq() &&
> -		    page_pool_recycle_in_cache(page, pool))
> +		if (allow_direct && page_pool_recycle_in_cache(page, pool))
>  			return NULL;
>  
>  		/* Page found as candidate for recycling */
> @@ -716,9 +715,35 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
>  	return NULL;
>  }
>  
> +static bool page_pool_napi_local(const struct page_pool *pool)
> +{
> +	const struct napi_struct *napi;
> +	u32 cpuid;
> +
> +	if (unlikely(!in_softirq()))
> +		return false;
> +
> +	/* Allow direct recycle if we have reasons to believe that we are
> +	 * in the same context as the consumer would run, so there's
> +	 * no possible race.
> +	 * __page_pool_put_page() makes sure we're not in hardirq context
> +	 * and interrupts are enabled prior to accessing the cache.
> +	 */
> +	cpuid = smp_processor_id();
> +	if (READ_ONCE(pool->cpuid) == cpuid)
> +		return true;
> +
> +	napi = READ_ONCE(pool->p.napi);
> +
> +	return napi && READ_ONCE(napi->list_owner) == cpuid;
> +}
> +
>  void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
>  				unsigned int dma_sync_size, bool allow_direct)
>  {
> +	if (!allow_direct)

It seems we are changing some semantics here, in_softirq() is checked
even if allow_direct is true before this patch. And it seems in_softirq()
is not checked if allow_direct is true after this patch? I think we might
need some assertion to ensure @allow_direct really means "I'm sure it's
local, don't waste time rechecking this". As my understanding, it is really
hard to debug this kind of problem, so in_softirq() is always checking.

Perhaps add something like WARN_ONCE() or DEBUG_NET_WARN_ON_ONCE for
allow_direct being true case to catch the API misuse?

> +		allow_direct = page_pool_napi_local(pool);
> +
>  	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
>  	if (page && !page_pool_recycle_in_ring(pool, page)) {
>  		/* Cache full, fallback to free pages */
> @@ -969,7 +994,7 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
>  static void page_pool_disable_direct_recycling(struct page_pool *pool)
>  {
>  	/* Disable direct recycling based on pool->cpuid.
> -	 * Paired with READ_ONCE() in napi_pp_put_page().
> +	 * Paired with READ_ONCE() in page_pool_napi_local().
>  	 */
>  	WRITE_ONCE(pool->cpuid, -1);
>
Alexander Lobakin April 3, 2024, 9:21 a.m. UTC | #3
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Sat, 30 Mar 2024 20:41:24 +0800

> On 2024/3/30 0:55, Alexander Lobakin wrote:
>> Since we have pool->p.napi (Jakub) and pool->cpuid (Lorenzo) to check
>> whether it's safe to use direct recycling, we can use both globally for
>> each page instead of relying solely on @allow_direct argument.
>> Let's assume that @allow_direct means "I'm sure it's local, don't waste
>> time rechecking this" and when it's false, try the mentioned params to
>> still recycle the page directly. If neither is true, we'll lose some
>> CPU cycles, but then it surely won't be hotpath. On the other hand,
>> paths where it's possible to use direct cache, but not possible to
>> safely set @allow_direct, will benefit from this move.
>> The whole propagation of @napi_safe through a dozen of skb freeing
>> functions can now go away, which saves us some stack space.

[...]

>>  void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
>>  				unsigned int dma_sync_size, bool allow_direct)
>>  {
>> +	if (!allow_direct)
> 
> It seems we are changing some semantics here, in_softirq() is checked
> even if allow_direct is true before this patch. And it seems in_softirq()
> is not checked if allow_direct is true after this patch? I think we might
> need some assertion to ensure @allow_direct really means "I'm sure it's
> local, don't waste time rechecking this". As my understanding, it is really
> hard to debug this kind of problem, so in_softirq() is always checking.

It's implied that setting @allow_direct to true means "we're certainly
able to do that, we're certainly in the softirq context". I haven't seen
any code which would set it to true outside of softirq context and it's
counter-intuitive TBH.

> 
> Perhaps add something like WARN_ONCE() or DEBUG_NET_WARN_ON_ONCE for
> allow_direct being true case to catch the API misuse?
> 
>> +		allow_direct = page_pool_napi_local(pool);
>> +
>>  	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
>>  	if (page && !page_pool_recycle_in_ring(pool, page)) {
>>  		/* Cache full, fallback to free pages */
>> @@ -969,7 +994,7 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
>>  static void page_pool_disable_direct_recycling(struct page_pool *pool)
>>  {
>>  	/* Disable direct recycling based on pool->cpuid.
>> -	 * Paired with READ_ONCE() in napi_pp_put_page().
>> +	 * Paired with READ_ONCE() in page_pool_napi_local().
>>  	 */
>>  	WRITE_ONCE(pool->cpuid, -1);
>>  
> 

Thanks,
Olek
diff mbox series

Patch

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index dadd3f55d549..f7f6e42c6814 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3515,25 +3515,25 @@  int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
 		    unsigned int headroom);
 int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
 			 struct bpf_prog *prog);
-bool napi_pp_put_page(struct page *page, bool napi_safe);
+bool napi_pp_put_page(struct page *page);
 
 static inline void
-skb_page_unref(const struct sk_buff *skb, struct page *page, bool napi_safe)
+skb_page_unref(const struct sk_buff *skb, struct page *page)
 {
 #ifdef CONFIG_PAGE_POOL
-	if (skb->pp_recycle && napi_pp_put_page(page, napi_safe))
+	if (skb->pp_recycle && napi_pp_put_page(page))
 		return;
 #endif
 	put_page(page);
 }
 
 static inline void
-napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
+napi_frag_unref(skb_frag_t *frag, bool recycle)
 {
 	struct page *page = skb_frag_page(frag);
 
 #ifdef CONFIG_PAGE_POOL
-	if (recycle && napi_pp_put_page(page, napi_safe))
+	if (recycle && napi_pp_put_page(page))
 		return;
 #endif
 	put_page(page);
@@ -3549,7 +3549,7 @@  napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
  */
 static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
 {
-	napi_frag_unref(frag, recycle, false);
+	napi_frag_unref(frag, recycle);
 }
 
 /**
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index dd364d738c00..9d56257e444b 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -690,8 +690,7 @@  __page_pool_put_page(struct page_pool *pool, struct page *page,
 			page_pool_dma_sync_for_device(pool, page,
 						      dma_sync_size);
 
-		if (allow_direct && in_softirq() &&
-		    page_pool_recycle_in_cache(page, pool))
+		if (allow_direct && page_pool_recycle_in_cache(page, pool))
 			return NULL;
 
 		/* Page found as candidate for recycling */
@@ -716,9 +715,35 @@  __page_pool_put_page(struct page_pool *pool, struct page *page,
 	return NULL;
 }
 
+static bool page_pool_napi_local(const struct page_pool *pool)
+{
+	const struct napi_struct *napi;
+	u32 cpuid;
+
+	if (unlikely(!in_softirq()))
+		return false;
+
+	/* Allow direct recycle if we have reasons to believe that we are
+	 * in the same context as the consumer would run, so there's
+	 * no possible race.
+	 * __page_pool_put_page() makes sure we're not in hardirq context
+	 * and interrupts are enabled prior to accessing the cache.
+	 */
+	cpuid = smp_processor_id();
+	if (READ_ONCE(pool->cpuid) == cpuid)
+		return true;
+
+	napi = READ_ONCE(pool->p.napi);
+
+	return napi && READ_ONCE(napi->list_owner) == cpuid;
+}
+
 void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
 				unsigned int dma_sync_size, bool allow_direct)
 {
+	if (!allow_direct)
+		allow_direct = page_pool_napi_local(pool);
+
 	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
 	if (page && !page_pool_recycle_in_ring(pool, page)) {
 		/* Cache full, fallback to free pages */
@@ -969,7 +994,7 @@  void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
 static void page_pool_disable_direct_recycling(struct page_pool *pool)
 {
 	/* Disable direct recycling based on pool->cpuid.
-	 * Paired with READ_ONCE() in napi_pp_put_page().
+	 * Paired with READ_ONCE() in page_pool_napi_local().
 	 */
 	WRITE_ONCE(pool->cpuid, -1);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e0e172638c0a..e01e2a618621 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1005,11 +1005,8 @@  int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
 EXPORT_SYMBOL(skb_cow_data_for_xdp);
 
 #if IS_ENABLED(CONFIG_PAGE_POOL)
-bool napi_pp_put_page(struct page *page, bool napi_safe)
+bool napi_pp_put_page(struct page *page)
 {
-	bool allow_direct = false;
-	struct page_pool *pp;
-
 	page = compound_head(page);
 
 	/* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
@@ -1022,39 +1019,18 @@  bool napi_pp_put_page(struct page *page, bool napi_safe)
 	if (unlikely(!is_pp_page(page)))
 		return false;
 
-	pp = page->pp;
-
-	/* Allow direct recycle if we have reasons to believe that we are
-	 * in the same context as the consumer would run, so there's
-	 * no possible race.
-	 * __page_pool_put_page() makes sure we're not in hardirq context
-	 * and interrupts are enabled prior to accessing the cache.
-	 */
-	if (napi_safe || in_softirq()) {
-		const struct napi_struct *napi = READ_ONCE(pp->p.napi);
-		unsigned int cpuid = smp_processor_id();
-
-		allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid;
-		allow_direct |= READ_ONCE(pp->cpuid) == cpuid;
-	}
-
-	/* Driver set this to memory recycling info. Reset it on recycle.
-	 * This will *not* work for NIC using a split-page memory model.
-	 * The page will be returned to the pool here regardless of the
-	 * 'flipped' fragment being in use or not.
-	 */
-	page_pool_put_full_page(pp, page, allow_direct);
+	page_pool_put_full_page(page->pp, page, false);
 
 	return true;
 }
 EXPORT_SYMBOL(napi_pp_put_page);
 #endif
 
-static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
+static bool skb_pp_recycle(struct sk_buff *skb, void *data)
 {
 	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
 		return false;
-	return napi_pp_put_page(virt_to_page(data), napi_safe);
+	return napi_pp_put_page(virt_to_page(data));
 }
 
 /**
@@ -1096,12 +1072,12 @@  static void skb_kfree_head(void *head, unsigned int end_offset)
 		kfree(head);
 }
 
-static void skb_free_head(struct sk_buff *skb, bool napi_safe)
+static void skb_free_head(struct sk_buff *skb)
 {
 	unsigned char *head = skb->head;
 
 	if (skb->head_frag) {
-		if (skb_pp_recycle(skb, head, napi_safe))
+		if (skb_pp_recycle(skb, head))
 			return;
 		skb_free_frag(head);
 	} else {
@@ -1109,8 +1085,7 @@  static void skb_free_head(struct sk_buff *skb, bool napi_safe)
 	}
 }
 
-static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
-			     bool napi_safe)
+static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	int i;
@@ -1127,13 +1102,13 @@  static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
 	}
 
 	for (i = 0; i < shinfo->nr_frags; i++)
-		napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);
+		napi_frag_unref(&shinfo->frags[i], skb->pp_recycle);
 
 free_head:
 	if (shinfo->frag_list)
 		kfree_skb_list_reason(shinfo->frag_list, reason);
 
-	skb_free_head(skb, napi_safe);
+	skb_free_head(skb);
 exit:
 	/* When we clone an SKB we copy the reycling bit. The pp_recycle
 	 * bit is only set on the head though, so in order to avoid races
@@ -1194,12 +1169,11 @@  void skb_release_head_state(struct sk_buff *skb)
 }
 
 /* Free everything but the sk_buff shell. */
-static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
-			    bool napi_safe)
+static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
 {
 	skb_release_head_state(skb);
 	if (likely(skb->head))
-		skb_release_data(skb, reason, napi_safe);
+		skb_release_data(skb, reason);
 }
 
 /**
@@ -1213,7 +1187,7 @@  static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
 
 void __kfree_skb(struct sk_buff *skb)
 {
-	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
+	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
 	kfree_skbmem(skb);
 }
 EXPORT_SYMBOL(__kfree_skb);
@@ -1270,7 +1244,7 @@  static void kfree_skb_add_bulk(struct sk_buff *skb,
 		return;
 	}
 
-	skb_release_all(skb, reason, false);
+	skb_release_all(skb, reason);
 	sa->skb_array[sa->skb_count++] = skb;
 
 	if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
@@ -1444,7 +1418,7 @@  EXPORT_SYMBOL(consume_skb);
 void __consume_stateless_skb(struct sk_buff *skb)
 {
 	trace_consume_skb(skb, __builtin_return_address(0));
-	skb_release_data(skb, SKB_CONSUMED, false);
+	skb_release_data(skb, SKB_CONSUMED);
 	kfree_skbmem(skb);
 }
 
@@ -1471,7 +1445,7 @@  static void napi_skb_cache_put(struct sk_buff *skb)
 
 void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
 {
-	skb_release_all(skb, reason, true);
+	skb_release_all(skb, reason);
 	napi_skb_cache_put(skb);
 }
 
@@ -1509,7 +1483,7 @@  void napi_consume_skb(struct sk_buff *skb, int budget)
 		return;
 	}
 
-	skb_release_all(skb, SKB_CONSUMED, !!budget);
+	skb_release_all(skb, SKB_CONSUMED);
 	napi_skb_cache_put(skb);
 }
 EXPORT_SYMBOL(napi_consume_skb);
@@ -1640,7 +1614,7 @@  EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
  */
 struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
 {
-	skb_release_all(dst, SKB_CONSUMED, false);
+	skb_release_all(dst, SKB_CONSUMED);
 	return __skb_clone(dst, src);
 }
 EXPORT_SYMBOL_GPL(skb_morph);
@@ -2272,9 +2246,9 @@  int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		if (skb_has_frag_list(skb))
 			skb_clone_fraglist(skb);
 
-		skb_release_data(skb, SKB_CONSUMED, false);
+		skb_release_data(skb, SKB_CONSUMED);
 	} else {
-		skb_free_head(skb, false);
+		skb_free_head(skb);
 	}
 	off = (data + nhead) - skb->head;
 
@@ -6575,12 +6549,12 @@  static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
 			skb_frag_ref(skb, i);
 		if (skb_has_frag_list(skb))
 			skb_clone_fraglist(skb);
-		skb_release_data(skb, SKB_CONSUMED, false);
+		skb_release_data(skb, SKB_CONSUMED);
 	} else {
 		/* we can reuse existing recount- all we did was
 		 * relocate values
 		 */
-		skb_free_head(skb, false);
+		skb_free_head(skb);
 	}
 
 	skb->head = data;
@@ -6715,7 +6689,7 @@  static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
 		skb_kfree_head(data, size);
 		return -ENOMEM;
 	}
-	skb_release_data(skb, SKB_CONSUMED, false);
+	skb_release_data(skb, SKB_CONSUMED);
 
 	skb->head = data;
 	skb->head_frag = 0;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index d33d12421814..3d647c9a7a21 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -114,7 +114,7 @@  static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
 	 */
 	if (req->src != req->dst)
 		for (sg = sg_next(req->src); sg; sg = sg_next(sg))
-			skb_page_unref(skb, sg_page(sg), false);
+			skb_page_unref(skb, sg_page(sg));
 }
 
 #ifdef CONFIG_INET_ESPINTCP
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 7371886d4f9f..fe8d53f5a5ee 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -131,7 +131,7 @@  static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
 	 */
 	if (req->src != req->dst)
 		for (sg = sg_next(req->src); sg; sg = sg_next(sg))
-			skb_page_unref(skb, sg_page(sg), false);
+			skb_page_unref(skb, sg_page(sg));
 }
 
 #ifdef CONFIG_INET6_ESPINTCP