diff mbox series

[net-next,v6,1/6] page_pool: frag API support for 32-bit arch with 64-bit DMA

Message ID 20230814125643.59334-2-linyunsheng@huawei.com (mailing list archive)
State Not Applicable
Headers show
Series [net-next,v6,1/6] page_pool: frag API support for 32-bit arch with 64-bit DMA | expand

Commit Message

Yunsheng Lin Aug. 14, 2023, 12:56 p.m. UTC
Currently page_pool_alloc_frag() is not supported in 32-bit
arch with 64-bit DMA because of the overlap issue between
pp_frag_count and dma_addr_upper in 'struct page' for those
arches, which seems to be quite common, see [1], which means
driver may need to handle it when using frag API.

In order to simplify the driver's work when using frag API
this patch allows page_pool_alloc_frag() to call
page_pool_alloc_pages() to return pages for those arches.

Add a PP_FLAG_PAGE_SPLIT_IN_DRIVER flag in order to fail the
page_pool creation for 32-bit arch with 64-bit DMA when driver
tries to do the page splitting itself.

Note that it may aggravate truesize underestimate problem for
skb as there is no page splitting for those pages, if driver
need a accurate truesize, it may calculate that according to
frag size, page order and PAGE_POOL_DMA_USE_PP_FRAG_COUNT
being true or not. And we may provide a helper for that if it
turns out to be helpful.

1. https://lore.kernel.org/all/20211117075652.58299-1-linyunsheng@huawei.com/

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
CC: Lorenzo Bianconi <lorenzo@kernel.org>
CC: Alexander Duyck <alexander.duyck@gmail.com>
CC: Liang Chen <liangchen.linux@gmail.com>
CC: Alexander Lobakin <aleksander.lobakin@intel.com>
---
 .../net/ethernet/mellanox/mlx5/core/en_main.c |  3 +-
 include/net/page_pool/helpers.h               | 38 +++++++++++++++--
 include/net/page_pool/types.h                 | 42 ++++++++++++-------
 net/core/page_pool.c                          | 15 +++----
 4 files changed, 68 insertions(+), 30 deletions(-)

Comments

Simon Horman Aug. 15, 2023, 11:24 a.m. UTC | #1
On Mon, Aug 14, 2023 at 08:56:38PM +0800, Yunsheng Lin wrote:

...

> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 77cb75e63aca..d62c11aaea9a 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c

...

> @@ -737,18 +736,16 @@ static void page_pool_free_frag(struct page_pool *pool)
>  	page_pool_return_page(pool, page);
>  }
>  
> -struct page *page_pool_alloc_frag(struct page_pool *pool,
> -				  unsigned int *offset,
> -				  unsigned int size, gfp_t gfp)
> +struct page *__page_pool_alloc_frag(struct page_pool *pool,
> +				    unsigned int *offset,
> +				    unsigned int size, gfp_t gfp)
>  {
>  	unsigned int max_size = PAGE_SIZE << pool->p.order;
>  	struct page *page = pool->frag_page;
>  
> -	if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
> -		    size > max_size))
> +	if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG))

Hi Yunsheng Lin,

There is a ')' missing on the line above, which results in a build failure.

>  		return NULL;
>  
> -	size = ALIGN(size, dma_get_cache_alignment());
>  	*offset = pool->frag_offset;
>  
>  	if (page && *offset + size > max_size) {
> @@ -781,7 +778,7 @@ struct page *page_pool_alloc_frag(struct page_pool *pool,
>  	alloc_stat_inc(pool, fast);
>  	return page;
>  }
> -EXPORT_SYMBOL(page_pool_alloc_frag);
> +EXPORT_SYMBOL(__page_pool_alloc_frag);
>  
>  static void page_pool_empty_ring(struct page_pool *pool)
>  {
Yunsheng Lin Aug. 15, 2023, 12:31 p.m. UTC | #2
On 2023/8/15 19:24, Simon Horman wrote:
> On Mon, Aug 14, 2023 at 08:56:38PM +0800, Yunsheng Lin wrote:
> 
> ...
> 
>> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
>> index 77cb75e63aca..d62c11aaea9a 100644
>> --- a/net/core/page_pool.c
>> +++ b/net/core/page_pool.c
> 
> ...
> 
>> @@ -737,18 +736,16 @@ static void page_pool_free_frag(struct page_pool *pool)
>>  	page_pool_return_page(pool, page);
>>  }
>>  
>> -struct page *page_pool_alloc_frag(struct page_pool *pool,
>> -				  unsigned int *offset,
>> -				  unsigned int size, gfp_t gfp)
>> +struct page *__page_pool_alloc_frag(struct page_pool *pool,
>> +				    unsigned int *offset,
>> +				    unsigned int size, gfp_t gfp)
>>  {
>>  	unsigned int max_size = PAGE_SIZE << pool->p.order;
>>  	struct page *page = pool->frag_page;
>>  
>> -	if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
>> -		    size > max_size))
>> +	if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG))
> 
> Hi Yunsheng Lin,
> 
> There is a ')' missing on the line above, which results in a build failure.

Yes, thanks for noticing.
As the above checking is removed in patch 3, so it is not noticeable in testing
when the whole patchset is applied.

>
Ilias Apalodimas Aug. 16, 2023, 11:26 a.m. UTC | #3
Hi Yunsheng

On Mon, 14 Aug 2023 at 15:59, Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> Currently page_pool_alloc_frag() is not supported in 32-bit
> arch with 64-bit DMA because of the overlap issue between
> pp_frag_count and dma_addr_upper in 'struct page' for those
> arches, which seems to be quite common, see [1], which means
> driver may need to handle it when using frag API.

That wasn't so common. IIRC it was a single TI platform that was breaking?

>
> In order to simplify the driver's work when using frag API
> this patch allows page_pool_alloc_frag() to call
> page_pool_alloc_pages() to return pages for those arches.

Do we have any use cases of people needing this?  Those architectures
should be long dead and although we have to support them in the
kernel,  I don't personally see the advantage of adjusting the API to
do that.  Right now we have a very clear separation between allocating
pages or fragments.   Why should we hide a page allocation under a
frag allocation?  A driver writer can simply allocate pages for those
boards.  Am I the only one not seeing a clean win here?

Thanks
/Ilias

>
> Add a PP_FLAG_PAGE_SPLIT_IN_DRIVER flag in order to fail the
> page_pool creation for 32-bit arch with 64-bit DMA when driver
> tries to do the page splitting itself.
>
> Note that it may aggravate truesize underestimate problem for
> skb as there is no page splitting for those pages, if driver
> need a accurate truesize, it may calculate that according to
> frag size, page order and PAGE_POOL_DMA_USE_PP_FRAG_COUNT
> being true or not. And we may provide a helper for that if it
> turns out to be helpful.
>
> 1. https://lore.kernel.org/all/20211117075652.58299-1-linyunsheng@huawei.com/
>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> CC: Lorenzo Bianconi <lorenzo@kernel.org>
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> CC: Liang Chen <liangchen.linux@gmail.com>
> CC: Alexander Lobakin <aleksander.lobakin@intel.com>
> ---
>  .../net/ethernet/mellanox/mlx5/core/en_main.c |  3 +-
>  include/net/page_pool/helpers.h               | 38 +++++++++++++++--
>  include/net/page_pool/types.h                 | 42 ++++++++++++-------
>  net/core/page_pool.c                          | 15 +++----
>  4 files changed, 68 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> index bc9d5a5bea01..ec9c5a8cbda6 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> @@ -834,7 +834,8 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params,
>                 struct page_pool_params pp_params = { 0 };
>
>                 pp_params.order     = 0;
> -               pp_params.flags     = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | PP_FLAG_PAGE_FRAG;
> +               pp_params.flags     = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV |
> +                                       PP_FLAG_PAGE_SPLIT_IN_DRIVER;
>                 pp_params.pool_size = pool_size;
>                 pp_params.nid       = node;
>                 pp_params.dev       = rq->pdev;
> diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
> index 94231533a369..cb18de55f239 100644
> --- a/include/net/page_pool/helpers.h
> +++ b/include/net/page_pool/helpers.h
> @@ -29,8 +29,12 @@
>  #ifndef _NET_PAGE_POOL_HELPERS_H
>  #define _NET_PAGE_POOL_HELPERS_H
>
> +#include <linux/dma-mapping.h>
>  #include <net/page_pool/types.h>
>
> +#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT        \
> +               (sizeof(dma_addr_t) > sizeof(unsigned long))
> +
>  #ifdef CONFIG_PAGE_POOL_STATS
>  int page_pool_ethtool_stats_get_count(void);
>  u8 *page_pool_ethtool_stats_get_strings(u8 *data);
> @@ -73,6 +77,29 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
>         return page_pool_alloc_pages(pool, gfp);
>  }
>
> +static inline struct page *page_pool_alloc_frag(struct page_pool *pool,
> +                                               unsigned int *offset,
> +                                               unsigned int size, gfp_t gfp)
> +{
> +       unsigned int max_size = PAGE_SIZE << pool->p.order;
> +
> +       size = ALIGN(size, dma_get_cache_alignment());
> +
> +       if (WARN_ON(size > max_size))
> +               return NULL;
> +
> +       /* Don't allow page splitting and allocate one big frag
> +        * for 32-bit arch with 64-bit DMA, corresponding to
> +        * the checking in page_pool_is_last_frag().
> +        */
> +       if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) {
> +               *offset = 0;
> +               return page_pool_alloc_pages(pool, gfp);
> +       }
> +
> +       return __page_pool_alloc_frag(pool, offset, size, gfp);
> +}
> +
>  static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,
>                                                     unsigned int *offset,
>                                                     unsigned int size)
> @@ -134,8 +161,14 @@ static inline long page_pool_defrag_page(struct page *page, long nr)
>  static inline bool page_pool_is_last_frag(struct page_pool *pool,
>                                           struct page *page)
>  {
> -       /* If fragments aren't enabled or count is 0 we were the last user */
> +       /* We assume we are the last frag user that is still holding
> +        * on to the page if:
> +        * 1. Fragments aren't enabled.
> +        * 2. We are running in 32-bit arch with 64-bit DMA.
> +        * 3. page_pool_defrag_page() indicate we are the last user.
> +        */
>         return !(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
> +              PAGE_POOL_DMA_USE_PP_FRAG_COUNT ||
>                (page_pool_defrag_page(page, 1) == 0);
>  }
>
> @@ -197,9 +230,6 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
>         page_pool_put_full_page(pool, page, true);
>  }
>
> -#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT        \
> -               (sizeof(dma_addr_t) > sizeof(unsigned long))
> -
>  /**
>   * page_pool_get_dma_addr() - Retrieve the stored DMA address.
>   * @page:      page allocated from a page pool
> diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
> index 887e7946a597..079337c42aa6 100644
> --- a/include/net/page_pool/types.h
> +++ b/include/net/page_pool/types.h
> @@ -6,21 +6,29 @@
>  #include <linux/dma-direction.h>
>  #include <linux/ptr_ring.h>
>
> -#define PP_FLAG_DMA_MAP                BIT(0) /* Should page_pool do the DMA
> -                                       * map/unmap
> -                                       */
> -#define PP_FLAG_DMA_SYNC_DEV   BIT(1) /* If set all pages that the driver gets
> -                                       * from page_pool will be
> -                                       * DMA-synced-for-device according to
> -                                       * the length provided by the device
> -                                       * driver.
> -                                       * Please note DMA-sync-for-CPU is still
> -                                       * device driver responsibility
> -                                       */
> -#define PP_FLAG_PAGE_FRAG      BIT(2) /* for page frag feature */
> +/* Should page_pool do the DMA map/unmap */
> +#define PP_FLAG_DMA_MAP                        BIT(0)
> +
> +/* If set all pages that the driver gets from page_pool will be
> + * DMA-synced-for-device according to the length provided by the device driver.
> + * Please note DMA-sync-for-CPU is still device driver responsibility
> + */
> +#define PP_FLAG_DMA_SYNC_DEV           BIT(1)
> +
> +/* for page frag feature */
> +#define PP_FLAG_PAGE_FRAG              BIT(2)
> +
> +/* If set driver will do the page splitting itself. This is used to fail the
> + * page_pool creation because there is overlap issue between pp_frag_count and
> + * dma_addr_upper in 'struct page' for some arches with
> + * PAGE_POOL_DMA_USE_PP_FRAG_COUNT being true.
> + */
> +#define PP_FLAG_PAGE_SPLIT_IN_DRIVER   BIT(3)
> +
>  #define PP_FLAG_ALL            (PP_FLAG_DMA_MAP |\
>                                  PP_FLAG_DMA_SYNC_DEV |\
> -                                PP_FLAG_PAGE_FRAG)
> +                                PP_FLAG_PAGE_FRAG |\
> +                                PP_FLAG_PAGE_SPLIT_IN_DRIVER)
>
>  /*
>   * Fast allocation side cache array/stack
> @@ -45,7 +53,8 @@ struct pp_alloc_cache {
>
>  /**
>   * struct page_pool_params - page pool parameters
> - * @flags:     PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_PAGE_FRAG
> + * @flags:     PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_PAGE_FRAG,
> + *             PP_FLAG_PAGE_SPLIT_IN_DRIVER
>   * @order:     2^order pages on allocation
>   * @pool_size: size of the ptr_ring
>   * @nid:       NUMA node id to allocate from pages from
> @@ -183,8 +192,9 @@ struct page_pool {
>  };
>
>  struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
> -struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
> -                                 unsigned int size, gfp_t gfp);
> +struct page *__page_pool_alloc_frag(struct page_pool *pool,
> +                                   unsigned int *offset, unsigned int size,
> +                                   gfp_t gfp);
>  struct page_pool *page_pool_create(const struct page_pool_params *params);
>
>  struct xdp_mem_info;
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 77cb75e63aca..d62c11aaea9a 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -14,7 +14,6 @@
>  #include <net/xdp.h>
>
>  #include <linux/dma-direction.h>
> -#include <linux/dma-mapping.h>
>  #include <linux/page-flags.h>
>  #include <linux/mm.h> /* for put_page() */
>  #include <linux/poison.h>
> @@ -212,7 +211,7 @@ static int page_pool_init(struct page_pool *pool,
>         }
>
>         if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
> -           pool->p.flags & PP_FLAG_PAGE_FRAG)
> +           pool->p.flags & PP_FLAG_PAGE_SPLIT_IN_DRIVER)
>                 return -EINVAL;
>
>  #ifdef CONFIG_PAGE_POOL_STATS
> @@ -737,18 +736,16 @@ static void page_pool_free_frag(struct page_pool *pool)
>         page_pool_return_page(pool, page);
>  }
>
> -struct page *page_pool_alloc_frag(struct page_pool *pool,
> -                                 unsigned int *offset,
> -                                 unsigned int size, gfp_t gfp)
> +struct page *__page_pool_alloc_frag(struct page_pool *pool,
> +                                   unsigned int *offset,
> +                                   unsigned int size, gfp_t gfp)
>  {
>         unsigned int max_size = PAGE_SIZE << pool->p.order;
>         struct page *page = pool->frag_page;
>
> -       if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
> -                   size > max_size))
> +       if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG))
>                 return NULL;
>
> -       size = ALIGN(size, dma_get_cache_alignment());
>         *offset = pool->frag_offset;
>
>         if (page && *offset + size > max_size) {
> @@ -781,7 +778,7 @@ struct page *page_pool_alloc_frag(struct page_pool *pool,
>         alloc_stat_inc(pool, fast);
>         return page;
>  }
> -EXPORT_SYMBOL(page_pool_alloc_frag);
> +EXPORT_SYMBOL(__page_pool_alloc_frag);
>
>  static void page_pool_empty_ring(struct page_pool *pool)
>  {
> --
> 2.33.0
>
Yunsheng Lin Aug. 16, 2023, 12:49 p.m. UTC | #4
On 2023/8/16 19:26, Ilias Apalodimas wrote:
> Hi Yunsheng
> 
> On Mon, 14 Aug 2023 at 15:59, Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> Currently page_pool_alloc_frag() is not supported in 32-bit
>> arch with 64-bit DMA because of the overlap issue between
>> pp_frag_count and dma_addr_upper in 'struct page' for those
>> arches, which seems to be quite common, see [1], which means
>> driver may need to handle it when using frag API.
> 
> That wasn't so common. IIRC it was a single TI platform that was breaking?

I am not so sure about that as grepping 'ARM_LPAE' has a long
list for that.

> 
>>
>> In order to simplify the driver's work when using frag API
>> this patch allows page_pool_alloc_frag() to call
>> page_pool_alloc_pages() to return pages for those arches.
> 
> Do we have any use cases of people needing this?  Those architectures
> should be long dead and although we have to support them in the
> kernel,  I don't personally see the advantage of adjusting the API to
> do that.  Right now we have a very clear separation between allocating
> pages or fragments.   Why should we hide a page allocation under a
> frag allocation?  A driver writer can simply allocate pages for those
> boards.  Am I the only one not seeing a clean win here?

It is also a part of removing the per page_pool PP_FLAG_PAGE_FRAG flag
in this patchset.

> 
> Thanks
> /Ilias
>
Ilias Apalodimas Aug. 16, 2023, 5:01 p.m. UTC | #5
On Wed, 16 Aug 2023 at 15:49, Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2023/8/16 19:26, Ilias Apalodimas wrote:
> > Hi Yunsheng
> >
> > On Mon, 14 Aug 2023 at 15:59, Yunsheng Lin <linyunsheng@huawei.com> wrote:
> >>
> >> Currently page_pool_alloc_frag() is not supported in 32-bit
> >> arch with 64-bit DMA because of the overlap issue between
> >> pp_frag_count and dma_addr_upper in 'struct page' for those
> >> arches, which seems to be quite common, see [1], which means
> >> driver may need to handle it when using frag API.
> >
> > That wasn't so common. IIRC it was a single TI platform that was breaking?
>
> I am not so sure about that as grepping 'ARM_LPAE' has a long
> list for that.

Shouldn't we be grepping for CONFIG_ARCH_DMA_ADDR_T_64BIT and
PHYS_ADDR_T_64BIT to find the affected platforms?  Why LPAE?

>
> >
> >>
> >> In order to simplify the driver's work when using frag API
> >> this patch allows page_pool_alloc_frag() to call
> >> page_pool_alloc_pages() to return pages for those arches.
> >
> > Do we have any use cases of people needing this?  Those architectures
> > should be long dead and although we have to support them in the
> > kernel,  I don't personally see the advantage of adjusting the API to
> > do that.  Right now we have a very clear separation between allocating
> > pages or fragments.   Why should we hide a page allocation under a
> > frag allocation?  A driver writer can simply allocate pages for those
> > boards.  Am I the only one not seeing a clean win here?
>
> It is also a part of removing the per page_pool PP_FLAG_PAGE_FRAG flag
> in this patchset.

Yes, that happens *because* of this patchset.  I am not against the
change.  In fact, I'll have a closer look tomorrow.  I am just trying
to figure out if we really need it.  When the recycling patches were
introduced into page pool we had a very specific reason.  Due to the
XDP verifier we *had* to allocate a packet per page.  That was
expensive so we added the recycling capabilities to compensate and get
some performance back. Eventually we added page fragments and had a
very clear separation on the API.

Regards
/Ilias
>
> >
> > Thanks
> > /Ilias
> >
Yunsheng Lin Aug. 17, 2023, 9:05 a.m. UTC | #6
On 2023/8/17 1:01, Ilias Apalodimas wrote:
> On Wed, 16 Aug 2023 at 15:49, Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> On 2023/8/16 19:26, Ilias Apalodimas wrote:
>>> Hi Yunsheng
>>>
>>> On Mon, 14 Aug 2023 at 15:59, Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>>>
>>>> Currently page_pool_alloc_frag() is not supported in 32-bit
>>>> arch with 64-bit DMA because of the overlap issue between
>>>> pp_frag_count and dma_addr_upper in 'struct page' for those
>>>> arches, which seems to be quite common, see [1], which means
>>>> driver may need to handle it when using frag API.
>>>
>>> That wasn't so common. IIRC it was a single TI platform that was breaking?
>>
>> I am not so sure about that as grepping 'ARM_LPAE' has a long
>> list for that.
> 
> Shouldn't we be grepping for CONFIG_ARCH_DMA_ADDR_T_64BIT and
> PHYS_ADDR_T_64BIT to find the affected platforms?  Why LPAE?


I used the key in the  original report:

https://www.spinics.net/lists/netdev/msg779890.html

>> Please see the bisection report below about a boot failure on
>> rk3288-rock2-square which is pointing to this patch.  The issue
>> appears to only happen with CONFIG_ARM_LPAE=y.

grepping the 'CONFIG_PHYS_ADDR_T_64BIT' seems to be more common?
https://elixir.free-electrons.com/linux/v6.4-rc6/K/ident/CONFIG_PHYS_ADDR_T_64BIT

> 
>>
>>>
>>>>
>>>> In order to simplify the driver's work when using frag API
>>>> this patch allows page_pool_alloc_frag() to call
>>>> page_pool_alloc_pages() to return pages for those arches.
>>>
>>> Do we have any use cases of people needing this?  Those architectures
>>> should be long dead and although we have to support them in the
>>> kernel,  I don't personally see the advantage of adjusting the API to
>>> do that.  Right now we have a very clear separation between allocating
>>> pages or fragments.   Why should we hide a page allocation under a
>>> frag allocation?  A driver writer can simply allocate pages for those
>>> boards.  Am I the only one not seeing a clean win here?
>>
>> It is also a part of removing the per page_pool PP_FLAG_PAGE_FRAG flag
>> in this patchset.
> 
> Yes, that happens *because* of this patchset.  I am not against the
> change.  In fact, I'll have a closer look tomorrow.  I am just trying
> to figure out if we really need it.  When the recycling patches were
> introduced into page pool we had a very specific reason.  Due to the
> XDP verifier we *had* to allocate a packet per page.  That was

Did you mean a xdp frame containing a frag page can not be passed to the
xdp core?
What is exact reason why the XDP verifier need a packet per page?
Is there a code block that you can point me to?

I wonder if it is still the case for now, as bnxt and mlx5 seems to be
supporting frag page and xdp now.

> expensive so we added the recycling capabilities to compensate and get
> some performance back. Eventually we added page fragments and had a
> very clear separation on the API.
> 
> Regards
> /Ilias
>>
>>>
>>> Thanks
>>> /Ilias
>>>
> .
>
Ilias Apalodimas Aug. 17, 2023, 11:43 a.m. UTC | #7
On Thu, 17 Aug 2023 at 12:06, Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2023/8/17 1:01, Ilias Apalodimas wrote:
> > On Wed, 16 Aug 2023 at 15:49, Yunsheng Lin <linyunsheng@huawei.com> wrote:
> >>
> >> On 2023/8/16 19:26, Ilias Apalodimas wrote:
> >>> Hi Yunsheng
> >>>
> >>> On Mon, 14 Aug 2023 at 15:59, Yunsheng Lin <linyunsheng@huawei.com> wrote:
> >>>>
> >>>> Currently page_pool_alloc_frag() is not supported in 32-bit
> >>>> arch with 64-bit DMA because of the overlap issue between
> >>>> pp_frag_count and dma_addr_upper in 'struct page' for those
> >>>> arches, which seems to be quite common, see [1], which means
> >>>> driver may need to handle it when using frag API.
> >>>
> >>> That wasn't so common. IIRC it was a single TI platform that was breaking?
> >>
> >> I am not so sure about that as grepping 'ARM_LPAE' has a long
> >> list for that.
> >
> > Shouldn't we be grepping for CONFIG_ARCH_DMA_ADDR_T_64BIT and
> > PHYS_ADDR_T_64BIT to find the affected platforms?  Why LPAE?
>
>
> I used the key in the  original report:
>
> https://www.spinics.net/lists/netdev/msg779890.html
>
> >> Please see the bisection report below about a boot failure on
> >> rk3288-rock2-square which is pointing to this patch.  The issue
> >> appears to only happen with CONFIG_ARM_LPAE=y.
>
> grepping the 'CONFIG_PHYS_ADDR_T_64BIT' seems to be more common?
> https://elixir.free-electrons.com/linux/v6.4-rc6/K/ident/CONFIG_PHYS_ADDR_T_64BIT
>

Yes, grepping around a bit uncovered this arch/arm/mm/Kconfig, which
enables PHYS_ADDR_T_64BIT if ARM_LPAE is enabled.  Then
ARCH_DMA_ADDR_T_64BIT
is also enabled from kernel/dma/Kconfig.  But that doesn't mean
grepping for any of those uncovers all the problematic platforms,
there are more than Arm platforms.  The ones that will actually fail
are
- ARCH_DMA_ADDR_T_64BIT is enabled and it's a 32bit architecture
- You have a network driver for that platform that uses page pool.

The combination of these shouldn't be that common.  The only one that
comes to mind is the stmmac driver, which the report was for.

> >
> >>
> >>>
> >>>>
> >>>> In order to simplify the driver's work when using frag API
> >>>> this patch allows page_pool_alloc_frag() to call
> >>>> page_pool_alloc_pages() to return pages for those arches.
> >>>
> >>> Do we have any use cases of people needing this?  Those architectures
> >>> should be long dead and although we have to support them in the
> >>> kernel,  I don't personally see the advantage of adjusting the API to
> >>> do that.  Right now we have a very clear separation between allocating
> >>> pages or fragments.   Why should we hide a page allocation under a
> >>> frag allocation?  A driver writer can simply allocate pages for those
> >>> boards.  Am I the only one not seeing a clean win here?
> >>
> >> It is also a part of removing the per page_pool PP_FLAG_PAGE_FRAG flag
> >> in this patchset.
> >
> > Yes, that happens *because* of this patchset.  I am not against the
> > change.  In fact, I'll have a closer look tomorrow.  I am just trying
> > to figure out if we really need it.  When the recycling patches were
> > introduced into page pool we had a very specific reason.  Due to the
> > XDP verifier we *had* to allocate a packet per page.  That was
>
> Did you mean a xdp frame containing a frag page can not be passed to the
> xdp core?
> What is exact reason why the XDP verifier need a packet per page?
> Is there a code block that you can point me to?

It's been a while since I looked at this, but doesn't __xdp_return()
still sync the entire page if the mem type comes from page_pool?

>
> I wonder if it is still the case for now, as bnxt and mlx5 seems to be
> supporting frag page and xdp now.

I only looked at bnxt, but that doesnt seem to be entirely true.  That
code still allocates pages if an XDP prog is installed.  The only case
where it allocates fragments is if the kernel is compiled with 65k
pages, but the hardware doesnt support that (check for
BNXT_RX_PAGE_SHIFT)


Thanks
/Ilias
>
> > expensive so we added the recycling capabilities to compensate and get
> > some performance back. Eventually we added page fragments and had a
> > very clear separation on the API.
> >
> > Regards
> > /Ilias
> >>
> >>>
> >>> Thanks
> >>> /Ilias
> >>>
> > .
> >
Yunsheng Lin Aug. 18, 2023, 8:46 a.m. UTC | #8
On 2023/8/17 19:43, Ilias Apalodimas wrote:
>>>>>>
>>>>>> In order to simplify the driver's work when using frag API
>>>>>> this patch allows page_pool_alloc_frag() to call
>>>>>> page_pool_alloc_pages() to return pages for those arches.
>>>>>
>>>>> Do we have any use cases of people needing this?  Those architectures
>>>>> should be long dead and although we have to support them in the
>>>>> kernel,  I don't personally see the advantage of adjusting the API to
>>>>> do that.  Right now we have a very clear separation between allocating
>>>>> pages or fragments.   Why should we hide a page allocation under a
>>>>> frag allocation?  A driver writer can simply allocate pages for those
>>>>> boards.  Am I the only one not seeing a clean win here?
>>>>
>>>> It is also a part of removing the per page_pool PP_FLAG_PAGE_FRAG flag
>>>> in this patchset.
>>>
>>> Yes, that happens *because* of this patchset.  I am not against the
>>> change.  In fact, I'll have a closer look tomorrow.  I am just trying
>>> to figure out if we really need it.  When the recycling patches were
>>> introduced into page pool we had a very specific reason.  Due to the
>>> XDP verifier we *had* to allocate a packet per page.  That was
>>
>> Did you mean a xdp frame containing a frag page can not be passed to the
>> xdp core?
>> What is exact reason why the XDP verifier need a packet per page?
>> Is there a code block that you can point me to?
> 
> It's been a while since I looked at this, but doesn't __xdp_return()
> still sync the entire page if the mem type comes from page_pool?

Yes, I checked that too.
It is supposed to sync the entire page if the mem type comes from page_pool,
as it depend on the last freed frag to do the sync_for_device operation.
diff mbox series

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index bc9d5a5bea01..ec9c5a8cbda6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -834,7 +834,8 @@  static int mlx5e_alloc_rq(struct mlx5e_params *params,
 		struct page_pool_params pp_params = { 0 };
 
 		pp_params.order     = 0;
-		pp_params.flags     = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | PP_FLAG_PAGE_FRAG;
+		pp_params.flags     = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV |
+					PP_FLAG_PAGE_SPLIT_IN_DRIVER;
 		pp_params.pool_size = pool_size;
 		pp_params.nid       = node;
 		pp_params.dev       = rq->pdev;
diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 94231533a369..cb18de55f239 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -29,8 +29,12 @@ 
 #ifndef _NET_PAGE_POOL_HELPERS_H
 #define _NET_PAGE_POOL_HELPERS_H
 
+#include <linux/dma-mapping.h>
 #include <net/page_pool/types.h>
 
+#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT	\
+		(sizeof(dma_addr_t) > sizeof(unsigned long))
+
 #ifdef CONFIG_PAGE_POOL_STATS
 int page_pool_ethtool_stats_get_count(void);
 u8 *page_pool_ethtool_stats_get_strings(u8 *data);
@@ -73,6 +77,29 @@  static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
 	return page_pool_alloc_pages(pool, gfp);
 }
 
+static inline struct page *page_pool_alloc_frag(struct page_pool *pool,
+						unsigned int *offset,
+						unsigned int size, gfp_t gfp)
+{
+	unsigned int max_size = PAGE_SIZE << pool->p.order;
+
+	size = ALIGN(size, dma_get_cache_alignment());
+
+	if (WARN_ON(size > max_size))
+		return NULL;
+
+	/* Don't allow page splitting and allocate one big frag
+	 * for 32-bit arch with 64-bit DMA, corresponding to
+	 * the checking in page_pool_is_last_frag().
+	 */
+	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) {
+		*offset = 0;
+		return page_pool_alloc_pages(pool, gfp);
+	}
+
+	return __page_pool_alloc_frag(pool, offset, size, gfp);
+}
+
 static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,
 						    unsigned int *offset,
 						    unsigned int size)
@@ -134,8 +161,14 @@  static inline long page_pool_defrag_page(struct page *page, long nr)
 static inline bool page_pool_is_last_frag(struct page_pool *pool,
 					  struct page *page)
 {
-	/* If fragments aren't enabled or count is 0 we were the last user */
+	/* We assume we are the last frag user that is still holding
+	 * on to the page if:
+	 * 1. Fragments aren't enabled.
+	 * 2. We are running in 32-bit arch with 64-bit DMA.
+	 * 3. page_pool_defrag_page() indicate we are the last user.
+	 */
 	return !(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
+	       PAGE_POOL_DMA_USE_PP_FRAG_COUNT ||
 	       (page_pool_defrag_page(page, 1) == 0);
 }
 
@@ -197,9 +230,6 @@  static inline void page_pool_recycle_direct(struct page_pool *pool,
 	page_pool_put_full_page(pool, page, true);
 }
 
-#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT	\
-		(sizeof(dma_addr_t) > sizeof(unsigned long))
-
 /**
  * page_pool_get_dma_addr() - Retrieve the stored DMA address.
  * @page:	page allocated from a page pool
diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 887e7946a597..079337c42aa6 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -6,21 +6,29 @@ 
 #include <linux/dma-direction.h>
 #include <linux/ptr_ring.h>
 
-#define PP_FLAG_DMA_MAP		BIT(0) /* Should page_pool do the DMA
-					* map/unmap
-					*/
-#define PP_FLAG_DMA_SYNC_DEV	BIT(1) /* If set all pages that the driver gets
-					* from page_pool will be
-					* DMA-synced-for-device according to
-					* the length provided by the device
-					* driver.
-					* Please note DMA-sync-for-CPU is still
-					* device driver responsibility
-					*/
-#define PP_FLAG_PAGE_FRAG	BIT(2) /* for page frag feature */
+/* Should page_pool do the DMA map/unmap */
+#define PP_FLAG_DMA_MAP			BIT(0)
+
+/* If set all pages that the driver gets from page_pool will be
+ * DMA-synced-for-device according to the length provided by the device driver.
+ * Please note DMA-sync-for-CPU is still device driver responsibility
+ */
+#define PP_FLAG_DMA_SYNC_DEV		BIT(1)
+
+/* for page frag feature */
+#define PP_FLAG_PAGE_FRAG		BIT(2)
+
+/* If set driver will do the page splitting itself. This is used to fail the
+ * page_pool creation because there is overlap issue between pp_frag_count and
+ * dma_addr_upper in 'struct page' for some arches with
+ * PAGE_POOL_DMA_USE_PP_FRAG_COUNT being true.
+ */
+#define PP_FLAG_PAGE_SPLIT_IN_DRIVER	BIT(3)
+
 #define PP_FLAG_ALL		(PP_FLAG_DMA_MAP |\
 				 PP_FLAG_DMA_SYNC_DEV |\
-				 PP_FLAG_PAGE_FRAG)
+				 PP_FLAG_PAGE_FRAG |\
+				 PP_FLAG_PAGE_SPLIT_IN_DRIVER)
 
 /*
  * Fast allocation side cache array/stack
@@ -45,7 +53,8 @@  struct pp_alloc_cache {
 
 /**
  * struct page_pool_params - page pool parameters
- * @flags:	PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_PAGE_FRAG
+ * @flags:	PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_PAGE_FRAG,
+ *		PP_FLAG_PAGE_SPLIT_IN_DRIVER
  * @order:	2^order pages on allocation
  * @pool_size:	size of the ptr_ring
  * @nid:	NUMA node id to allocate from pages from
@@ -183,8 +192,9 @@  struct page_pool {
 };
 
 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
-struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
-				  unsigned int size, gfp_t gfp);
+struct page *__page_pool_alloc_frag(struct page_pool *pool,
+				    unsigned int *offset, unsigned int size,
+				    gfp_t gfp);
 struct page_pool *page_pool_create(const struct page_pool_params *params);
 
 struct xdp_mem_info;
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 77cb75e63aca..d62c11aaea9a 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -14,7 +14,6 @@ 
 #include <net/xdp.h>
 
 #include <linux/dma-direction.h>
-#include <linux/dma-mapping.h>
 #include <linux/page-flags.h>
 #include <linux/mm.h> /* for put_page() */
 #include <linux/poison.h>
@@ -212,7 +211,7 @@  static int page_pool_init(struct page_pool *pool,
 	}
 
 	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
-	    pool->p.flags & PP_FLAG_PAGE_FRAG)
+	    pool->p.flags & PP_FLAG_PAGE_SPLIT_IN_DRIVER)
 		return -EINVAL;
 
 #ifdef CONFIG_PAGE_POOL_STATS
@@ -737,18 +736,16 @@  static void page_pool_free_frag(struct page_pool *pool)
 	page_pool_return_page(pool, page);
 }
 
-struct page *page_pool_alloc_frag(struct page_pool *pool,
-				  unsigned int *offset,
-				  unsigned int size, gfp_t gfp)
+struct page *__page_pool_alloc_frag(struct page_pool *pool,
+				    unsigned int *offset,
+				    unsigned int size, gfp_t gfp)
 {
 	unsigned int max_size = PAGE_SIZE << pool->p.order;
 	struct page *page = pool->frag_page;
 
-	if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
-		    size > max_size))
+	if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG))
 		return NULL;
 
-	size = ALIGN(size, dma_get_cache_alignment());
 	*offset = pool->frag_offset;
 
 	if (page && *offset + size > max_size) {
@@ -781,7 +778,7 @@  struct page *page_pool_alloc_frag(struct page_pool *pool,
 	alloc_stat_inc(pool, fast);
 	return page;
 }
-EXPORT_SYMBOL(page_pool_alloc_frag);
+EXPORT_SYMBOL(__page_pool_alloc_frag);
 
 static void page_pool_empty_ring(struct page_pool *pool)
 {