
[net-next,v3,7/7] bnxt_en: add support for device memory tcp

Message ID 20241003160620.1521626-8-ap420073@gmail.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Series bnxt_en: implement device memory TCP for bnxt

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; GEN HAS DIFF 2 files changed, 38 insertions(+);
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 9 this patch: 9
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 5 maintainers not CCed: bpf@vger.kernel.org ast@kernel.org hawk@kernel.org daniel@iogearbox.net john.fastabend@gmail.com
netdev/build_clang success Errors and warnings before: 10 this patch: 10
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 13 this patch: 13
netdev/checkpatch warning WARNING: line length of 90 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 2 this patch: 2
netdev/source_inline warning Was 1 now: 1

Commit Message

Taehee Yoo Oct. 3, 2024, 4:06 p.m. UTC
Currently, the bnxt_en driver satisfies the prerequisite for device
memory TCP, namely tcp-data-split, so this patch implements device
memory TCP support in the bnxt_en driver.

From now on, the aggregation ring handles netmem_ref instead of page,
regardless of whether devmem is in use.
Memory for the aggregation ring is therefore managed through the
netmem page_pool API instead of the generic page_pool API.

If devmem is enabled, the netmem_ref is used as-is; if devmem is not
enabled, the netmem_ref is converted to a page and that page is used.

The driver recognizes whether devmem is set or unset by checking
whether mp_params.mp_priv is non-NULL.
Only if devmem is set does it pass PP_FLAG_ALLOW_UNREADABLE_NETMEM.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
---

v3:
 - Patch added

 drivers/net/ethernet/broadcom/Kconfig     |  1 +
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 98 +++++++++++++++--------
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  2 +-
 3 files changed, 66 insertions(+), 35 deletions(-)

Comments

Mina Almasry Oct. 3, 2024, 6:43 p.m. UTC | #1
On Thu, Oct 3, 2024 at 9:07 AM Taehee Yoo <ap420073@gmail.com> wrote:
>
> Currently, bnxt_en driver satisfies the requirements of Device memory
> TCP, which is tcp-data-split.
> So, it implements Device memory TCP for bnxt_en driver.
>
> From now on, the aggregation ring handles netmem_ref instead of page
> regardless of the on/off of netmem.
> So, for the aggregation ring, memory will be handled with the netmem
> page_pool API instead of generic page_pool API.
>
> If Devmem is enabled, netmem_ref is used as-is and if Devmem is not
> enabled, netmem_ref will be converted to page and that is used.
>
> Driver recognizes whether the devmem is set or unset based on the
> mp_params.mp_priv is not NULL.
> Only if devmem is set, it passes PP_FLAG_ALLOW_UNREADABLE_NETMEM.
>
> Signed-off-by: Taehee Yoo <ap420073@gmail.com>
> ---
>
> v3:
>  - Patch added
>
>  drivers/net/ethernet/broadcom/Kconfig     |  1 +
>  drivers/net/ethernet/broadcom/bnxt/bnxt.c | 98 +++++++++++++++--------
>  drivers/net/ethernet/broadcom/bnxt/bnxt.h |  2 +-
>  3 files changed, 66 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig
> index 75ca3ddda1f5..f37ff12d4746 100644
> --- a/drivers/net/ethernet/broadcom/Kconfig
> +++ b/drivers/net/ethernet/broadcom/Kconfig
> @@ -211,6 +211,7 @@ config BNXT
>         select FW_LOADER
>         select LIBCRC32C
>         select NET_DEVLINK
> +       select NET_DEVMEM
>         select PAGE_POOL
>         select DIMLIB
>         select AUXILIARY_BUS
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> index 872b15842b11..64e07d247f97 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> @@ -55,6 +55,7 @@
>  #include <net/page_pool/helpers.h>
>  #include <linux/align.h>
>  #include <net/netdev_queues.h>
> +#include <net/netdev_rx_queue.h>
>
>  #include "bnxt_hsi.h"
>  #include "bnxt.h"
> @@ -863,6 +864,22 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int budget)
>                 bnapi->events &= ~BNXT_TX_CMP_EVENT;
>  }
>
> +static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping,
> +                                        struct bnxt_rx_ring_info *rxr,
> +                                        unsigned int *offset,
> +                                        gfp_t gfp)
> +{
> +       netmem_ref netmem;
> +
> +       netmem = page_pool_alloc_netmem(rxr->page_pool, GFP_ATOMIC);
> +       if (!netmem)
> +               return 0;
> +       *offset = 0;
> +
> +       *mapping = page_pool_get_dma_addr_netmem(netmem) + *offset;
> +       return netmem;
> +}
> +
>  static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping,
>                                          struct bnxt_rx_ring_info *rxr,
>                                          unsigned int *offset,
> @@ -972,21 +989,21 @@ static inline u16 bnxt_find_next_agg_idx(struct bnxt_rx_ring_info *rxr, u16 idx)
>         return next;
>  }
>
> -static inline int bnxt_alloc_rx_page(struct bnxt *bp,
> -                                    struct bnxt_rx_ring_info *rxr,
> -                                    u16 prod, gfp_t gfp)
> +static inline int bnxt_alloc_rx_netmem(struct bnxt *bp,
> +                                      struct bnxt_rx_ring_info *rxr,
> +                                      u16 prod, gfp_t gfp)
>  {
>         struct rx_bd *rxbd =
>                 &rxr->rx_agg_desc_ring[RX_AGG_RING(bp, prod)][RX_IDX(prod)];
>         struct bnxt_sw_rx_agg_bd *rx_agg_buf;
> -       struct page *page;
> +       netmem_ref netmem;
>         dma_addr_t mapping;
>         u16 sw_prod = rxr->rx_sw_agg_prod;
>         unsigned int offset = 0;
>
> -       page = __bnxt_alloc_rx_page(bp, &mapping, rxr, &offset, gfp);
> +       netmem = __bnxt_alloc_rx_netmem(bp, &mapping, rxr, &offset, gfp);

Does __bnxt_alloc_rx_page become dead code after this change? Or is it
still used for something?

>
> -       if (!page)
> +       if (!netmem)
>                 return -ENOMEM;
>
>         if (unlikely(test_bit(sw_prod, rxr->rx_agg_bmap)))
> @@ -996,7 +1013,7 @@ static inline int bnxt_alloc_rx_page(struct bnxt *bp,
>         rx_agg_buf = &rxr->rx_agg_ring[sw_prod];
>         rxr->rx_sw_agg_prod = RING_RX_AGG(bp, NEXT_RX_AGG(sw_prod));
>
> -       rx_agg_buf->page = page;
> +       rx_agg_buf->netmem = netmem;
>         rx_agg_buf->offset = offset;
>         rx_agg_buf->mapping = mapping;
>         rxbd->rx_bd_haddr = cpu_to_le64(mapping);
> @@ -1044,7 +1061,7 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_cp_ring_info *cpr, u16 idx,
>                 struct rx_agg_cmp *agg;
>                 struct bnxt_sw_rx_agg_bd *cons_rx_buf, *prod_rx_buf;
>                 struct rx_bd *prod_bd;
> -               struct page *page;
> +               netmem_ref netmem;
>
>                 if (p5_tpa)
>                         agg = bnxt_get_tpa_agg_p5(bp, rxr, idx, start + i);
> @@ -1061,11 +1078,11 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_cp_ring_info *cpr, u16 idx,
>                 cons_rx_buf = &rxr->rx_agg_ring[cons];
>
>                 /* It is possible for sw_prod to be equal to cons, so
> -                * set cons_rx_buf->page to NULL first.
> +                * set cons_rx_buf->netmem to 0 first.
>                  */
> -               page = cons_rx_buf->page;
> -               cons_rx_buf->page = NULL;
> -               prod_rx_buf->page = page;
> +               netmem = cons_rx_buf->netmem;
> +               cons_rx_buf->netmem = 0;
> +               prod_rx_buf->netmem = netmem;
>                 prod_rx_buf->offset = cons_rx_buf->offset;
>
>                 prod_rx_buf->mapping = cons_rx_buf->mapping;
> @@ -1192,6 +1209,7 @@ static struct sk_buff *bnxt_rx_skb(struct bnxt *bp,
>
>  static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
>                                struct bnxt_cp_ring_info *cpr,
> +                              struct sk_buff *skb,
>                                struct skb_shared_info *shinfo,
>                                u16 idx, u32 agg_bufs, bool tpa,
>                                struct xdp_buff *xdp)
> @@ -1211,7 +1229,7 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
>                 u16 cons, frag_len;
>                 struct rx_agg_cmp *agg;
>                 struct bnxt_sw_rx_agg_bd *cons_rx_buf;
> -               struct page *page;
> +               netmem_ref netmem;
>                 dma_addr_t mapping;
>
>                 if (p5_tpa)
> @@ -1223,9 +1241,15 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
>                             RX_AGG_CMP_LEN) >> RX_AGG_CMP_LEN_SHIFT;
>
>                 cons_rx_buf = &rxr->rx_agg_ring[cons];
> -               skb_frag_fill_page_desc(frag, cons_rx_buf->page,
> -                                       cons_rx_buf->offset, frag_len);
> -               shinfo->nr_frags = i + 1;
> +               if (skb) {
> +                       skb_add_rx_frag_netmem(skb, i, cons_rx_buf->netmem,
> +                                              cons_rx_buf->offset, frag_len,
> +                                              BNXT_RX_PAGE_SIZE);
> +               } else {
> +                       skb_frag_fill_page_desc(frag, netmem_to_page(cons_rx_buf->netmem),
> +                                               cons_rx_buf->offset, frag_len);

Our intention with the whole netmem design is that drivers should
never have to call netmem_to_page(). I.e. the driver should use netmem
unaware of whether it's page or non-page underneath, to minimize the
complexity the driver needs to handle.

This netmem_to_page() call can be removed by using
skb_frag_fill_netmem_desc() instead of the page variant. But, more
importantly, why did the code change here? The code before calls
skb_frag_fill_page_desc(), but the new code sometimes calls
skb_frag_fill_netmem_desc() and sometimes calls skb_add_rx_frag_netmem().
I'm not sure why that logic changed.
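
For illustration, a minimal sketch of that hunk with the netmem-aware frag
helper swapped in (a sketch of the suggestion, reusing the variables from the
patch above, not the final code):

		if (skb) {
			skb_add_rx_frag_netmem(skb, i, cons_rx_buf->netmem,
					       cons_rx_buf->offset, frag_len,
					       BNXT_RX_PAGE_SIZE);
		} else {
			/* fill the frag without ever touching struct page */
			skb_frag_fill_netmem_desc(frag, cons_rx_buf->netmem,
						  cons_rx_buf->offset, frag_len);
			shinfo->nr_frags = i + 1;
		}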

> +                       shinfo->nr_frags = i + 1;
> +               }
>                 __clear_bit(cons, rxr->rx_agg_bmap);
>
>                 /* It is possible for bnxt_alloc_rx_page() to allocate
> @@ -1233,15 +1257,15 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
>                  * need to clear the cons entry now.
>                  */
>                 mapping = cons_rx_buf->mapping;
> -               page = cons_rx_buf->page;
> -               cons_rx_buf->page = NULL;
> +               netmem = cons_rx_buf->netmem;
> +               cons_rx_buf->netmem = 0;
>
> -               if (xdp && page_is_pfmemalloc(page))
> +               if (xdp && page_is_pfmemalloc(netmem_to_page(netmem)))

Similarly, add netmem_is_pfmemalloc to netmem.h, instead of doing a
netmem_to_page() call here I think.
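
For reference, a minimal sketch of such a helper (an assumption of how it
could look; net_iov-backed netmem never comes from the pfmemalloc reserves,
so only the page-backed case needs the existing check):

static inline bool netmem_is_pfmemalloc(netmem_ref netmem)
{
	/* net_iovs are never allocated from pfmemalloc reserves */
	if (netmem_is_net_iov(netmem))
		return false;

	return page_is_pfmemalloc(netmem_to_page(netmem));
}

The driver check above would then read "if (xdp && netmem_is_pfmemalloc(netmem))".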

>                         xdp_buff_set_frag_pfmemalloc(xdp);
>
> -               if (bnxt_alloc_rx_page(bp, rxr, prod, GFP_ATOMIC) != 0) {
> +               if (bnxt_alloc_rx_netmem(bp, rxr, prod, GFP_ATOMIC) != 0) {
>                         --shinfo->nr_frags;
> -                       cons_rx_buf->page = page;
> +                       cons_rx_buf->netmem = netmem;
>
>                         /* Update prod since possibly some pages have been
>                          * allocated already.
> @@ -1269,7 +1293,7 @@ static struct sk_buff *bnxt_rx_agg_pages_skb(struct bnxt *bp,
>         struct skb_shared_info *shinfo = skb_shinfo(skb);
>         u32 total_frag_len = 0;
>
> -       total_frag_len = __bnxt_rx_agg_pages(bp, cpr, shinfo, idx,
> +       total_frag_len = __bnxt_rx_agg_pages(bp, cpr, skb, shinfo, idx,
>                                              agg_bufs, tpa, NULL);
>         if (!total_frag_len) {
>                 skb_mark_for_recycle(skb);
> @@ -1277,9 +1301,6 @@ static struct sk_buff *bnxt_rx_agg_pages_skb(struct bnxt *bp,
>                 return NULL;
>         }
>
> -       skb->data_len += total_frag_len;
> -       skb->len += total_frag_len;
> -       skb->truesize += BNXT_RX_PAGE_SIZE * agg_bufs;
>         return skb;
>  }
>
> @@ -1294,7 +1315,7 @@ static u32 bnxt_rx_agg_pages_xdp(struct bnxt *bp,
>         if (!xdp_buff_has_frags(xdp))
>                 shinfo->nr_frags = 0;
>
> -       total_frag_len = __bnxt_rx_agg_pages(bp, cpr, shinfo,
> +       total_frag_len = __bnxt_rx_agg_pages(bp, cpr, NULL, shinfo,
>                                              idx, agg_bufs, tpa, xdp);
>         if (total_frag_len) {
>                 xdp_buff_set_frags_flag(xdp);
> @@ -3342,15 +3363,15 @@ static void bnxt_free_one_rx_agg_ring(struct bnxt *bp, struct bnxt_rx_ring_info
>
>         for (i = 0; i < max_idx; i++) {
>                 struct bnxt_sw_rx_agg_bd *rx_agg_buf = &rxr->rx_agg_ring[i];
> -               struct page *page = rx_agg_buf->page;
> +               netmem_ref netmem = rx_agg_buf->netmem;
>
> -               if (!page)
> +               if (!netmem)
>                         continue;
>
> -               rx_agg_buf->page = NULL;
> +               rx_agg_buf->netmem = 0;
>                 __clear_bit(i, rxr->rx_agg_bmap);
>
> -               page_pool_recycle_direct(rxr->page_pool, page);
> +               page_pool_put_full_netmem(rxr->page_pool, netmem, true);
>         }
>  }
>
> @@ -3608,9 +3629,11 @@ static void bnxt_free_rx_rings(struct bnxt *bp)
>
>  static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
>                                    struct bnxt_rx_ring_info *rxr,
> +                                  int queue_idx,
>                                    int numa_node)
>  {
>         struct page_pool_params pp = { 0 };
> +       struct netdev_rx_queue *rxq;
>
>         pp.pool_size = bp->rx_agg_ring_size;
>         if (BNXT_RX_PAGE_MODE(bp))
> @@ -3621,8 +3644,15 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
>         pp.dev = &bp->pdev->dev;
>         pp.dma_dir = bp->rx_dir;
>         pp.max_len = PAGE_SIZE;
> -       pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
> +       pp.order = 0;
> +
> +       rxq = __netif_get_rx_queue(bp->dev, queue_idx);
> +       if (rxq->mp_params.mp_priv)
> +               pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_ALLOW_UNREADABLE_NETMEM;

This is not the intended use of PP_FLAG_ALLOW_UNREADABLE_NETMEM.

The driver should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when it's able
to handle unreadable netmem, it should not worry about whether
rxq->mp_params.mp_priv is set or not.

You should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when HDS is enabled.
Let core figure out if mp_params.mp_priv is enabled. All the driver
needs to report is whether it's configured to be able to handle
unreadable netmem (which practically means HDS is enabled).
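
As a rough sketch of that suggestion (bnxt_hds_enabled() is a hypothetical
helper standing in for however the driver tracks its tcp-data-split/HDS
configuration):

	pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
	if (bnxt_hds_enabled(bp))
		pp.flags |= PP_FLAG_ALLOW_UNREADABLE_NETMEM;

Whether the core accepts that flag combination as-is is exactly what the rest
of the thread goes on to discuss.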

> +       else
> +               pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
>
> +       pp.queue_idx = queue_idx;
>         rxr->page_pool = page_pool_create(&pp);
>         if (IS_ERR(rxr->page_pool)) {
>                 int err = PTR_ERR(rxr->page_pool);
> @@ -3655,7 +3685,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp)
>                 cpu_node = cpu_to_node(cpu);
>                 netdev_dbg(bp->dev, "Allocating page pool for rx_ring[%d] on numa_node: %d\n",
>                            i, cpu_node);
> -               rc = bnxt_alloc_rx_page_pool(bp, rxr, cpu_node);
> +               rc = bnxt_alloc_rx_page_pool(bp, rxr, i, cpu_node);
>                 if (rc)
>                         return rc;
>
> @@ -4154,7 +4184,7 @@ static void bnxt_alloc_one_rx_ring_page(struct bnxt *bp,
>
>         prod = rxr->rx_agg_prod;
>         for (i = 0; i < bp->rx_agg_ring_size; i++) {
> -               if (bnxt_alloc_rx_page(bp, rxr, prod, GFP_KERNEL)) {
> +               if (bnxt_alloc_rx_netmem(bp, rxr, prod, GFP_KERNEL)) {
>                         netdev_warn(bp->dev, "init'ed rx ring %d with %d/%d pages only\n",
>                                     ring_nr, i, bp->rx_ring_size);
>                         break;
> @@ -15063,7 +15093,7 @@ static int bnxt_queue_mem_alloc(struct net_device *dev, void *qmem, int idx)
>         clone->rx_sw_agg_prod = 0;
>         clone->rx_next_cons = 0;
>
> -       rc = bnxt_alloc_rx_page_pool(bp, clone, rxr->page_pool->p.nid);
> +       rc = bnxt_alloc_rx_page_pool(bp, clone, idx, rxr->page_pool->p.nid);
>         if (rc)
>                 return rc;
>
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> index 48f390519c35..3cf57a3c7664 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> @@ -895,7 +895,7 @@ struct bnxt_sw_rx_bd {
>  };
>
>  struct bnxt_sw_rx_agg_bd {
> -       struct page             *page;
> +       netmem_ref              netmem;
>         unsigned int            offset;
>         dma_addr_t              mapping;
>  };
> --
> 2.34.1
>
Taehee Yoo Oct. 4, 2024, 10:34 a.m. UTC | #2
On Fri, Oct 4, 2024 at 3:43 AM Mina Almasry <almasrymina@google.com> wrote:
>

Hi Mina,
Thanks a lot for your review!

> On Thu, Oct 3, 2024 at 9:07 AM Taehee Yoo <ap420073@gmail.com> wrote:
> >
> > Currently, bnxt_en driver satisfies the requirements of Device memory
> > TCP, which is tcp-data-split.
> > So, it implements Device memory TCP for bnxt_en driver.
> >
> > From now on, the aggregation ring handles netmem_ref instead of page
> > regardless of the on/off of netmem.
> > So, for the aggregation ring, memory will be handled with the netmem
> > page_pool API instead of generic page_pool API.
> >
> > If Devmem is enabled, netmem_ref is used as-is and if Devmem is not
> > enabled, netmem_ref will be converted to page and that is used.
> >
> > Driver recognizes whether the devmem is set or unset based on the
> > mp_params.mp_priv is not NULL.
> > Only if devmem is set, it passes PP_FLAG_ALLOW_UNREADABLE_NETMEM.
> >
> > Signed-off-by: Taehee Yoo <ap420073@gmail.com>
> > ---
> >
> > v3:
> >  - Patch added
> >
> >  drivers/net/ethernet/broadcom/Kconfig     |  1 +
> >  drivers/net/ethernet/broadcom/bnxt/bnxt.c | 98 +++++++++++++++--------
> >  drivers/net/ethernet/broadcom/bnxt/bnxt.h |  2 +-
> >  3 files changed, 66 insertions(+), 35 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig
> > index 75ca3ddda1f5..f37ff12d4746 100644
> > --- a/drivers/net/ethernet/broadcom/Kconfig
> > +++ b/drivers/net/ethernet/broadcom/Kconfig
> > @@ -211,6 +211,7 @@ config BNXT
> >         select FW_LOADER
> >         select LIBCRC32C
> >         select NET_DEVLINK
> > +       select NET_DEVMEM
> >         select PAGE_POOL
> >         select DIMLIB
> >         select AUXILIARY_BUS
> > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > index 872b15842b11..64e07d247f97 100644
> > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > @@ -55,6 +55,7 @@
> >  #include <net/page_pool/helpers.h>
> >  #include <linux/align.h>
> >  #include <net/netdev_queues.h>
> > +#include <net/netdev_rx_queue.h>
> >
> >  #include "bnxt_hsi.h"
> >  #include "bnxt.h"
> > @@ -863,6 +864,22 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int budget)
> >                 bnapi->events &= ~BNXT_TX_CMP_EVENT;
> >  }
> >
> > +static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping,
> > +                                        struct bnxt_rx_ring_info *rxr,
> > +                                        unsigned int *offset,
> > +                                        gfp_t gfp)
> > +{
> > +       netmem_ref netmem;
> > +
> > +       netmem = page_pool_alloc_netmem(rxr->page_pool, GFP_ATOMIC);
> > +       if (!netmem)
> > +               return 0;
> > +       *offset = 0;
> > +
> > +       *mapping = page_pool_get_dma_addr_netmem(netmem) + *offset;
> > +       return netmem;
> > +}
> > +
> >  static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping,
> >                                          struct bnxt_rx_ring_info *rxr,
> >                                          unsigned int *offset,
> > @@ -972,21 +989,21 @@ static inline u16 bnxt_find_next_agg_idx(struct bnxt_rx_ring_info *rxr, u16 idx)
> >         return next;
> >  }
> >
> > -static inline int bnxt_alloc_rx_page(struct bnxt *bp,
> > -                                    struct bnxt_rx_ring_info *rxr,
> > -                                    u16 prod, gfp_t gfp)
> > +static inline int bnxt_alloc_rx_netmem(struct bnxt *bp,
> > +                                      struct bnxt_rx_ring_info *rxr,
> > +                                      u16 prod, gfp_t gfp)
> >  {
> >         struct rx_bd *rxbd =
> >                 &rxr->rx_agg_desc_ring[RX_AGG_RING(bp, prod)][RX_IDX(prod)];
> >         struct bnxt_sw_rx_agg_bd *rx_agg_buf;
> > -       struct page *page;
> > +       netmem_ref netmem;
> >         dma_addr_t mapping;
> >         u16 sw_prod = rxr->rx_sw_agg_prod;
> >         unsigned int offset = 0;
> >
> > -       page = __bnxt_alloc_rx_page(bp, &mapping, rxr, &offset, gfp);
> > +       netmem = __bnxt_alloc_rx_netmem(bp, &mapping, rxr, &offset, gfp);
>
> Does __bnxt_alloc_rx_page become dead code after this change? Or is it
> still used for something?

__bnxt_alloc_rx_page() is still used.

>
> >
> > -       if (!page)
> > +       if (!netmem)
> >                 return -ENOMEM;
> >
> >         if (unlikely(test_bit(sw_prod, rxr->rx_agg_bmap)))
> > @@ -996,7 +1013,7 @@ static inline int bnxt_alloc_rx_page(struct bnxt *bp,
> >         rx_agg_buf = &rxr->rx_agg_ring[sw_prod];
> >         rxr->rx_sw_agg_prod = RING_RX_AGG(bp, NEXT_RX_AGG(sw_prod));
> >
> > -       rx_agg_buf->page = page;
> > +       rx_agg_buf->netmem = netmem;
> >         rx_agg_buf->offset = offset;
> >         rx_agg_buf->mapping = mapping;
> >         rxbd->rx_bd_haddr = cpu_to_le64(mapping);
> > @@ -1044,7 +1061,7 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_cp_ring_info *cpr, u16 idx,
> >                 struct rx_agg_cmp *agg;
> >                 struct bnxt_sw_rx_agg_bd *cons_rx_buf, *prod_rx_buf;
> >                 struct rx_bd *prod_bd;
> > -               struct page *page;
> > +               netmem_ref netmem;
> >
> >                 if (p5_tpa)
> >                         agg = bnxt_get_tpa_agg_p5(bp, rxr, idx, start + i);
> > @@ -1061,11 +1078,11 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_cp_ring_info *cpr, u16 idx,
> >                 cons_rx_buf = &rxr->rx_agg_ring[cons];
> >
> >                 /* It is possible for sw_prod to be equal to cons, so
> > -                * set cons_rx_buf->page to NULL first.
> > +                * set cons_rx_buf->netmem to 0 first.
> >                  */
> > -               page = cons_rx_buf->page;
> > -               cons_rx_buf->page = NULL;
> > -               prod_rx_buf->page = page;
> > +               netmem = cons_rx_buf->netmem;
> > +               cons_rx_buf->netmem = 0;
> > +               prod_rx_buf->netmem = netmem;
> >                 prod_rx_buf->offset = cons_rx_buf->offset;
> >
> >                 prod_rx_buf->mapping = cons_rx_buf->mapping;
> > @@ -1192,6 +1209,7 @@ static struct sk_buff *bnxt_rx_skb(struct bnxt *bp,
> >
> >  static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
> >                                struct bnxt_cp_ring_info *cpr,
> > +                              struct sk_buff *skb,
> >                                struct skb_shared_info *shinfo,
> >                                u16 idx, u32 agg_bufs, bool tpa,
> >                                struct xdp_buff *xdp)
> > @@ -1211,7 +1229,7 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
> >                 u16 cons, frag_len;
> >                 struct rx_agg_cmp *agg;
> >                 struct bnxt_sw_rx_agg_bd *cons_rx_buf;
> > -               struct page *page;
> > +               netmem_ref netmem;
> >                 dma_addr_t mapping;
> >
> >                 if (p5_tpa)
> > @@ -1223,9 +1241,15 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
> >                             RX_AGG_CMP_LEN) >> RX_AGG_CMP_LEN_SHIFT;
> >
> >                 cons_rx_buf = &rxr->rx_agg_ring[cons];
> > -               skb_frag_fill_page_desc(frag, cons_rx_buf->page,
> > -                                       cons_rx_buf->offset, frag_len);
> > -               shinfo->nr_frags = i + 1;
> > +               if (skb) {
> > +                       skb_add_rx_frag_netmem(skb, i, cons_rx_buf->netmem,
> > +                                              cons_rx_buf->offset, frag_len,
> > +                                              BNXT_RX_PAGE_SIZE);
> > +               } else {
> > +                       skb_frag_fill_page_desc(frag, netmem_to_page(cons_rx_buf->netmem),
> > +                                               cons_rx_buf->offset, frag_len);
>
> Our intention with the whole netmem design is that drivers should
> never have to call netmem_to_page(). I.e. the driver should use netmem
> unaware of whether it's page or non-page underneath, to minimize
> complexity driver needs to handle.
>
> This netmem_to_page() call can be removed by using
> skb_frag_fill_netmem_desc() instead of the page variant. But, more
> improtantly, why did the code change here? The code before calls
> skb_frag_fill_page_desc, but the new code sometimes will
> skb_frag_fill_netmem_desc() and sometimes will skb_add_rx_frag_netmem.
> I'm not sure why that logic changed.

The reason skb_add_rx_frag_netmem() is used here is to set the
skb->unreadable flag. skb_frag_fill_netmem_desc() doesn't set
skb->unreadable because it doesn't handle the skb, it only handles the
frag.
As far as I know, skb->unreadable should be set to true for devmem
TCP, or am I misunderstanding something?
I tested not using skb_add_rx_frag_netmem() here, and it immediately fails.

The "if (skb)" branch is hit only on the devmem TCP path.
Normal packets and the XDP path hit the "else" branch.

I will use skb_frag_fill_netmem_desc() instead of
skb_frag_fill_page_desc() in the "else" branch.
With this change, as you said, there is no netmem_to_page() left in the
bnxt_en driver. Thanks!

>
> > +                       shinfo->nr_frags = i + 1;
> > +               }
> >                 __clear_bit(cons, rxr->rx_agg_bmap);
> >
> >                 /* It is possible for bnxt_alloc_rx_page() to allocate
> > @@ -1233,15 +1257,15 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
> >                  * need to clear the cons entry now.
> >                  */
> >                 mapping = cons_rx_buf->mapping;
> > -               page = cons_rx_buf->page;
> > -               cons_rx_buf->page = NULL;
> > +               netmem = cons_rx_buf->netmem;
> > +               cons_rx_buf->netmem = 0;
> >
> > -               if (xdp && page_is_pfmemalloc(page))
> > +               if (xdp && page_is_pfmemalloc(netmem_to_page(netmem)))
>
> Similarly, add netmem_is_pfmemalloc to netmem.h, instead of doing a
> netmem_to_page() call here I think.

Thanks, I will add netmem_is_pfmemalloc() to netmem.h in a v4 patch.

>
> >                         xdp_buff_set_frag_pfmemalloc(xdp);
> >
> > -               if (bnxt_alloc_rx_page(bp, rxr, prod, GFP_ATOMIC) != 0) {
> > +               if (bnxt_alloc_rx_netmem(bp, rxr, prod, GFP_ATOMIC) != 0) {
> >                         --shinfo->nr_frags;
> > -                       cons_rx_buf->page = page;
> > +                       cons_rx_buf->netmem = netmem;
> >
> >                         /* Update prod since possibly some pages have been
> >                          * allocated already.
> > @@ -1269,7 +1293,7 @@ static struct sk_buff *bnxt_rx_agg_pages_skb(struct bnxt *bp,
> >         struct skb_shared_info *shinfo = skb_shinfo(skb);
> >         u32 total_frag_len = 0;
> >
> > -       total_frag_len = __bnxt_rx_agg_pages(bp, cpr, shinfo, idx,
> > +       total_frag_len = __bnxt_rx_agg_pages(bp, cpr, skb, shinfo, idx,
> >                                              agg_bufs, tpa, NULL);
> >         if (!total_frag_len) {
> >                 skb_mark_for_recycle(skb);
> > @@ -1277,9 +1301,6 @@ static struct sk_buff *bnxt_rx_agg_pages_skb(struct bnxt *bp,
> >                 return NULL;
> >         }
> >
> > -       skb->data_len += total_frag_len;
> > -       skb->len += total_frag_len;
> > -       skb->truesize += BNXT_RX_PAGE_SIZE * agg_bufs;
> >         return skb;
> >  }
> >
> > @@ -1294,7 +1315,7 @@ static u32 bnxt_rx_agg_pages_xdp(struct bnxt *bp,
> >         if (!xdp_buff_has_frags(xdp))
> >                 shinfo->nr_frags = 0;
> >
> > -       total_frag_len = __bnxt_rx_agg_pages(bp, cpr, shinfo,
> > +       total_frag_len = __bnxt_rx_agg_pages(bp, cpr, NULL, shinfo,
> >                                              idx, agg_bufs, tpa, xdp);
> >         if (total_frag_len) {
> >                 xdp_buff_set_frags_flag(xdp);
> > @@ -3342,15 +3363,15 @@ static void bnxt_free_one_rx_agg_ring(struct bnxt *bp, struct bnxt_rx_ring_info
> >
> >         for (i = 0; i < max_idx; i++) {
> >                 struct bnxt_sw_rx_agg_bd *rx_agg_buf = &rxr->rx_agg_ring[i];
> > -               struct page *page = rx_agg_buf->page;
> > +               netmem_ref netmem = rx_agg_buf->netmem;
> >
> > -               if (!page)
> > +               if (!netmem)
> >                         continue;
> >
> > -               rx_agg_buf->page = NULL;
> > +               rx_agg_buf->netmem = 0;
> >                 __clear_bit(i, rxr->rx_agg_bmap);
> >
> > -               page_pool_recycle_direct(rxr->page_pool, page);
> > +               page_pool_put_full_netmem(rxr->page_pool, netmem, true);
> >         }
> >  }
> >
> > @@ -3608,9 +3629,11 @@ static void bnxt_free_rx_rings(struct bnxt *bp)
> >
> >  static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
> >                                    struct bnxt_rx_ring_info *rxr,
> > +                                  int queue_idx,
> >                                    int numa_node)
> >  {
> >         struct page_pool_params pp = { 0 };
> > +       struct netdev_rx_queue *rxq;
> >
> >         pp.pool_size = bp->rx_agg_ring_size;
> >         if (BNXT_RX_PAGE_MODE(bp))
> > @@ -3621,8 +3644,15 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
> >         pp.dev = &bp->pdev->dev;
> >         pp.dma_dir = bp->rx_dir;
> >         pp.max_len = PAGE_SIZE;
> > -       pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
> > +       pp.order = 0;
> > +
> > +       rxq = __netif_get_rx_queue(bp->dev, queue_idx);
> > +       if (rxq->mp_params.mp_priv)
> > +               pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_ALLOW_UNREADABLE_NETMEM;
>
> This is not the intended use of PP_FLAG_ALLOW_UNREADABLE_NETMEM.
>
> The driver should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when it's able
> to handle unreadable netmem, it should not worry about whether
> rxq->mp_params.mp_priv is set or not.
>
> You should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when HDS is enabled.
> Let core figure out if mp_params.mp_priv is enabled. All the driver
> needs to report is whether it's configured to be able to handle
> unreadable netmem (which practically means HDS is enabled).

The reason the branch exists here is that the PP_FLAG_ALLOW_UNREADABLE_NETMEM
flag can't be used together with PP_FLAG_DMA_SYNC_DEV.

 228         if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
 229                 /* In order to request DMA-sync-for-device the page
 230                  * needs to be mapped
 231                  */
 232                 if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
 233                         return -EINVAL;
 234
 235                 if (!pool->p.max_len)
 236                         return -EINVAL;
 237
 238                 pool->dma_sync = true;                //here
 239
 240                 /* pool->p.offset has to be set according to the address
 241                  * offset used by the DMA engine to start copying rx data
 242                  */
 243         }

If PP_FLAG_DMA_SYNC_DEV is set, pool->dma_sync is set to true.

347 int mp_dmabuf_devmem_init(struct page_pool *pool)
348 {
349         struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
350
351         if (!binding)
352                 return -EINVAL;
353
354         if (!pool->dma_map)
355                 return -EOPNOTSUPP;
356
357         if (pool->dma_sync)                      //here
358                 return -EOPNOTSUPP;
359
360         if (pool->p.order != 0)
361                 return -E2BIG;
362
363         net_devmem_dmabuf_binding_get(binding);
364         return 0;
365 }

In mp_dmabuf_devmem_init(), initialization fails when pool->dma_sync is true.

tcp-data-split can be used in normal cases, not only for the devmem TCP case.
If we enable tcp-data-split without devmem TCP, the page_pool would then be
created without PP_FLAG_DMA_SYNC_DEV.
So I think checking mp_params.mp_priv is still useful.

Thanks a lot,
Taehee Yoo

>
> > +       else
> > +               pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
> >
> > +       pp.queue_idx = queue_idx;
> >         rxr->page_pool = page_pool_create(&pp);
> >         if (IS_ERR(rxr->page_pool)) {
> >                 int err = PTR_ERR(rxr->page_pool);
> > @@ -3655,7 +3685,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp)
> >                 cpu_node = cpu_to_node(cpu);
> >                 netdev_dbg(bp->dev, "Allocating page pool for rx_ring[%d] on numa_node: %d\n",
> >                            i, cpu_node);
> > -               rc = bnxt_alloc_rx_page_pool(bp, rxr, cpu_node);
> > +               rc = bnxt_alloc_rx_page_pool(bp, rxr, i, cpu_node);
> >                 if (rc)
> >                         return rc;
> >
> > @@ -4154,7 +4184,7 @@ static void bnxt_alloc_one_rx_ring_page(struct bnxt *bp,
> >
> >         prod = rxr->rx_agg_prod;
> >         for (i = 0; i < bp->rx_agg_ring_size; i++) {
> > -               if (bnxt_alloc_rx_page(bp, rxr, prod, GFP_KERNEL)) {
> > +               if (bnxt_alloc_rx_netmem(bp, rxr, prod, GFP_KERNEL)) {
> >                         netdev_warn(bp->dev, "init'ed rx ring %d with %d/%d pages only\n",
> >                                     ring_nr, i, bp->rx_ring_size);
> >                         break;
> > @@ -15063,7 +15093,7 @@ static int bnxt_queue_mem_alloc(struct net_device *dev, void *qmem, int idx)
> >         clone->rx_sw_agg_prod = 0;
> >         clone->rx_next_cons = 0;
> >
> > -       rc = bnxt_alloc_rx_page_pool(bp, clone, rxr->page_pool->p.nid);
> > +       rc = bnxt_alloc_rx_page_pool(bp, clone, idx, rxr->page_pool->p.nid);
> >         if (rc)
> >                 return rc;
> >
> > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> > index 48f390519c35..3cf57a3c7664 100644
> > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> > @@ -895,7 +895,7 @@ struct bnxt_sw_rx_bd {
> >  };
> >
> >  struct bnxt_sw_rx_agg_bd {
> > -       struct page             *page;
> > +       netmem_ref              netmem;
> >         unsigned int            offset;
> >         dma_addr_t              mapping;
> >  };
> > --
> > 2.34.1
> >
>
>
> --
> Thanks,
> Mina
kernel test robot Oct. 5, 2024, 3:48 a.m. UTC | #3
Hi Taehee,

kernel test robot noticed the following build warnings:

[auto build test WARNING on net-next/main]

url:    https://github.com/intel-lab-lkp/linux/commits/Taehee-Yoo/bnxt_en-add-support-for-rx-copybreak-ethtool-command/20241004-000934
base:   net-next/main
patch link:    https://lore.kernel.org/r/20241003160620.1521626-8-ap420073%40gmail.com
patch subject: [PATCH net-next v3 7/7] bnxt_en: add support for device memory tcp
config: x86_64-kismet-CONFIG_NET_DEVMEM-CONFIG_BNXT-0-0 (https://download.01.org/0day-ci/archive/20241005/202410051156.r68SYo4V-lkp@intel.com/config)
reproduce: (https://download.01.org/0day-ci/archive/20241005/202410051156.r68SYo4V-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202410051156.r68SYo4V-lkp@intel.com/

kismet warnings: (new ones prefixed by >>)
>> kismet: WARNING: unmet direct dependencies detected for NET_DEVMEM when selected by BNXT
   WARNING: unmet direct dependencies detected for NET_DEVMEM
     Depends on [n]: NET [=y] && DMA_SHARED_BUFFER [=n] && GENERIC_ALLOCATOR [=y] && PAGE_POOL [=y]
     Selected by [y]:
     - BNXT [=y] && NETDEVICES [=y] && ETHERNET [=y] && NET_VENDOR_BROADCOM [=y] && PCI [=y] && PTP_1588_CLOCK_OPTIONAL [=y]
David Wei Oct. 8, 2024, 2:45 a.m. UTC | #4
On 2024-10-03 09:06, Taehee Yoo wrote:
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> index 872b15842b11..64e07d247f97 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> @@ -55,6 +55,7 @@
>  #include <net/page_pool/helpers.h>
>  #include <linux/align.h>
>  #include <net/netdev_queues.h>
> +#include <net/netdev_rx_queue.h>
>  
>  #include "bnxt_hsi.h"
>  #include "bnxt.h"
> @@ -863,6 +864,22 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int budget)
>  		bnapi->events &= ~BNXT_TX_CMP_EVENT;
>  }
>  
> +static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping,
> +					 struct bnxt_rx_ring_info *rxr,
> +					 unsigned int *offset,
> +					 gfp_t gfp)

gfp is unused

> +{
> +	netmem_ref netmem;
> +
> +	netmem = page_pool_alloc_netmem(rxr->page_pool, GFP_ATOMIC);
> +	if (!netmem)
> +		return 0;
> +	*offset = 0;
> +
> +	*mapping = page_pool_get_dma_addr_netmem(netmem) + *offset;

offset is always 0

> +	return netmem;
> +}
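
Taken together, a sketch of the allocator with both review points addressed
(pass the gfp argument through and drop the always-zero offset; the caller
would then zero its own offset) might look like:

static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping,
					 struct bnxt_rx_ring_info *rxr,
					 gfp_t gfp)
{
	netmem_ref netmem;

	netmem = page_pool_alloc_netmem(rxr->page_pool, gfp);
	if (!netmem)
		return 0;

	/* the pool maps pages/net_iovs for us (PP_FLAG_DMA_MAP) */
	*mapping = page_pool_get_dma_addr_netmem(netmem);
	return netmem;
}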
> +
>  static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping,
>  					 struct bnxt_rx_ring_info *rxr,
>  					 unsigned int *offset,

[...]

> @@ -1192,6 +1209,7 @@ static struct sk_buff *bnxt_rx_skb(struct bnxt *bp,
>  
>  static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
>  			       struct bnxt_cp_ring_info *cpr,
> +			       struct sk_buff *skb,
>  			       struct skb_shared_info *shinfo,
>  			       u16 idx, u32 agg_bufs, bool tpa,
>  			       struct xdp_buff *xdp)
> @@ -1211,7 +1229,7 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
>  		u16 cons, frag_len;
>  		struct rx_agg_cmp *agg;
>  		struct bnxt_sw_rx_agg_bd *cons_rx_buf;
> -		struct page *page;
> +		netmem_ref netmem;
>  		dma_addr_t mapping;
>  
>  		if (p5_tpa)
> @@ -1223,9 +1241,15 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
>  			    RX_AGG_CMP_LEN) >> RX_AGG_CMP_LEN_SHIFT;
>  
>  		cons_rx_buf = &rxr->rx_agg_ring[cons];
> -		skb_frag_fill_page_desc(frag, cons_rx_buf->page,
> -					cons_rx_buf->offset, frag_len);
> -		shinfo->nr_frags = i + 1;
> +		if (skb) {
> +			skb_add_rx_frag_netmem(skb, i, cons_rx_buf->netmem,
> +					       cons_rx_buf->offset, frag_len,
> +					       BNXT_RX_PAGE_SIZE);
> +		} else {
> +			skb_frag_fill_page_desc(frag, netmem_to_page(cons_rx_buf->netmem),
> +						cons_rx_buf->offset, frag_len);
> +			shinfo->nr_frags = i + 1;
> +		}

I feel like this function needs a refactor at some point to split out
the skb and xdp paths.
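
For illustration, such a split might end up as two thin helpers along these
lines (hypothetical names, signatures guessed from the current callers):

static u32 __bnxt_rx_agg_netmems_skb(struct bnxt *bp,
				     struct bnxt_cp_ring_info *cpr,
				     struct sk_buff *skb, u16 idx,
				     u32 agg_bufs, bool tpa);
static u32 __bnxt_rx_agg_netmems_xdp(struct bnxt *bp,
				     struct bnxt_cp_ring_info *cpr,
				     struct xdp_buff *xdp, u16 idx,
				     u32 agg_bufs, bool tpa);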

>  		__clear_bit(cons, rxr->rx_agg_bmap);
>  
>  		/* It is possible for bnxt_alloc_rx_page() to allocate

[...]

> @@ -3608,9 +3629,11 @@ static void bnxt_free_rx_rings(struct bnxt *bp)
>  
>  static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
>  				   struct bnxt_rx_ring_info *rxr,
> +				   int queue_idx,

To save a parameter, the index is already available in rxr->bnapi->index
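
For example, a sketch of deriving it in place (assuming rxr->bnapi is already
wired up by the time the page pool is created):

	rxq = __netif_get_rx_queue(bp->dev, rxr->bnapi->index);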

>  				   int numa_node)
>  {
>  	struct page_pool_params pp = { 0 };
> +	struct netdev_rx_queue *rxq;
>  
>  	pp.pool_size = bp->rx_agg_ring_size;
>  	if (BNXT_RX_PAGE_MODE(bp))
> @@ -3621,8 +3644,15 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
>  	pp.dev = &bp->pdev->dev;
>  	pp.dma_dir = bp->rx_dir;
>  	pp.max_len = PAGE_SIZE;
> -	pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
> +	pp.order = 0;
> +
> +	rxq = __netif_get_rx_queue(bp->dev, queue_idx);
> +	if (rxq->mp_params.mp_priv)
> +		pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_ALLOW_UNREADABLE_NETMEM;
> +	else
> +		pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
>  
> +	pp.queue_idx = queue_idx;
>  	rxr->page_pool = page_pool_create(&pp);
>  	if (IS_ERR(rxr->page_pool)) {
>  		int err = PTR_ERR(rxr->page_pool);
> @@ -3655,7 +3685,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp)
>  		cpu_node = cpu_to_node(cpu);
>  		netdev_dbg(bp->dev, "Allocating page pool for rx_ring[%d] on numa_node: %d\n",
>  			   i, cpu_node);
> -		rc = bnxt_alloc_rx_page_pool(bp, rxr, cpu_node);
> +		rc = bnxt_alloc_rx_page_pool(bp, rxr, i, cpu_node);
>  		if (rc)
>  			return rc;
>
David Wei Oct. 8, 2024, 2:57 a.m. UTC | #5
On 2024-10-04 03:34, Taehee Yoo wrote:
> On Fri, Oct 4, 2024 at 3:43 AM Mina Almasry <almasrymina@google.com> wrote:
>>> @@ -3608,9 +3629,11 @@ static void bnxt_free_rx_rings(struct bnxt *bp)
>>>
>>>  static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
>>>                                    struct bnxt_rx_ring_info *rxr,
>>> +                                  int queue_idx,
>>>                                    int numa_node)
>>>  {
>>>         struct page_pool_params pp = { 0 };
>>> +       struct netdev_rx_queue *rxq;
>>>
>>>         pp.pool_size = bp->rx_agg_ring_size;
>>>         if (BNXT_RX_PAGE_MODE(bp))
>>> @@ -3621,8 +3644,15 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
>>>         pp.dev = &bp->pdev->dev;
>>>         pp.dma_dir = bp->rx_dir;
>>>         pp.max_len = PAGE_SIZE;
>>> -       pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
>>> +       pp.order = 0;
>>> +
>>> +       rxq = __netif_get_rx_queue(bp->dev, queue_idx);
>>> +       if (rxq->mp_params.mp_priv)
>>> +               pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_ALLOW_UNREADABLE_NETMEM;
>>
>> This is not the intended use of PP_FLAG_ALLOW_UNREADABLE_NETMEM.
>>
>> The driver should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when it's able
>> to handle unreadable netmem, it should not worry about whether
>> rxq->mp_params.mp_priv is set or not.
>>
>> You should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when HDS is enabled.
>> Let core figure out if mp_params.mp_priv is enabled. All the driver
>> needs to report is whether it's configured to be able to handle
>> unreadable netmem (which practically means HDS is enabled).
> 
> The reason why the branch exists here is the PP_FLAG_ALLOW_UNREADABLE_NETMEM
> flag can't be used with PP_FLAG_DMA_SYNC_DEV.
> 
>  228         if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
>  229                 /* In order to request DMA-sync-for-device the page
>  230                  * needs to be mapped
>  231                  */
>  232                 if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
>  233                         return -EINVAL;
>  234
>  235                 if (!pool->p.max_len)
>  236                         return -EINVAL;
>  237
>  238                 pool->dma_sync = true;                //here
>  239
>  240                 /* pool->p.offset has to be set according to the address
>  241                  * offset used by the DMA engine to start copying rx data
>  242                  */
>  243         }
> 
> If PP_FLAG_DMA_SYNC_DEV is set, page->dma_sync is set to true.
> 
> 347 int mp_dmabuf_devmem_init(struct page_pool *pool)
> 348 {
> 349         struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
> 350
> 351         if (!binding)
> 352                 return -EINVAL;
> 353
> 354         if (!pool->dma_map)
> 355                 return -EOPNOTSUPP;
> 356
> 357         if (pool->dma_sync)                      //here
> 358                 return -EOPNOTSUPP;
> 359
> 360         if (pool->p.order != 0)
> 361                 return -E2BIG;
> 362
> 363         net_devmem_dmabuf_binding_get(binding);
> 364         return 0;
> 365 }
> 
> In the mp_dmabuf_devmem_init(), it fails when pool->dma_sync is true.

This won't work for io_uring zero copy into user memory. We need all
PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | PP_FLAG_ALLOW_UNREADABLE_NETMEM
set.

I agree with Mina that the driver should not be poking at the mp_priv
fields. How about setting all the flags and then letting the mp->init()
figure it out? mp_dmabuf_devmem_init() is called within page_pool_init()
so as long as it resets dma_sync if set I don't see any issues.
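
A hedged sketch of that idea, based on the mp_dmabuf_devmem_init() excerpt
quoted earlier in the thread (a proposal, not current upstream behaviour):

int mp_dmabuf_devmem_init(struct page_pool *pool)
{
	struct net_devmem_dmabuf_binding *binding = pool->mp_priv;

	if (!binding)
		return -EINVAL;

	if (!pool->dma_map)
		return -EOPNOTSUPP;

	/* dmabuf memory is never synced by the driver, so drop the
	 * sync request instead of rejecting the whole page pool.
	 */
	pool->dma_sync = false;

	if (pool->p.order != 0)
		return -E2BIG;

	net_devmem_dmabuf_binding_get(binding);
	return 0;
}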

> 
> tcp-data-split can be used for normal cases, not only devmem TCP case.
> If we enable tcp-data-split and disable devmem TCP, page_pool doesn't
> have PP_FLAG_DMA_SYNC_DEV.
> So I think mp_params.mp_priv is still useful.
> 
> Thanks a lot,
> Taehee Yoo
> 
>>
>>
>> --
>> Thanks,
>> Mina
Taehee Yoo Oct. 8, 2024, 3:54 a.m. UTC | #6
On Tue, Oct 8, 2024 at 11:45 AM David Wei <dw@davidwei.uk> wrote:
>

Hi David,
Thanks a lot for your review!

> On 2024-10-03 09:06, Taehee Yoo wrote:
> > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > index 872b15842b11..64e07d247f97 100644
> > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > @@ -55,6 +55,7 @@
> >  #include <net/page_pool/helpers.h>
> >  #include <linux/align.h>
> >  #include <net/netdev_queues.h>
> > +#include <net/netdev_rx_queue.h>
> >
> >  #include "bnxt_hsi.h"
> >  #include "bnxt.h"
> > @@ -863,6 +864,22 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int budget)
> >               bnapi->events &= ~BNXT_TX_CMP_EVENT;
> >  }
> >
> > +static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping,
> > +                                      struct bnxt_rx_ring_info *rxr,
> > +                                      unsigned int *offset,
> > +                                      gfp_t gfp)
>
> gfp is unused

I will remove the unnecessary gfp parameter in v4.

>
> > +{
> > +     netmem_ref netmem;
> > +
> > +     netmem = page_pool_alloc_netmem(rxr->page_pool, GFP_ATOMIC);
> > +     if (!netmem)
> > +             return 0;
> > +     *offset = 0;
> > +
> > +     *mapping = page_pool_get_dma_addr_netmem(netmem) + *offset;
>
> offset is always 0

Okay, I will remove this too in v4.

>
> > +     return netmem;
> > +}
> > +
> >  static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping,
> >                                        struct bnxt_rx_ring_info *rxr,
> >                                        unsigned int *offset,
>
> [...]
>
> > @@ -1192,6 +1209,7 @@ static struct sk_buff *bnxt_rx_skb(struct bnxt *bp,
> >
> >  static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
> >                              struct bnxt_cp_ring_info *cpr,
> > +                            struct sk_buff *skb,
> >                              struct skb_shared_info *shinfo,
> >                              u16 idx, u32 agg_bufs, bool tpa,
> >                              struct xdp_buff *xdp)
> > @@ -1211,7 +1229,7 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
> >               u16 cons, frag_len;
> >               struct rx_agg_cmp *agg;
> >               struct bnxt_sw_rx_agg_bd *cons_rx_buf;
> > -             struct page *page;
> > +             netmem_ref netmem;
> >               dma_addr_t mapping;
> >
> >               if (p5_tpa)
> > @@ -1223,9 +1241,15 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
> >                           RX_AGG_CMP_LEN) >> RX_AGG_CMP_LEN_SHIFT;
> >
> >               cons_rx_buf = &rxr->rx_agg_ring[cons];
> > -             skb_frag_fill_page_desc(frag, cons_rx_buf->page,
> > -                                     cons_rx_buf->offset, frag_len);
> > -             shinfo->nr_frags = i + 1;
> > +             if (skb) {
> > +                     skb_add_rx_frag_netmem(skb, i, cons_rx_buf->netmem,
> > +                                            cons_rx_buf->offset, frag_len,
> > +                                            BNXT_RX_PAGE_SIZE);
> > +             } else {
> > +                     skb_frag_fill_page_desc(frag, netmem_to_page(cons_rx_buf->netmem),
> > +                                             cons_rx_buf->offset, frag_len);
> > +                     shinfo->nr_frags = i + 1;
> > +             }
>
> I feel like this function needs a refactor at some point to split out
> the skb and xdp paths.

Okay, I will add __bnxt_rx_agg_netmem() in the v4 patch.

>
> >               __clear_bit(cons, rxr->rx_agg_bmap);
> >
> >               /* It is possible for bnxt_alloc_rx_page() to allocate
>
> [...]
>
> > @@ -3608,9 +3629,11 @@ static void bnxt_free_rx_rings(struct bnxt *bp)
> >
> >  static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
> >                                  struct bnxt_rx_ring_info *rxr,
> > +                                int queue_idx,
>
> To save a parameter, the index is available already in rxr->bnapi->index

Okay, I will also remove the queue_idx parameter in v4.

>
> >                                  int numa_node)
> >  {
> >       struct page_pool_params pp = { 0 };
> > +     struct netdev_rx_queue *rxq;
> >
> >       pp.pool_size = bp->rx_agg_ring_size;
> >       if (BNXT_RX_PAGE_MODE(bp))
> > @@ -3621,8 +3644,15 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
> >       pp.dev = &bp->pdev->dev;
> >       pp.dma_dir = bp->rx_dir;
> >       pp.max_len = PAGE_SIZE;
> > -     pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
> > +     pp.order = 0;
> > +
> > +     rxq = __netif_get_rx_queue(bp->dev, queue_idx);
> > +     if (rxq->mp_params.mp_priv)
> > +             pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_ALLOW_UNREADABLE_NETMEM;
> > +     else
> > +             pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
> >
> > +     pp.queue_idx = queue_idx;
> >       rxr->page_pool = page_pool_create(&pp);
> >       if (IS_ERR(rxr->page_pool)) {
> >               int err = PTR_ERR(rxr->page_pool);
> > @@ -3655,7 +3685,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp)
> >               cpu_node = cpu_to_node(cpu);
> >               netdev_dbg(bp->dev, "Allocating page pool for rx_ring[%d] on numa_node: %d\n",
> >                          i, cpu_node);
> > -             rc = bnxt_alloc_rx_page_pool(bp, rxr, cpu_node);
> > +             rc = bnxt_alloc_rx_page_pool(bp, rxr, i, cpu_node);
> >               if (rc)
> >                       return rc;
> >

Thanks a lot for catching things,
I will send v4 if there is no problem after some tests.

Thanks!
Taehee Yoo
Taehee Yoo Oct. 8, 2024, 3:58 a.m. UTC | #7
On Tue, Oct 8, 2024 at 12:54 PM Taehee Yoo <ap420073@gmail.com> wrote:
>
> On Tue, Oct 8, 2024 at 11:45 AM David Wei <dw@davidwei.uk> wrote:
> >
>
> Hi David,
> Thanks a lot for your review!
>
> > On 2024-10-03 09:06, Taehee Yoo wrote:
> > > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > > index 872b15842b11..64e07d247f97 100644
> > > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > > @@ -55,6 +55,7 @@
> > >  #include <net/page_pool/helpers.h>
> > >  #include <linux/align.h>
> > >  #include <net/netdev_queues.h>
> > > +#include <net/netdev_rx_queue.h>
> > >
> > >  #include "bnxt_hsi.h"
> > >  #include "bnxt.h"
> > > @@ -863,6 +864,22 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int budget)
> > >               bnapi->events &= ~BNXT_TX_CMP_EVENT;
> > >  }
> > >
> > > +static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping,
> > > +                                      struct bnxt_rx_ring_info *rxr,
> > > +                                      unsigned int *offset,
> > > +                                      gfp_t gfp)
> >
> > gfp is unused
>
> I will remove unnecessary gfp parameter in v4.

Oh sorry,
I will use the gfp parameter rather than remove it.

>
> >
> > > +{
> > > +     netmem_ref netmem;
> > > +
> > > +     netmem = page_pool_alloc_netmem(rxr->page_pool, GFP_ATOMIC);
> > > +     if (!netmem)
> > > +             return 0;
> > > +     *offset = 0;
> > > +
> > > +     *mapping = page_pool_get_dma_addr_netmem(netmem) + *offset;
> >
> > offset is always 0
>
> Okay, I will remove this too in v4.
>
> >
> > > +     return netmem;
> > > +}
> > > +
> > >  static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping,
> > >                                        struct bnxt_rx_ring_info *rxr,
> > >                                        unsigned int *offset,
> >
> > [...]
> >
> > > @@ -1192,6 +1209,7 @@ static struct sk_buff *bnxt_rx_skb(struct bnxt *bp,
> > >
> > >  static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
> > >                              struct bnxt_cp_ring_info *cpr,
> > > +                            struct sk_buff *skb,
> > >                              struct skb_shared_info *shinfo,
> > >                              u16 idx, u32 agg_bufs, bool tpa,
> > >                              struct xdp_buff *xdp)
> > > @@ -1211,7 +1229,7 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
> > >               u16 cons, frag_len;
> > >               struct rx_agg_cmp *agg;
> > >               struct bnxt_sw_rx_agg_bd *cons_rx_buf;
> > > -             struct page *page;
> > > +             netmem_ref netmem;
> > >               dma_addr_t mapping;
> > >
> > >               if (p5_tpa)
> > > @@ -1223,9 +1241,15 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
> > >                           RX_AGG_CMP_LEN) >> RX_AGG_CMP_LEN_SHIFT;
> > >
> > >               cons_rx_buf = &rxr->rx_agg_ring[cons];
> > > -             skb_frag_fill_page_desc(frag, cons_rx_buf->page,
> > > -                                     cons_rx_buf->offset, frag_len);
> > > -             shinfo->nr_frags = i + 1;
> > > +             if (skb) {
> > > +                     skb_add_rx_frag_netmem(skb, i, cons_rx_buf->netmem,
> > > +                                            cons_rx_buf->offset, frag_len,
> > > +                                            BNXT_RX_PAGE_SIZE);
> > > +             } else {
> > > +                     skb_frag_fill_page_desc(frag, netmem_to_page(cons_rx_buf->netmem),
> > > +                                             cons_rx_buf->offset, frag_len);
> > > +                     shinfo->nr_frags = i + 1;
> > > +             }
> >
> > I feel like this function needs a refactor at some point to split out
> > the skb and xdp paths.
>
> Okay, I will add __bnxt_rx_agg_netmem() in v4 patch.
>
> >
> > >               __clear_bit(cons, rxr->rx_agg_bmap);
> > >
> > >               /* It is possible for bnxt_alloc_rx_page() to allocate
> >
> > [...]
> >
> > > @@ -3608,9 +3629,11 @@ static void bnxt_free_rx_rings(struct bnxt *bp)
> > >
> > >  static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
> > >                                  struct bnxt_rx_ring_info *rxr,
> > > +                                int queue_idx,
> >
> > To save a parameter, the index is available already in rxr->bnapi->index
>
> Okay, I also remove the queue_idx parameter in v4.
>
> >
> > >                                  int numa_node)
> > >  {
> > >       struct page_pool_params pp = { 0 };
> > > +     struct netdev_rx_queue *rxq;
> > >
> > >       pp.pool_size = bp->rx_agg_ring_size;
> > >       if (BNXT_RX_PAGE_MODE(bp))
> > > @@ -3621,8 +3644,15 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
> > >       pp.dev = &bp->pdev->dev;
> > >       pp.dma_dir = bp->rx_dir;
> > >       pp.max_len = PAGE_SIZE;
> > > -     pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
> > > +     pp.order = 0;
> > > +
> > > +     rxq = __netif_get_rx_queue(bp->dev, queue_idx);
> > > +     if (rxq->mp_params.mp_priv)
> > > +             pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_ALLOW_UNREADABLE_NETMEM;
> > > +     else
> > > +             pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
> > >
> > > +     pp.queue_idx = queue_idx;
> > >       rxr->page_pool = page_pool_create(&pp);
> > >       if (IS_ERR(rxr->page_pool)) {
> > >               int err = PTR_ERR(rxr->page_pool);
> > > @@ -3655,7 +3685,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp)
> > >               cpu_node = cpu_to_node(cpu);
> > >               netdev_dbg(bp->dev, "Allocating page pool for rx_ring[%d] on numa_node: %d\n",
> > >                          i, cpu_node);
> > > -             rc = bnxt_alloc_rx_page_pool(bp, rxr, cpu_node);
> > > +             rc = bnxt_alloc_rx_page_pool(bp, rxr, i, cpu_node);
> > >               if (rc)
> > >                       return rc;
> > >
>
> Thanks a lot for catching things,
> I will send v4 if there is no problem after some tests.
>
> Thanks!
> Taehee Yoo
Jakub Kicinski Oct. 8, 2024, 7:50 p.m. UTC | #8
On Fri, 4 Oct 2024 19:34:45 +0900 Taehee Yoo wrote:
> > Our intention with the whole netmem design is that drivers should
> > never have to call netmem_to_page(). I.e. the driver should use netmem
> > unaware of whether it's page or non-page underneath, to minimize
> > complexity driver needs to handle.
> >
> > This netmem_to_page() call can be removed by using
> > skb_frag_fill_netmem_desc() instead of the page variant. But, more
> > importantly, why did the code change here? The code before calls
> > skb_frag_fill_page_desc, but the new code sometimes will
> > skb_frag_fill_netmem_desc() and sometimes will skb_add_rx_frag_netmem.
> > I'm not sure why that logic changed.  
> 
> The reason why skb_add_rx_frag_netmem() is used here is to set the
> skb->unreadable flag. skb_frag_fill_netmem_desc() doesn't set
> skb->unreadable because it doesn't handle the skb, it only handles the frag.
> As far as I know, skb->unreadable should be set to true for devmem
> TCP, or am I misunderstanding?
> I tested not using skb_add_rx_frag_netmem() here, and it
> immediately fails.

Yes, but netmem_ref can be either a net_iov or a normal page,
and skb_add_rx_frag_netmem() and similar helpers should automatically
set skb->unreadable or not.

IOW you should be able to always use netmem-aware APIs, no?
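
For illustration, a minimal sketch of what the always-netmem skb path could
look like (an assumption about the eventual driver shape, not the posted
patch; skb, i, frag_len and rxr come from the surrounding driver context,
and the helpers are the ones already referenced in this thread):

	netmem_ref netmem;

	/* the same calls work whether the pool hands back a page-backed
	 * netmem or a net_iov from a memory provider
	 */
	netmem = page_pool_alloc_netmem(rxr->page_pool, GFP_ATOMIC);
	if (!netmem)
		return -ENOMEM;

	/* marks the skb unreadable when the netmem is a net_iov */
	skb_add_rx_frag_netmem(skb, i, netmem, 0, frag_len,
			       BNXT_RX_PAGE_SIZE);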

> > This is not the intended use of PP_FLAG_ALLOW_UNREADABLE_NETMEM.
> >
> > The driver should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when it's able
> > to handle unreadable netmem, it should not worry about whether
> > rxq->mp_params.mp_priv is set or not.
> >
> > You should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when HDS is enabled.
> > Let core figure out if mp_params.mp_priv is enabled. All the driver
> > needs to report is whether it's configured to be able to handle
> > unreadable netmem (which practically means HDS is enabled).  
> 
> The reason why the branch exists here is the PP_FLAG_ALLOW_UNREADABLE_NETMEM
> flag can't be used with PP_FLAG_DMA_SYNC_DEV.

Hm. Isn't the existing check the wrong way around? Is the driver
supposed to sync the buffers for device before passing them down?
Taehee Yoo Oct. 9, 2024, 3:02 p.m. UTC | #9
On Tue, Oct 8, 2024 at 11:57 AM David Wei <dw@davidwei.uk> wrote:
>
> On 2024-10-04 03:34, Taehee Yoo wrote:
> > On Fri, Oct 4, 2024 at 3:43 AM Mina Almasry <almasrymina@google.com> wrote:
> >>> @@ -3608,9 +3629,11 @@ static void bnxt_free_rx_rings(struct bnxt *bp)
> >>>
> >>>  static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
> >>>                                    struct bnxt_rx_ring_info *rxr,
> >>> +                                  int queue_idx,
> >>>                                    int numa_node)
> >>>  {
> >>>         struct page_pool_params pp = { 0 };
> >>> +       struct netdev_rx_queue *rxq;
> >>>
> >>>         pp.pool_size = bp->rx_agg_ring_size;
> >>>         if (BNXT_RX_PAGE_MODE(bp))
> >>> @@ -3621,8 +3644,15 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
> >>>         pp.dev = &bp->pdev->dev;
> >>>         pp.dma_dir = bp->rx_dir;
> >>>         pp.max_len = PAGE_SIZE;
> >>> -       pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
> >>> +       pp.order = 0;
> >>> +
> >>> +       rxq = __netif_get_rx_queue(bp->dev, queue_idx);
> >>> +       if (rxq->mp_params.mp_priv)
> >>> +               pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_ALLOW_UNREADABLE_NETMEM;
> >>
> >> This is not the intended use of PP_FLAG_ALLOW_UNREADABLE_NETMEM.
> >>
> >> The driver should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when it's able
> >> to handle unreadable netmem, it should not worry about whether
> >> rxq->mp_params.mp_priv is set or not.
> >>
> >> You should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when HDS is enabled.
> >> Let core figure out if mp_params.mp_priv is enabled. All the driver
> >> needs to report is whether it's configured to be able to handle
> >> unreadable netmem (which practically means HDS is enabled).
> >
> > The reason why the branch exists here is the PP_FLAG_ALLOW_UNREADABLE_NETMEM
> > flag can't be used with PP_FLAG_DMA_SYNC_DEV.
> >
> >  228         if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
> >  229                 /* In order to request DMA-sync-for-device the page
> >  230                  * needs to be mapped
> >  231                  */
> >  232                 if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
> >  233                         return -EINVAL;
> >  234
> >  235                 if (!pool->p.max_len)
> >  236                         return -EINVAL;
> >  237
> >  238                 pool->dma_sync = true;                //here
> >  239
> >  240                 /* pool->p.offset has to be set according to the address
> >  241                  * offset used by the DMA engine to start copying rx data
> >  242                  */
> >  243         }
> >
> > If PP_FLAG_DMA_SYNC_DEV is set, page->dma_sync is set to true.
> >
> > 347 int mp_dmabuf_devmem_init(struct page_pool *pool)
> > 348 {
> > 349         struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
> > 350
> > 351         if (!binding)
> > 352                 return -EINVAL;
> > 353
> > 354         if (!pool->dma_map)
> > 355                 return -EOPNOTSUPP;
> > 356
> > 357         if (pool->dma_sync)                      //here
> > 358                 return -EOPNOTSUPP;
> > 359
> > 360         if (pool->p.order != 0)
> > 361                 return -E2BIG;
> > 362
> > 363         net_devmem_dmabuf_binding_get(binding);
> > 364         return 0;
> > 365 }
> >
> > In the mp_dmabuf_devmem_init(), it fails when pool->dma_sync is true.
>
> This won't work for io_uring zero copy into user memory. We need all
> PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | PP_FLAG_ALLOW_UNREADABLE_NETMEM
> set.
>
> I agree with Mina that the driver should not be poking at the mp_priv
> fields. How about setting all the flags and then letting the mp->init()
> figure it out? mp_dmabuf_devmem_init() is called within page_pool_init()
> so as long as it resets dma_sync if set I don't see any issues.
>

Ah, I hadn't considered that failing on PP_FLAG_DMA_SYNC_DEV
for dmabuf might be wrong.
IIUC this flag indicates syncing between device and CPU.
But device memory TCP is not related to syncing between device and CPU.
So, I think we need to remove this condition check from the core.
What do you think about it?

> >
> > tcp-data-split can be used for normal cases, not only devmem TCP case.
> > If we enable tcp-data-split and disable devmem TCP, page_pool doesn't
> > have PP_FLAG_DMA_SYNC_DEV.
> > So I think mp_params.mp_priv is still useful.
> >
> > Thanks a lot,
> > Taehee Yoo
> >
> >>
> >>
> >> --
> >> Thanks,
> >> Mina
Taehee Yoo Oct. 9, 2024, 3:37 p.m. UTC | #10
On Wed, Oct 9, 2024 at 4:50 AM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Fri, 4 Oct 2024 19:34:45 +0900 Taehee Yoo wrote:
> > > Our intention with the whole netmem design is that drivers should
> > > never have to call netmem_to_page(). I.e. the driver should use netmem
> > > unaware of whether it's page or non-page underneath, to minimize
> > > complexity driver needs to handle.
> > >
> > > This netmem_to_page() call can be removed by using
> > > skb_frag_fill_netmem_desc() instead of the page variant. But, more
> > > importantly, why did the code change here? The code before calls
> > > skb_frag_fill_page_desc, but the new code sometimes will
> > > skb_frag_fill_netmem_desc() and sometimes will skb_add_rx_frag_netmem.
> > > I'm not sure why that logic changed.
> >
> > The reason why skb_add_rx_frag_netmem() is used here is to set the
> > skb->unreadable flag. skb_frag_fill_netmem_desc() doesn't set
> > skb->unreadable because it doesn't handle the skb, it only handles the frag.
> > As far as I know, skb->unreadable should be set to true for devmem
> > TCP, or am I misunderstanding?
> > I tested not using skb_add_rx_frag_netmem() here, and it
> > immediately fails.
>
> Yes, but netmem_ref can be either a net_iov or a normal page,
> and skb_add_rx_frag_netmem() and similar helpers should automatically
> set skb->unreadable or not.
>
> IOW you should be able to always use netmem-aware APIs, no?

I'm not sure updating the skb->unreadable flag is possible when only the
frag is available, because frag APIs, unlike skb_add_rx_frag_netmem(),
receive only the frag, not the skb.
How about an additional API to update the skb->unreadable flag?
skb_update_unreadable() or skb_update_netmem()?
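
Something like this, as a purely hypothetical sketch (the helper name and
its behavior are only an illustration of the idea, not an existing API):

	/* hypothetical: propagate unreadability to the skb for a frag
	 * that was filled without going through skb_add_rx_frag_netmem()
	 */
	static inline void skb_update_unreadable(struct sk_buff *skb,
						 netmem_ref netmem)
	{
		if (netmem_is_net_iov(netmem))
			skb->unreadable = true;
	}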

>
> > > This is not the intended use of PP_FLAG_ALLOW_UNREADABLE_NETMEM.
> > >
> > > The driver should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when it's able
> > > to handle unreadable netmem, it should not worry about whether
> > > rxq->mp_params.mp_priv is set or not.
> > >
> > > You should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when HDS is enabled.
> > > Let core figure out if mp_params.mp_priv is enabled. All the driver
> > > needs to report is whether it's configured to be able to handle
> > > unreadable netmem (which practically means HDS is enabled).
> >
> > The reason why the branch exists here is the PP_FLAG_ALLOW_UNREADABLE_NETMEM
> > flag can't be used with PP_FLAG_DMA_SYNC_DEV.
>
> Hm. Isn't the existing check the wrong way around? Is the driver
> supposed to sync the buffers for device before passing them down?

I hadn't considered that failing on PP_FLAG_DMA_SYNC_DEV
for dmabuf may be wrong.
I think device memory TCP is not related to this flag.
So the device memory TCP core API should not return failure when
the PP_FLAG_DMA_SYNC_DEV flag is set.
How about removing this condition check from the device memory TCP core?
Jakub Kicinski Oct. 10, 2024, 12:01 a.m. UTC | #11
On Thu, 10 Oct 2024 00:37:49 +0900 Taehee Yoo wrote:
> > Yes, but netmem_ref can be either a net_iov or a normal page,
> > and skb_add_rx_frag_netmem() and similar helpers should automatically
> > set skb->unreadable or not.
> >
> > IOW you should be able to always use netmem-aware APIs, no?  
> 
> I'm not sure the update skb->unreadable flag is possible because
> frag API like skb_add_rx_frag_netmem(), receives only frag, not skb.
> How about an additional API to update skb->unreadable flag?
> skb_update_unreadable() or skb_update_netmem()?

Ah, the case where we don't get an skb is because we're just building an
XDP frame at that stage. And XDP can't be netmem.

In that case switching to skb_frag_fill_netmem_desc() should be enough.
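
Concretely, a minimal sketch of that change in the no-skb (XDP) branch of
__bnxt_rx_agg_pages(), as an assumption about the v4 shape rather than the
posted diff:

	} else {
		/* XDP path: fill the frag with the netmem-aware variant,
		 * so the driver never needs netmem_to_page() here
		 */
		skb_frag_fill_netmem_desc(frag, cons_rx_buf->netmem,
					  cons_rx_buf->offset, frag_len);
		shinfo->nr_frags = i + 1;
	}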

> > > The reason why the branch exists here is the PP_FLAG_ALLOW_UNREADABLE_NETMEM
> > > flag can't be used with PP_FLAG_DMA_SYNC_DEV.  
> >
> > Hm. Isn't the existing check the wrong way around? Is the driver
> > supposed to sync the buffers for device before passing them down?  
> 
> I haven't thought the failure of PP_FLAG_DMA_SYNC_DEV
> for dmabuf may be wrong.
> I think device memory TCP is not related to this flag.
> So device memory TCP core API should not return failure when
> PP_FLAG_DMA_SYNC_DEV flag is set.
> How about removing this condition check code in device memory TCP core?

I think we need to invert the check..
Mina, WDYT?

diff --git a/net/core/devmem.c b/net/core/devmem.c
index 11b91c12ee11..c5cace3f9831 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -331,12 +331,6 @@ int mp_dmabuf_devmem_init(struct page_pool *pool)
 	if (!binding)
 		return -EINVAL;
 
-	if (!pool->dma_map)
-		return -EOPNOTSUPP;
-
-	if (pool->dma_sync)
-		return -EOPNOTSUPP;
-
 	if (pool->p.order != 0)
 		return -E2BIG;
 
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index a813d30d2135..c8dbbf262de3 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -287,6 +287,12 @@ static int page_pool_init(struct page_pool *pool,
 	}
 
 	if (pool->mp_priv) {
+		if (!pool->dma_map || !pool->dma_sync)
+			return -EOPNOTSUPP;
+
+		/* Memory provider is responsible for syncing the pages. */
+		pool->dma_sync = 0;
+
 		err = mp_dmabuf_devmem_init(pool);
 		if (err) {
 			pr_warn("%s() mem-provider init failed %d\n", __func__,
Mina Almasry Oct. 10, 2024, 5:44 p.m. UTC | #12
On Wed, Oct 9, 2024 at 5:01 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Thu, 10 Oct 2024 00:37:49 +0900 Taehee Yoo wrote:
> > > Yes, but netmem_ref can be either a net_iov or a normal page,
> > > and skb_add_rx_frag_netmem() and similar helpers should automatically
> > > set skb->unreadable or not.
> > >
> > > IOW you should be able to always use netmem-aware APIs, no?
> >
> > I'm not sure the update skb->unreadable flag is possible because
> > frag API like skb_add_rx_frag_netmem(), receives only frag, not skb.
> > How about an additional API to update skb->unreadable flag?
> > skb_update_unreadable() or skb_update_netmem()?
>
> Ah, the case where we don't get skb is because we're just building XDP
> frame at that stage. And XDP can't be netmem.
>
> In that case switching to skb_frag_fill_netmem_desc() should be enough.
>
> > > > The reason why the branch exists here is the PP_FLAG_ALLOW_UNREADABLE_NETMEM
> > > > flag can't be used with PP_FLAG_DMA_SYNC_DEV.
> > >
> > > Hm. Isn't the existing check the wrong way around? Is the driver
> > > supposed to sync the buffers for device before passing them down?
> >
> > I haven't thought the failure of PP_FLAG_DMA_SYNC_DEV
> > for dmabuf may be wrong.
> > I think device memory TCP is not related to this flag.
> > So device memory TCP core API should not return failure when
> > PP_FLAG_DMA_SYNC_DEV flag is set.
> > How about removing this condition check code in device memory TCP core?
>
> I think we need to invert the check..
> Mina, WDYT?
>

On a closer look, my feeling is similar to Taehee's:
PP_FLAG_DMA_SYNC_DEV should be orthogonal to memory providers. The
memory providers allocate the memory and provide the dma-addr, but
need not dma-sync the dma-addr, right? The driver can sync the
dma-addr itself, or it can delegate the syncing to the pp via
PP_FLAG_DMA_SYNC_DEV. AFAICT I think the check should be removed,
not inverted, but I could be missing something.
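
As a rough sketch of the two options being described (driver-side only and
illustrative, not a proposed patch; pp, pdev and mapping stand in for the
driver's page_pool_params, PCI device and buffer DMA address):

	/* option 1: delegate sync-for-device to the page pool */
	pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
	pp.max_len = PAGE_SIZE;
	pp.offset = 0;

	/* option 2: no PP_FLAG_DMA_SYNC_DEV, the driver syncs the buffer
	 * itself before posting it to the NIC
	 */
	dma_sync_single_range_for_device(&pdev->dev, mapping, 0,
					 BNXT_RX_PAGE_SIZE, DMA_FROM_DEVICE);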
Jakub Kicinski Oct. 11, 2024, 1:34 a.m. UTC | #13
On Thu, 10 Oct 2024 10:44:38 -0700 Mina Almasry wrote:
> > > I haven't thought the failure of PP_FLAG_DMA_SYNC_DEV
> > > for dmabuf may be wrong.
> > > I think device memory TCP is not related to this flag.
> > > So device memory TCP core API should not return failure when
> > > PP_FLAG_DMA_SYNC_DEV flag is set.
> > > How about removing this condition check code in device memory TCP core?  
> >
> > I think we need to invert the check..
> > Mina, WDYT?
> 
> On a closer look, my feeling is similar to Taehee,
> PP_FLAG_DMA_SYNC_DEV should be orthogonal to memory providers. The
> memory providers allocate the memory and provide the dma-addr, but
> need not dma-sync the dma-addr, right? The driver can sync the
> dma-addr if it wants and the driver can delegate the syncing to the pp
> via PP_FLAG_DMA_SYNC_DEV if it wants. AFAICT I think the check should
> be removed, not inverted, but I could be missing something.

I don't know much about dmabuf, but it hinges on whether
doing DMA sync for device on a dmabuf address is:
 - a good thing
 - a noop
 - a bad thing

If it's a good thing or a noop - agreed.

Similar question for the sync for CPU.

I agree that intuitively it should be all fine. But the fact that dmabuf
has a bespoke API for accessing the memory by the CPU makes me worried
that there may be assumptions about these addresses not getting
randomly fed into the normal DMA API..
Mina Almasry Oct. 11, 2024, 5:33 p.m. UTC | #14
On Thu, Oct 10, 2024 at 6:34 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Thu, 10 Oct 2024 10:44:38 -0700 Mina Almasry wrote:
> > > > I haven't thought the failure of PP_FLAG_DMA_SYNC_DEV
> > > > for dmabuf may be wrong.
> > > > I think device memory TCP is not related to this flag.
> > > > So device memory TCP core API should not return failure when
> > > > PP_FLAG_DMA_SYNC_DEV flag is set.
> > > > How about removing this condition check code in device memory TCP core?
> > >
> > > I think we need to invert the check..
> > > Mina, WDYT?
> >
> > On a closer look, my feeling is similar to Taehee,
> > PP_FLAG_DMA_SYNC_DEV should be orthogonal to memory providers. The
> > memory providers allocate the memory and provide the dma-addr, but
> > need not dma-sync the dma-addr, right? The driver can sync the
> > dma-addr if it wants and the driver can delegate the syncing to the pp
> > via PP_FLAG_DMA_SYNC_DEV if it wants. AFAICT I think the check should
> > be removed, not inverted, but I could be missing something.
>
> I don't know much about dmabuf but it hinges on the question whether
> doing DMA sync for device on a dmabuf address is :
>  - a good thing
>  - a noop
>  - a bad thing
>
> If it's a good thing or a noop - agreed.
>
> Similar question for the sync for CPU.
>
> I agree that intuitively it should be all fine. But the fact that dmabuf
> has a bespoke API for accessing the memory by the CPU makes me worried
> that there may be assumptions about these addresses not getting
> randomly fed into the normal DMA API..

Sorry, I'm also a bit unsure what the right thing to do here is. The
code that we've been running in GVE does a dma-sync for cpu
unconditionally on RX for dma-buf and non-dmabuf dma-addrs, and we
haven't been seeing issues. It never does dma-sync for device.

My first question is: why is dma-sync for device needed on the RX path
at all for some drivers in the first place? For incoming (non-dmabuf)
data, the data is written by the device and read by the cpu, so sync
for cpu is really what's needed. Is the sync for device for XDP? Or is
it that buffers should be dma-sync'd for device before they are
re-posted to the NIC?

Christian/Jason, sorry, quick question: are
dma_sync_single_for_{device|cpu} needed or wanted when the dma-addrs
come from a dma-buf? Or are these dma-addrs to be treated like any
other, with the normal dma_sync_for_{device|cpu} rules?

--
Thanks,
Mina
Jason Gunthorpe Oct. 11, 2024, 11:42 p.m. UTC | #15
On Fri, Oct 11, 2024 at 10:33:43AM -0700, Mina Almasry wrote:
> On Thu, Oct 10, 2024 at 6:34 PM Jakub Kicinski <kuba@kernel.org> wrote:
> >
> > On Thu, 10 Oct 2024 10:44:38 -0700 Mina Almasry wrote:
> > > > > I haven't thought the failure of PP_FLAG_DMA_SYNC_DEV
> > > > > for dmabuf may be wrong.
> > > > > I think device memory TCP is not related to this flag.
> > > > > So device memory TCP core API should not return failure when
> > > > > PP_FLAG_DMA_SYNC_DEV flag is set.
> > > > > How about removing this condition check code in device memory TCP core?
> > > >
> > > > I think we need to invert the check..
> > > > Mina, WDYT?
> > >
> > > On a closer look, my feeling is similar to Taehee,
> > > PP_FLAG_DMA_SYNC_DEV should be orthogonal to memory providers. The
> > > memory providers allocate the memory and provide the dma-addr, but
> > > need not dma-sync the dma-addr, right? The driver can sync the
> > > dma-addr if it wants and the driver can delegate the syncing to the pp
> > > via PP_FLAG_DMA_SYNC_DEV if it wants. AFAICT I think the check should
> > > be removed, not inverted, but I could be missing something.
> >
> > I don't know much about dmabuf but it hinges on the question whether
> > doing DMA sync for device on a dmabuf address is :
> >  - a good thing
> >  - a noop
> >  - a bad thing
> >
> > If it's a good thing or a noop - agreed.
> >
> > Similar question for the sync for CPU.
> >
> > I agree that intuitively it should be all fine. But the fact that dmabuf
> > has a bespoke API for accessing the memory by the CPU makes me worried
> > that there may be assumptions about these addresses not getting
> > randomly fed into the normal DMA API..
> 
> Sorry I'm also a bit unsure what is the right thing to do here. The
> code that we've been running in GVE does a dma-sync for cpu
> unconditionally on RX for dma-buf and non-dmabuf dma-addrs and we
> haven't been seeing issues. It never does dma-sync for device.
> 
> My first question is why is dma-sync for device needed on RX path at
> all for some drivers in the first place? For incoming (non-dmabuf)
> data, the data is written by the device and read by the cpu, so sync
> for cpu is really what's needed. Is the sync for device for XDP? Or is
> it that buffers should be dma-syncd for device before they are
> re-posted to the NIC?
> 
> Christian/Jason, sorry quick question: are
> dma_sync_single_for_{device|cpu} needed or wanted when the dma-addrs
> come from a dma-buf? Or these dma-addrs to be treated like any other
> with the normal dma_sync_for_{device|cpu} rules?

Um, I think because dma-buf hacks things up and generates illegal
scatterlist entries with weird dma_map_resource() addresses for the
typical P2P case, the dma sync API should not be used on those things.

However, there is no way to know if the dma-buf has done this, and
there are valid cases where the scatterlist is not ill-formed and the
sync is necessary.

We are getting so close to being able to start fixing these API
issues in dmabuf, I hope next cycle we can begin.. Fingers crossed.

From a CPU architecture perspective you do not need to cache flush PCI
MMIO BAR memory, and perhaps doing so might be problematic on some
arches (???). But you do need to flush normal cacheable CPU memory if
that is in the DMA buf.

Jason
Mina Almasry Oct. 14, 2024, 10:38 p.m. UTC | #16
On Sat, Oct 12, 2024 at 2:42 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Fri, Oct 11, 2024 at 10:33:43AM -0700, Mina Almasry wrote:
> > On Thu, Oct 10, 2024 at 6:34 PM Jakub Kicinski <kuba@kernel.org> wrote:
> > >
> > > On Thu, 10 Oct 2024 10:44:38 -0700 Mina Almasry wrote:
> > > > > > I haven't thought the failure of PP_FLAG_DMA_SYNC_DEV
> > > > > > for dmabuf may be wrong.
> > > > > > I think device memory TCP is not related to this flag.
> > > > > > So device memory TCP core API should not return failure when
> > > > > > PP_FLAG_DMA_SYNC_DEV flag is set.
> > > > > > How about removing this condition check code in device memory TCP core?
> > > > >
> > > > > I think we need to invert the check..
> > > > > Mina, WDYT?
> > > >
> > > > On a closer look, my feeling is similar to Taehee,
> > > > PP_FLAG_DMA_SYNC_DEV should be orthogonal to memory providers. The
> > > > memory providers allocate the memory and provide the dma-addr, but
> > > > need not dma-sync the dma-addr, right? The driver can sync the
> > > > dma-addr if it wants and the driver can delegate the syncing to the pp
> > > > via PP_FLAG_DMA_SYNC_DEV if it wants. AFAICT I think the check should
> > > > be removed, not inverted, but I could be missing something.
> > >
> > > I don't know much about dmabuf but it hinges on the question whether
> > > doing DMA sync for device on a dmabuf address is :
> > >  - a good thing
> > >  - a noop
> > >  - a bad thing
> > >
> > > If it's a good thing or a noop - agreed.
> > >
> > > Similar question for the sync for CPU.
> > >
> > > I agree that intuitively it should be all fine. But the fact that dmabuf
> > > has a bespoke API for accessing the memory by the CPU makes me worried
> > > that there may be assumptions about these addresses not getting
> > > randomly fed into the normal DMA API..
> >
> > Sorry I'm also a bit unsure what is the right thing to do here. The
> > code that we've been running in GVE does a dma-sync for cpu
> > unconditionally on RX for dma-buf and non-dmabuf dma-addrs and we
> > haven't been seeing issues. It never does dma-sync for device.
> >
> > My first question is why is dma-sync for device needed on RX path at
> > all for some drivers in the first place? For incoming (non-dmabuf)
> > data, the data is written by the device and read by the cpu, so sync
> > for cpu is really what's needed. Is the sync for device for XDP? Or is
> > it that buffers should be dma-syncd for device before they are
> > re-posted to the NIC?
> >
> > Christian/Jason, sorry quick question: are
> > dma_sync_single_for_{device|cpu} needed or wanted when the dma-addrs
> > come from a dma-buf? Or these dma-addrs to be treated like any other
> > with the normal dma_sync_for_{device|cpu} rules?
>
> Um, I think because dma-buf hacks things up and generates illegal
> scatterlist entries with weird dma_map_resource() addresses for the
> typical P2P case the dma sync API should not be used on those things.
>
> However, there is no way to know if the dma-buf has does this, and
> there are valid case where the scatterlist is not ill formed and the
> sync is necessary.
>
> We are getting soo close to being able to start fixing these API
> issues in dmabuf, I hope next cylce we can begin.. Fingers crossed.
>
> From a CPU architecture perspective you do not need to cache flush PCI
> MMIO BAR memory, and perhaps doing so be might be problematic on some
> arches (???). But you do need to flush normal cachable CPU memory if
> that is in the DMA buf.
>

Thanks Jason. In that case I agree with Jakub we should take in his change here:

https://lore.kernel.org/netdev/20241009170102.1980ed1d@kernel.org/

With this change the driver would delegate dma_sync_for_device to the
page_pool, and the page_pool will skip it altogether for the dma-buf
memory provider.

--
Thanks,
Mina
Jakub Kicinski Oct. 15, 2024, 12:16 a.m. UTC | #17
On Tue, 15 Oct 2024 01:38:20 +0300 Mina Almasry wrote:
> Thanks Jason. In that case I agree with Jakub we should take in his change here:
> 
> https://lore.kernel.org/netdev/20241009170102.1980ed1d@kernel.org/
> 
> With this change the driver would delegate dma_sync_for_device to the
> page_pool, and the page_pool will skip it altogether for the dma-buf
> memory provider.

And we need a wrapper for a sync for CPU which will skip if the page
comes from an unreadable pool?
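
A hypothetical shape for such a wrapper (the name and placement are only an
illustration of the idea, not an existing helper):

	static inline void
	page_pool_dma_sync_netmem_for_cpu(const struct page_pool *pool,
					  netmem_ref netmem,
					  u32 offset, u32 dma_sync_size)
	{
		/* unreadable netmem (net_iov) is never touched by the
		 * CPU, so skip the sync entirely
		 */
		if (netmem_is_net_iov(netmem))
			return;

		dma_sync_single_range_for_cpu(pool->p.dev,
					      page_pool_get_dma_addr_netmem(netmem),
					      offset + pool->p.offset,
					      dma_sync_size,
					      page_pool_get_dma_dir(pool));
	}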
Mina Almasry Oct. 15, 2024, 1:10 a.m. UTC | #18
On Tue, Oct 15, 2024 at 3:16 AM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Tue, 15 Oct 2024 01:38:20 +0300 Mina Almasry wrote:
> > Thanks Jason. In that case I agree with Jakub we should take in his change here:
> >
> > https://lore.kernel.org/netdev/20241009170102.1980ed1d@kernel.org/
> >
> > With this change the driver would delegate dma_sync_for_device to the
> > page_pool, and the page_pool will skip it altogether for the dma-buf
> > memory provider.
>
> And we need a wrapper for a sync for CPU which will skip if the page
> comes from an unreadable pool?

This is where it gets a bit tricky, no?

Our production code does a dma_sync_for_cpu but no
dma_sync_for_device. That has been working reliably for us with GPU
dmabufs and udmabuf, but we haven't of course tested every dma-buf.
I'm comfortable enforcing the 'no dma_sync_for_device' now since it
brings upstream in line with our well-tested setup. I'm not sure I'm
100% comfortable enforcing the 'no dma_sync_for_cpu' now since it's a
deviation. The dma_sync_for_cpu is very, very likely a no-op, since we
don't ever really access the data from the cpu with devmem TCP, but who
knows.

Is it possible to give me a couple of weeks to make this change
locally and run it through some testing to see if it breaks anything?

But if you or Jason think that enforcing the 'no dma_sync_for_cpu'
now is critical, no problem. We can also provide this patch, and seek
to revert it or fix it up properly later in the event it turns out it
causes issues.

Note that the io_uring provider, or other non-dmabuf providers, may need
to do a dma-sync, but that bridge can be crossed in David's patchset.
Jason Gunthorpe Oct. 15, 2024, 12:44 p.m. UTC | #19
On Tue, Oct 15, 2024 at 04:10:44AM +0300, Mina Almasry wrote:
> On Tue, Oct 15, 2024 at 3:16 AM Jakub Kicinski <kuba@kernel.org> wrote:
> >
> > On Tue, 15 Oct 2024 01:38:20 +0300 Mina Almasry wrote:
> > > Thanks Jason. In that case I agree with Jakub we should take in his change here:
> > >
> > > https://lore.kernel.org/netdev/20241009170102.1980ed1d@kernel.org/
> > >
> > > With this change the driver would delegate dma_sync_for_device to the
> > > page_pool, and the page_pool will skip it altogether for the dma-buf
> > > memory provider.
> >
> > And we need a wrapper for a sync for CPU which will skip if the page
> > comes from an unreadable pool?
> 
> This is where it gets a bit tricky, no?
> 
> Our production code does a dma_sync_for_cpu but no
> dma_sync_for_device. That has been working reliably for us with GPU

Those functions are all NOPs on the systems you are testing on.

The question is what is correct to do on systems where it is not a
NOP, and none of this is really right, as I explained..

> But if you or Jason think that enforcing the 'no dma_buf_sync_for_cpu'
> now is critical, no problem. We can also provide this patch, and seek
> to revert it or fix it up properly later in the event it turns out it
> causes issues.

What is important is that you organize things going forward to be able
to do this properly, which means the required sync type depends on the
actual page being synced, and you will eventually somehow learn which
is required from the dmabuf.

Most likely nobody will ever run this code on a system where dma_sync is
not a NOP, but we should still use the DMA API properly and things
should make architectural sense.

Jason
Pavel Begunkov Oct. 15, 2024, 2:29 p.m. UTC | #20
On 10/14/24 23:38, Mina Almasry wrote:
> On Sat, Oct 12, 2024 at 2:42 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>>
>> On Fri, Oct 11, 2024 at 10:33:43AM -0700, Mina Almasry wrote:
>>> On Thu, Oct 10, 2024 at 6:34 PM Jakub Kicinski <kuba@kernel.org> wrote:
>>>>
>>>> On Thu, 10 Oct 2024 10:44:38 -0700 Mina Almasry wrote:
>>>>>>> I haven't thought the failure of PP_FLAG_DMA_SYNC_DEV
>>>>>>> for dmabuf may be wrong.
>>>>>>> I think device memory TCP is not related to this flag.
>>>>>>> So device memory TCP core API should not return failure when
>>>>>>> PP_FLAG_DMA_SYNC_DEV flag is set.
>>>>>>> How about removing this condition check code in device memory TCP core?
>>>>>>
>>>>>> I think we need to invert the check..
>>>>>> Mina, WDYT?
>>>>>
>>>>> On a closer look, my feeling is similar to Taehee,
>>>>> PP_FLAG_DMA_SYNC_DEV should be orthogonal to memory providers. The
>>>>> memory providers allocate the memory and provide the dma-addr, but
>>>>> need not dma-sync the dma-addr, right? The driver can sync the
>>>>> dma-addr if it wants and the driver can delegate the syncing to the pp
>>>>> via PP_FLAG_DMA_SYNC_DEV if it wants. AFAICT I think the check should
>>>>> be removed, not inverted, but I could be missing something.
>>>>
>>>> I don't know much about dmabuf but it hinges on the question whether
>>>> doing DMA sync for device on a dmabuf address is :
>>>>   - a good thing
>>>>   - a noop
>>>>   - a bad thing
>>>>
>>>> If it's a good thing or a noop - agreed.
>>>>
>>>> Similar question for the sync for CPU.
>>>>
>>>> I agree that intuitively it should be all fine. But the fact that dmabuf
>>>> has a bespoke API for accessing the memory by the CPU makes me worried
>>>> that there may be assumptions about these addresses not getting
>>>> randomly fed into the normal DMA API..
>>>
>>> Sorry I'm also a bit unsure what is the right thing to do here. The
>>> code that we've been running in GVE does a dma-sync for cpu
>>> unconditionally on RX for dma-buf and non-dmabuf dma-addrs and we
>>> haven't been seeing issues. It never does dma-sync for device.
>>>
>>> My first question is why is dma-sync for device needed on RX path at
>>> all for some drivers in the first place? For incoming (non-dmabuf)
>>> data, the data is written by the device and read by the cpu, so sync
>>> for cpu is really what's needed. Is the sync for device for XDP? Or is
>>> it that buffers should be dma-syncd for device before they are
>>> re-posted to the NIC?
>>>
>>> Christian/Jason, sorry quick question: are
>>> dma_sync_single_for_{device|cpu} needed or wanted when the dma-addrs
>>> come from a dma-buf? Or these dma-addrs to be treated like any other
>>> with the normal dma_sync_for_{device|cpu} rules?
>>
>> Um, I think because dma-buf hacks things up and generates illegal
>> scatterlist entries with weird dma_map_resource() addresses for the
>> typical P2P case the dma sync API should not be used on those things.
>>
>> However, there is no way to know if the dma-buf has does this, and
>> there are valid case where the scatterlist is not ill formed and the
>> sync is necessary.
>>
>> We are getting soo close to being able to start fixing these API
>> issues in dmabuf, I hope next cylce we can begin.. Fingers crossed.
>>
>>  From a CPU architecture perspective you do not need to cache flush PCI
>> MMIO BAR memory, and perhaps doing so be might be problematic on some
>> arches (???). But you do need to flush normal cachable CPU memory if
>> that is in the DMA buf.
>>
> 
> Thanks Jason. In that case I agree with Jakub we should take in his change here:
> 
> https://lore.kernel.org/netdev/20241009170102.1980ed1d@kernel.org/
> 
> With this change the driver would delegate dma_sync_for_device to the
> page_pool, and the page_pool will skip it altogether for the dma-buf
> memory provider.

Requiring ->dma_map should be common to all providers, as the page pool
shouldn't be dipping into net_iovs to figure out how to map them. However,
looking at this discussion, it seems that the ->dma_sync concern is
devmem-specific and should be discarded by pp providers using dmabufs,
i.e. in devmem.c:mp_dmabuf_devmem_init().
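
A sketch of that idea, applied to the mp_dmabuf_devmem_init() quoted
earlier in this thread (an assumption about how it could look, not a
merged change):

	int mp_dmabuf_devmem_init(struct page_pool *pool)
	{
		struct net_devmem_dmabuf_binding *binding = pool->mp_priv;

		if (!binding)
			return -EINVAL;

		if (!pool->dma_map)
			return -EOPNOTSUPP;

		/* dmabuf addresses must not go through the generic
		 * dma-sync-for-device path, so neutralize the setting
		 * here instead of rejecting the flag
		 */
		pool->dma_sync = false;

		if (pool->p.order != 0)
			return -E2BIG;

		net_devmem_dmabuf_binding_get(binding);
		return 0;
	}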
David Wei Oct. 15, 2024, 5:38 p.m. UTC | #21
On 2024-10-15 07:29, Pavel Begunkov wrote:
> On 10/14/24 23:38, Mina Almasry wrote:
>> On Sat, Oct 12, 2024 at 2:42 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>>>
>>> On Fri, Oct 11, 2024 at 10:33:43AM -0700, Mina Almasry wrote:
>>>> On Thu, Oct 10, 2024 at 6:34 PM Jakub Kicinski <kuba@kernel.org> wrote:
>>>>>
>>>>> On Thu, 10 Oct 2024 10:44:38 -0700 Mina Almasry wrote:
>>>>>>>> I haven't thought the failure of PP_FLAG_DMA_SYNC_DEV
>>>>>>>> for dmabuf may be wrong.
>>>>>>>> I think device memory TCP is not related to this flag.
>>>>>>>> So device memory TCP core API should not return failure when
>>>>>>>> PP_FLAG_DMA_SYNC_DEV flag is set.
>>>>>>>> How about removing this condition check code in device memory TCP core?
>>>>>>>
>>>>>>> I think we need to invert the check..
>>>>>>> Mina, WDYT?
>>>>>>
>>>>>> On a closer look, my feeling is similar to Taehee,
>>>>>> PP_FLAG_DMA_SYNC_DEV should be orthogonal to memory providers. The
>>>>>> memory providers allocate the memory and provide the dma-addr, but
>>>>>> need not dma-sync the dma-addr, right? The driver can sync the
>>>>>> dma-addr if it wants and the driver can delegate the syncing to the pp
>>>>>> via PP_FLAG_DMA_SYNC_DEV if it wants. AFAICT I think the check should
>>>>>> be removed, not inverted, but I could be missing something.
>>>>>
>>>>> I don't know much about dmabuf but it hinges on the question whether
>>>>> doing DMA sync for device on a dmabuf address is :
>>>>>   - a good thing
>>>>>   - a noop
>>>>>   - a bad thing
>>>>>
>>>>> If it's a good thing or a noop - agreed.
>>>>>
>>>>> Similar question for the sync for CPU.
>>>>>
>>>>> I agree that intuitively it should be all fine. But the fact that dmabuf
>>>>> has a bespoke API for accessing the memory by the CPU makes me worried
>>>>> that there may be assumptions about these addresses not getting
>>>>> randomly fed into the normal DMA API..
>>>>
>>>> Sorry I'm also a bit unsure what is the right thing to do here. The
>>>> code that we've been running in GVE does a dma-sync for cpu
>>>> unconditionally on RX for dma-buf and non-dmabuf dma-addrs and we
>>>> haven't been seeing issues. It never does dma-sync for device.
>>>>
>>>> My first question is why is dma-sync for device needed on RX path at
>>>> all for some drivers in the first place? For incoming (non-dmabuf)
>>>> data, the data is written by the device and read by the cpu, so sync
>>>> for cpu is really what's needed. Is the sync for device for XDP? Or is
>>>> it that buffers should be dma-syncd for device before they are
>>>> re-posted to the NIC?
>>>>
>>>> Christian/Jason, sorry quick question: are
>>>> dma_sync_single_for_{device|cpu} needed or wanted when the dma-addrs
>>>> come from a dma-buf? Or these dma-addrs to be treated like any other
>>>> with the normal dma_sync_for_{device|cpu} rules?
>>>
>>> Um, I think because dma-buf hacks things up and generates illegal
>>> scatterlist entries with weird dma_map_resource() addresses for the
>>> typical P2P case the dma sync API should not be used on those things.
>>>
>>> However, there is no way to know if the dma-buf has does this, and
>>> there are valid case where the scatterlist is not ill formed and the
>>> sync is necessary.
>>>
>>> We are getting soo close to being able to start fixing these API
>>> issues in dmabuf, I hope next cylce we can begin.. Fingers crossed.
>>>
>>>  From a CPU architecture perspective you do not need to cache flush PCI
>>> MMIO BAR memory, and perhaps doing so be might be problematic on some
>>> arches (???). But you do need to flush normal cachable CPU memory if
>>> that is in the DMA buf.
>>>
>>
>> Thanks Jason. In that case I agree with Jakub we should take in his change here:
>>
>> https://lore.kernel.org/netdev/20241009170102.1980ed1d@kernel.org/
>>
>> With this change the driver would delegate dma_sync_for_device to the
>> page_pool, and the page_pool will skip it altogether for the dma-buf
>> memory provider.
> 
> Requiring ->dma_map should be common to all providers as page pool
> shouldn't be dipping to net_iovs figuring out how to map them. However,
> looking at this discussion seems that the ->dma_sync concern is devmem
> specific and should be discarded by pp providers using dmabufs, i.e. in
> devmem.c:mp_dmabuf_devmem_init().

Yes, that's my preference as well, see my earlier reply.

>
Mina Almasry Oct. 18, 2024, 8:25 a.m. UTC | #22
On Tue, Oct 15, 2024 at 3:44 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Tue, Oct 15, 2024 at 04:10:44AM +0300, Mina Almasry wrote:
> > On Tue, Oct 15, 2024 at 3:16 AM Jakub Kicinski <kuba@kernel.org> wrote:
> > >
> > > On Tue, 15 Oct 2024 01:38:20 +0300 Mina Almasry wrote:
> > > > Thanks Jason. In that case I agree with Jakub we should take in his change here:
> > > >
> > > > https://lore.kernel.org/netdev/20241009170102.1980ed1d@kernel.org/
> > > >
> > > > With this change the driver would delegate dma_sync_for_device to the
> > > > page_pool, and the page_pool will skip it altogether for the dma-buf
> > > > memory provider.
> > >
> > > And we need a wrapper for a sync for CPU which will skip if the page
> > > comes from an unreadable pool?
> >
> > This is where it gets a bit tricky, no?
> >
> > Our production code does a dma_sync_for_cpu but no
> > dma_sync_for_device. That has been working reliably for us with GPU
>
> Those functions are all NOP on systems you are testing on.
>

OK, thanks. This is what I wanted to confirm. If you already know this,
then there is no need to wait for me to confirm.

> The question is what is correct to do on systems where it is not a
> NOP, and none of this is really right, as I explained..
>
> > But if you or Jason think that enforcing the 'no dma_buf_sync_for_cpu'
> > now is critical, no problem. We can also provide this patch, and seek
> > to revert it or fix it up properly later in the event it turns out it
> > causes issues.
>
> What is important is you organize things going forward to be able to
> do this properly, which means the required sync type is dependent on
> the actual page being synced and you will eventually somehow learn
> which is required from the dmabuf.
>
> Most likely nobody will ever run this code on system where dma_sync is
> not a NOP, but we should still use the DMA API properly and things
> should make architectural sense.
>

Makes sense. OK, we can do what Jakub suggested in the thread earlier.
I.e. likely some wrapper which skips the dma_sync_for_cpu if the
netmem is unreadable.

--
Thanks,
Mina
Taehee Yoo Oct. 19, 2024, 1:55 p.m. UTC | #23
On Fri, Oct 18, 2024 at 5:25 PM Mina Almasry <almasrymina@google.com> wrote:
>
> On Tue, Oct 15, 2024 at 3:44 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >
> > On Tue, Oct 15, 2024 at 04:10:44AM +0300, Mina Almasry wrote:
> > > On Tue, Oct 15, 2024 at 3:16 AM Jakub Kicinski <kuba@kernel.org> wrote:
> > > >
> > > > On Tue, 15 Oct 2024 01:38:20 +0300 Mina Almasry wrote:
> > > > > Thanks Jason. In that case I agree with Jakub we should take in his change here:
> > > > >
> > > > > https://lore.kernel.org/netdev/20241009170102.1980ed1d@kernel.org/
> > > > >
> > > > > With this change the driver would delegate dma_sync_for_device to the
> > > > > page_pool, and the page_pool will skip it altogether for the dma-buf
> > > > > memory provider.
> > > >
> > > > And we need a wrapper for a sync for CPU which will skip if the page
> > > > comes from an unreadable pool?
> > >
> > > This is where it gets a bit tricky, no?
> > >
> > > Our production code does a dma_sync_for_cpu but no
> > > dma_sync_for_device. That has been working reliably for us with GPU
> >
> > Those functions are all NOP on systems you are testing on.
> >
>
> OK, thanks. This is what I wanted to confirm. If you already know this
> here then there is no need to wait for me to confirm.
>
> > The question is what is correct to do on systems where it is not a
> > NOP, and none of this is really right, as I explained..
> >
> > > But if you or Jason think that enforcing the 'no dma_buf_sync_for_cpu'
> > > now is critical, no problem. We can also provide this patch, and seek
> > > to revert it or fix it up properly later in the event it turns out it
> > > causes issues.
> >
> > What is important is you organize things going forward to be able to
> > do this properly, which means the required sync type is dependent on
> > the actual page being synced and you will eventually somehow learn
> > which is required from the dmabuf.
> >
> > Most likely nobody will ever run this code on system where dma_sync is
> > not a NOP, but we should still use the DMA API properly and things
> > should make architectural sense.
> >
>
> Makes sense. OK, we can do what Jakub suggested in the thread earlier.
> I.e. likely some wrapper which skips the dma_sync_for_cpu if the
> netmem is unreadable.
>

Thanks a lot for the confirmation.
I will pass the PP_FLAG_ALLOW_UNREADABLE_NETMEM flag in the v4 patch,
regardless of whether devmem TCP is enabled or disabled.
The page_pool core logic will handle the flags properly.
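
A rough sketch of what that could look like in bnxt_alloc_rx_page_pool()
(an assumption about the v4 shape, not the posted patch; the queue index
comes from rxr->bnapi->index, as suggested earlier in the thread):

	/* always advertise the capability; the page pool core decides
	 * whether unreadable netmem is actually used for this queue
	 */
	pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV |
		   PP_FLAG_ALLOW_UNREADABLE_NETMEM;
	pp.queue_idx = rxr->bnapi->index;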

I think the page_pool changes are being worked on by Mina,
so I will not include page_pool changes in the v4 patch.

If you think I missed something, please let me know :)

Thanks a lot!
Taehee Yoo
diff mbox series

Patch

diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig
index 75ca3ddda1f5..f37ff12d4746 100644
--- a/drivers/net/ethernet/broadcom/Kconfig
+++ b/drivers/net/ethernet/broadcom/Kconfig
@@ -211,6 +211,7 @@  config BNXT
 	select FW_LOADER
 	select LIBCRC32C
 	select NET_DEVLINK
+	select NET_DEVMEM
 	select PAGE_POOL
 	select DIMLIB
 	select AUXILIARY_BUS
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 872b15842b11..64e07d247f97 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -55,6 +55,7 @@ 
 #include <net/page_pool/helpers.h>
 #include <linux/align.h>
 #include <net/netdev_queues.h>
+#include <net/netdev_rx_queue.h>
 
 #include "bnxt_hsi.h"
 #include "bnxt.h"
@@ -863,6 +864,22 @@  static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int budget)
 		bnapi->events &= ~BNXT_TX_CMP_EVENT;
 }
 
+static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping,
+					 struct bnxt_rx_ring_info *rxr,
+					 unsigned int *offset,
+					 gfp_t gfp)
+{
+	netmem_ref netmem;
+
+	netmem = page_pool_alloc_netmem(rxr->page_pool, GFP_ATOMIC);
+	if (!netmem)
+		return 0;
+	*offset = 0;
+
+	*mapping = page_pool_get_dma_addr_netmem(netmem) + *offset;
+	return netmem;
+}
+
 static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping,
 					 struct bnxt_rx_ring_info *rxr,
 					 unsigned int *offset,
@@ -972,21 +989,21 @@  static inline u16 bnxt_find_next_agg_idx(struct bnxt_rx_ring_info *rxr, u16 idx)
 	return next;
 }
 
-static inline int bnxt_alloc_rx_page(struct bnxt *bp,
-				     struct bnxt_rx_ring_info *rxr,
-				     u16 prod, gfp_t gfp)
+static inline int bnxt_alloc_rx_netmem(struct bnxt *bp,
+				       struct bnxt_rx_ring_info *rxr,
+				       u16 prod, gfp_t gfp)
 {
 	struct rx_bd *rxbd =
 		&rxr->rx_agg_desc_ring[RX_AGG_RING(bp, prod)][RX_IDX(prod)];
 	struct bnxt_sw_rx_agg_bd *rx_agg_buf;
-	struct page *page;
+	netmem_ref netmem;
 	dma_addr_t mapping;
 	u16 sw_prod = rxr->rx_sw_agg_prod;
 	unsigned int offset = 0;
 
-	page = __bnxt_alloc_rx_page(bp, &mapping, rxr, &offset, gfp);
+	netmem = __bnxt_alloc_rx_netmem(bp, &mapping, rxr, &offset, gfp);
 
-	if (!page)
+	if (!netmem)
 		return -ENOMEM;
 
 	if (unlikely(test_bit(sw_prod, rxr->rx_agg_bmap)))
@@ -996,7 +1013,7 @@  static inline int bnxt_alloc_rx_page(struct bnxt *bp,
 	rx_agg_buf = &rxr->rx_agg_ring[sw_prod];
 	rxr->rx_sw_agg_prod = RING_RX_AGG(bp, NEXT_RX_AGG(sw_prod));
 
-	rx_agg_buf->page = page;
+	rx_agg_buf->netmem = netmem;
 	rx_agg_buf->offset = offset;
 	rx_agg_buf->mapping = mapping;
 	rxbd->rx_bd_haddr = cpu_to_le64(mapping);
@@ -1044,7 +1061,7 @@  static void bnxt_reuse_rx_agg_bufs(struct bnxt_cp_ring_info *cpr, u16 idx,
 		struct rx_agg_cmp *agg;
 		struct bnxt_sw_rx_agg_bd *cons_rx_buf, *prod_rx_buf;
 		struct rx_bd *prod_bd;
-		struct page *page;
+		netmem_ref netmem;
 
 		if (p5_tpa)
 			agg = bnxt_get_tpa_agg_p5(bp, rxr, idx, start + i);
@@ -1061,11 +1078,11 @@  static void bnxt_reuse_rx_agg_bufs(struct bnxt_cp_ring_info *cpr, u16 idx,
 		cons_rx_buf = &rxr->rx_agg_ring[cons];
 
 		/* It is possible for sw_prod to be equal to cons, so
-		 * set cons_rx_buf->page to NULL first.
+		 * set cons_rx_buf->netmem to 0 first.
 		 */
-		page = cons_rx_buf->page;
-		cons_rx_buf->page = NULL;
-		prod_rx_buf->page = page;
+		netmem = cons_rx_buf->netmem;
+		cons_rx_buf->netmem = 0;
+		prod_rx_buf->netmem = netmem;
 		prod_rx_buf->offset = cons_rx_buf->offset;
 
 		prod_rx_buf->mapping = cons_rx_buf->mapping;
@@ -1192,6 +1209,7 @@  static struct sk_buff *bnxt_rx_skb(struct bnxt *bp,
 
 static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
 			       struct bnxt_cp_ring_info *cpr,
+			       struct sk_buff *skb,
 			       struct skb_shared_info *shinfo,
 			       u16 idx, u32 agg_bufs, bool tpa,
 			       struct xdp_buff *xdp)
@@ -1211,7 +1229,7 @@  static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
 		u16 cons, frag_len;
 		struct rx_agg_cmp *agg;
 		struct bnxt_sw_rx_agg_bd *cons_rx_buf;
-		struct page *page;
+		netmem_ref netmem;
 		dma_addr_t mapping;
 
 		if (p5_tpa)
@@ -1223,9 +1241,15 @@  static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
 			    RX_AGG_CMP_LEN) >> RX_AGG_CMP_LEN_SHIFT;
 
 		cons_rx_buf = &rxr->rx_agg_ring[cons];
-		skb_frag_fill_page_desc(frag, cons_rx_buf->page,
-					cons_rx_buf->offset, frag_len);
-		shinfo->nr_frags = i + 1;
+		if (skb) {
+			skb_add_rx_frag_netmem(skb, i, cons_rx_buf->netmem,
+					       cons_rx_buf->offset, frag_len,
+					       BNXT_RX_PAGE_SIZE);
+		} else {
+			skb_frag_fill_page_desc(frag, netmem_to_page(cons_rx_buf->netmem),
+						cons_rx_buf->offset, frag_len);
+			shinfo->nr_frags = i + 1;
+		}
 		__clear_bit(cons, rxr->rx_agg_bmap);
 
 		/* It is possible for bnxt_alloc_rx_page() to allocate
@@ -1233,15 +1257,15 @@  static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
 		 * need to clear the cons entry now.
 		 */
 		mapping = cons_rx_buf->mapping;
-		page = cons_rx_buf->page;
-		cons_rx_buf->page = NULL;
+		netmem = cons_rx_buf->netmem;
+		cons_rx_buf->netmem = 0;
 
-		if (xdp && page_is_pfmemalloc(page))
+		if (xdp && page_is_pfmemalloc(netmem_to_page(netmem)))
 			xdp_buff_set_frag_pfmemalloc(xdp);
 
-		if (bnxt_alloc_rx_page(bp, rxr, prod, GFP_ATOMIC) != 0) {
+		if (bnxt_alloc_rx_netmem(bp, rxr, prod, GFP_ATOMIC) != 0) {
 			--shinfo->nr_frags;
-			cons_rx_buf->page = page;
+			cons_rx_buf->netmem = netmem;
 
 			/* Update prod since possibly some pages have been
 			 * allocated already.
@@ -1269,7 +1293,7 @@  static struct sk_buff *bnxt_rx_agg_pages_skb(struct bnxt *bp,
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	u32 total_frag_len = 0;
 
-	total_frag_len = __bnxt_rx_agg_pages(bp, cpr, shinfo, idx,
+	total_frag_len = __bnxt_rx_agg_pages(bp, cpr, skb, shinfo, idx,
 					     agg_bufs, tpa, NULL);
 	if (!total_frag_len) {
 		skb_mark_for_recycle(skb);
@@ -1277,9 +1301,6 @@  static struct sk_buff *bnxt_rx_agg_pages_skb(struct bnxt *bp,
 		return NULL;
 	}
 
-	skb->data_len += total_frag_len;
-	skb->len += total_frag_len;
-	skb->truesize += BNXT_RX_PAGE_SIZE * agg_bufs;
 	return skb;
 }
 
@@ -1294,7 +1315,7 @@  static u32 bnxt_rx_agg_pages_xdp(struct bnxt *bp,
 	if (!xdp_buff_has_frags(xdp))
 		shinfo->nr_frags = 0;
 
-	total_frag_len = __bnxt_rx_agg_pages(bp, cpr, shinfo,
+	total_frag_len = __bnxt_rx_agg_pages(bp, cpr, NULL, shinfo,
 					     idx, agg_bufs, tpa, xdp);
 	if (total_frag_len) {
 		xdp_buff_set_frags_flag(xdp);
@@ -3342,15 +3363,15 @@  static void bnxt_free_one_rx_agg_ring(struct bnxt *bp, struct bnxt_rx_ring_info
 
 	for (i = 0; i < max_idx; i++) {
 		struct bnxt_sw_rx_agg_bd *rx_agg_buf = &rxr->rx_agg_ring[i];
-		struct page *page = rx_agg_buf->page;
+		netmem_ref netmem = rx_agg_buf->netmem;
 
-		if (!page)
+		if (!netmem)
 			continue;
 
-		rx_agg_buf->page = NULL;
+		rx_agg_buf->netmem = 0;
 		__clear_bit(i, rxr->rx_agg_bmap);
 
-		page_pool_recycle_direct(rxr->page_pool, page);
+		page_pool_put_full_netmem(rxr->page_pool, netmem, true);
 	}
 }
 
@@ -3608,9 +3629,11 @@  static void bnxt_free_rx_rings(struct bnxt *bp)
 
 static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
 				   struct bnxt_rx_ring_info *rxr,
+				   int queue_idx,
 				   int numa_node)
 {
 	struct page_pool_params pp = { 0 };
+	struct netdev_rx_queue *rxq;
 
 	pp.pool_size = bp->rx_agg_ring_size;
 	if (BNXT_RX_PAGE_MODE(bp))
@@ -3621,8 +3644,15 @@  static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
 	pp.dev = &bp->pdev->dev;
 	pp.dma_dir = bp->rx_dir;
 	pp.max_len = PAGE_SIZE;
-	pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
+	pp.order = 0;
+
+	rxq = __netif_get_rx_queue(bp->dev, queue_idx);
+	if (rxq->mp_params.mp_priv)
+		pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_ALLOW_UNREADABLE_NETMEM;
+	else
+		pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
 
+	pp.queue_idx = queue_idx;
 	rxr->page_pool = page_pool_create(&pp);
 	if (IS_ERR(rxr->page_pool)) {
 		int err = PTR_ERR(rxr->page_pool);
@@ -3655,7 +3685,7 @@  static int bnxt_alloc_rx_rings(struct bnxt *bp)
 		cpu_node = cpu_to_node(cpu);
 		netdev_dbg(bp->dev, "Allocating page pool for rx_ring[%d] on numa_node: %d\n",
 			   i, cpu_node);
-		rc = bnxt_alloc_rx_page_pool(bp, rxr, cpu_node);
+		rc = bnxt_alloc_rx_page_pool(bp, rxr, i, cpu_node);
 		if (rc)
 			return rc;
 
@@ -4154,7 +4184,7 @@  static void bnxt_alloc_one_rx_ring_page(struct bnxt *bp,
 
 	prod = rxr->rx_agg_prod;
 	for (i = 0; i < bp->rx_agg_ring_size; i++) {
-		if (bnxt_alloc_rx_page(bp, rxr, prod, GFP_KERNEL)) {
+		if (bnxt_alloc_rx_netmem(bp, rxr, prod, GFP_KERNEL)) {
 			netdev_warn(bp->dev, "init'ed rx ring %d with %d/%d pages only\n",
 				    ring_nr, i, bp->rx_ring_size);
 			break;
@@ -15063,7 +15093,7 @@  static int bnxt_queue_mem_alloc(struct net_device *dev, void *qmem, int idx)
 	clone->rx_sw_agg_prod = 0;
 	clone->rx_next_cons = 0;
 
-	rc = bnxt_alloc_rx_page_pool(bp, clone, rxr->page_pool->p.nid);
+	rc = bnxt_alloc_rx_page_pool(bp, clone, idx, rxr->page_pool->p.nid);
 	if (rc)
 		return rc;
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 48f390519c35..3cf57a3c7664 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -895,7 +895,7 @@  struct bnxt_sw_rx_bd {
 };
 
 struct bnxt_sw_rx_agg_bd {
-	struct page		*page;
+	netmem_ref		netmem;
 	unsigned int		offset;
 	dma_addr_t		mapping;
 };