| Message ID | 20241003160620.1521626-8-ap420073@gmail.com (mailing list archive) |
|---|---|
| State | Changes Requested |
| Delegated to: | Netdev Maintainers |
| Series | bnxt_en: implement device memory TCP for bnxt |
On Thu, Oct 3, 2024 at 9:07 AM Taehee Yoo <ap420073@gmail.com> wrote: > > Currently, bnxt_en driver satisfies the requirements of Device memory > TCP, which is tcp-data-split. > So, it implements Device memory TCP for bnxt_en driver. > > From now on, the aggregation ring handles netmem_ref instead of page > regardless of the on/off of netmem. > So, for the aggregation ring, memory will be handled with the netmem > page_pool API instead of generic page_pool API. > > If Devmem is enabled, netmem_ref is used as-is and if Devmem is not > enabled, netmem_ref will be converted to page and that is used. > > Driver recognizes whether the devmem is set or unset based on the > mp_params.mp_priv is not NULL. > Only if devmem is set, it passes PP_FLAG_ALLOW_UNREADABLE_NETMEM. > > Signed-off-by: Taehee Yoo <ap420073@gmail.com> > --- > > v3: > - Patch added > > drivers/net/ethernet/broadcom/Kconfig | 1 + > drivers/net/ethernet/broadcom/bnxt/bnxt.c | 98 +++++++++++++++-------- > drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 +- > 3 files changed, 66 insertions(+), 35 deletions(-) > > diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig > index 75ca3ddda1f5..f37ff12d4746 100644 > --- a/drivers/net/ethernet/broadcom/Kconfig > +++ b/drivers/net/ethernet/broadcom/Kconfig > @@ -211,6 +211,7 @@ config BNXT > select FW_LOADER > select LIBCRC32C > select NET_DEVLINK > + select NET_DEVMEM > select PAGE_POOL > select DIMLIB > select AUXILIARY_BUS > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c > index 872b15842b11..64e07d247f97 100644 > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c > @@ -55,6 +55,7 @@ > #include <net/page_pool/helpers.h> > #include <linux/align.h> > #include <net/netdev_queues.h> > +#include <net/netdev_rx_queue.h> > > #include "bnxt_hsi.h" > #include "bnxt.h" > @@ -863,6 +864,22 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int budget) > bnapi->events &= ~BNXT_TX_CMP_EVENT; > } > > +static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping, > + struct bnxt_rx_ring_info *rxr, > + unsigned int *offset, > + gfp_t gfp) > +{ > + netmem_ref netmem; > + > + netmem = page_pool_alloc_netmem(rxr->page_pool, GFP_ATOMIC); > + if (!netmem) > + return 0; > + *offset = 0; > + > + *mapping = page_pool_get_dma_addr_netmem(netmem) + *offset; > + return netmem; > +} > + > static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping, > struct bnxt_rx_ring_info *rxr, > unsigned int *offset, > @@ -972,21 +989,21 @@ static inline u16 bnxt_find_next_agg_idx(struct bnxt_rx_ring_info *rxr, u16 idx) > return next; > } > > -static inline int bnxt_alloc_rx_page(struct bnxt *bp, > - struct bnxt_rx_ring_info *rxr, > - u16 prod, gfp_t gfp) > +static inline int bnxt_alloc_rx_netmem(struct bnxt *bp, > + struct bnxt_rx_ring_info *rxr, > + u16 prod, gfp_t gfp) > { > struct rx_bd *rxbd = > &rxr->rx_agg_desc_ring[RX_AGG_RING(bp, prod)][RX_IDX(prod)]; > struct bnxt_sw_rx_agg_bd *rx_agg_buf; > - struct page *page; > + netmem_ref netmem; > dma_addr_t mapping; > u16 sw_prod = rxr->rx_sw_agg_prod; > unsigned int offset = 0; > > - page = __bnxt_alloc_rx_page(bp, &mapping, rxr, &offset, gfp); > + netmem = __bnxt_alloc_rx_netmem(bp, &mapping, rxr, &offset, gfp); Does __bnxt_alloc_rx_page become dead code after this change? Or is it still used for something? 
> > - if (!page) > + if (!netmem) > return -ENOMEM; > > if (unlikely(test_bit(sw_prod, rxr->rx_agg_bmap))) > @@ -996,7 +1013,7 @@ static inline int bnxt_alloc_rx_page(struct bnxt *bp, > rx_agg_buf = &rxr->rx_agg_ring[sw_prod]; > rxr->rx_sw_agg_prod = RING_RX_AGG(bp, NEXT_RX_AGG(sw_prod)); > > - rx_agg_buf->page = page; > + rx_agg_buf->netmem = netmem; > rx_agg_buf->offset = offset; > rx_agg_buf->mapping = mapping; > rxbd->rx_bd_haddr = cpu_to_le64(mapping); > @@ -1044,7 +1061,7 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_cp_ring_info *cpr, u16 idx, > struct rx_agg_cmp *agg; > struct bnxt_sw_rx_agg_bd *cons_rx_buf, *prod_rx_buf; > struct rx_bd *prod_bd; > - struct page *page; > + netmem_ref netmem; > > if (p5_tpa) > agg = bnxt_get_tpa_agg_p5(bp, rxr, idx, start + i); > @@ -1061,11 +1078,11 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_cp_ring_info *cpr, u16 idx, > cons_rx_buf = &rxr->rx_agg_ring[cons]; > > /* It is possible for sw_prod to be equal to cons, so > - * set cons_rx_buf->page to NULL first. > + * set cons_rx_buf->netmem to 0 first. > */ > - page = cons_rx_buf->page; > - cons_rx_buf->page = NULL; > - prod_rx_buf->page = page; > + netmem = cons_rx_buf->netmem; > + cons_rx_buf->netmem = 0; > + prod_rx_buf->netmem = netmem; > prod_rx_buf->offset = cons_rx_buf->offset; > > prod_rx_buf->mapping = cons_rx_buf->mapping; > @@ -1192,6 +1209,7 @@ static struct sk_buff *bnxt_rx_skb(struct bnxt *bp, > > static u32 __bnxt_rx_agg_pages(struct bnxt *bp, > struct bnxt_cp_ring_info *cpr, > + struct sk_buff *skb, > struct skb_shared_info *shinfo, > u16 idx, u32 agg_bufs, bool tpa, > struct xdp_buff *xdp) > @@ -1211,7 +1229,7 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, > u16 cons, frag_len; > struct rx_agg_cmp *agg; > struct bnxt_sw_rx_agg_bd *cons_rx_buf; > - struct page *page; > + netmem_ref netmem; > dma_addr_t mapping; > > if (p5_tpa) > @@ -1223,9 +1241,15 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, > RX_AGG_CMP_LEN) >> RX_AGG_CMP_LEN_SHIFT; > > cons_rx_buf = &rxr->rx_agg_ring[cons]; > - skb_frag_fill_page_desc(frag, cons_rx_buf->page, > - cons_rx_buf->offset, frag_len); > - shinfo->nr_frags = i + 1; > + if (skb) { > + skb_add_rx_frag_netmem(skb, i, cons_rx_buf->netmem, > + cons_rx_buf->offset, frag_len, > + BNXT_RX_PAGE_SIZE); > + } else { > + skb_frag_fill_page_desc(frag, netmem_to_page(cons_rx_buf->netmem), > + cons_rx_buf->offset, frag_len); Our intention with the whole netmem design is that drivers should never have to call netmem_to_page(). I.e. the driver should use netmem unaware of whether it's page or non-page underneath, to minimize complexity driver needs to handle. This netmem_to_page() call can be removed by using skb_frag_fill_netmem_desc() instead of the page variant. But, more improtantly, why did the code change here? The code before calls skb_frag_fill_page_desc, but the new code sometimes will skb_frag_fill_netmem_desc() and sometimes will skb_add_rx_frag_netmem. I'm not sure why that logic changed. > + shinfo->nr_frags = i + 1; > + } > __clear_bit(cons, rxr->rx_agg_bmap); > > /* It is possible for bnxt_alloc_rx_page() to allocate > @@ -1233,15 +1257,15 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, > * need to clear the cons entry now. 
> */ > mapping = cons_rx_buf->mapping; > - page = cons_rx_buf->page; > - cons_rx_buf->page = NULL; > + netmem = cons_rx_buf->netmem; > + cons_rx_buf->netmem = 0; > > - if (xdp && page_is_pfmemalloc(page)) > + if (xdp && page_is_pfmemalloc(netmem_to_page(netmem))) Similarly, add netmem_is_pfmemalloc to netmem.h, instead of doing a netmem_to_page() call here I think. > xdp_buff_set_frag_pfmemalloc(xdp); > > - if (bnxt_alloc_rx_page(bp, rxr, prod, GFP_ATOMIC) != 0) { > + if (bnxt_alloc_rx_netmem(bp, rxr, prod, GFP_ATOMIC) != 0) { > --shinfo->nr_frags; > - cons_rx_buf->page = page; > + cons_rx_buf->netmem = netmem; > > /* Update prod since possibly some pages have been > * allocated already. > @@ -1269,7 +1293,7 @@ static struct sk_buff *bnxt_rx_agg_pages_skb(struct bnxt *bp, > struct skb_shared_info *shinfo = skb_shinfo(skb); > u32 total_frag_len = 0; > > - total_frag_len = __bnxt_rx_agg_pages(bp, cpr, shinfo, idx, > + total_frag_len = __bnxt_rx_agg_pages(bp, cpr, skb, shinfo, idx, > agg_bufs, tpa, NULL); > if (!total_frag_len) { > skb_mark_for_recycle(skb); > @@ -1277,9 +1301,6 @@ static struct sk_buff *bnxt_rx_agg_pages_skb(struct bnxt *bp, > return NULL; > } > > - skb->data_len += total_frag_len; > - skb->len += total_frag_len; > - skb->truesize += BNXT_RX_PAGE_SIZE * agg_bufs; > return skb; > } > > @@ -1294,7 +1315,7 @@ static u32 bnxt_rx_agg_pages_xdp(struct bnxt *bp, > if (!xdp_buff_has_frags(xdp)) > shinfo->nr_frags = 0; > > - total_frag_len = __bnxt_rx_agg_pages(bp, cpr, shinfo, > + total_frag_len = __bnxt_rx_agg_pages(bp, cpr, NULL, shinfo, > idx, agg_bufs, tpa, xdp); > if (total_frag_len) { > xdp_buff_set_frags_flag(xdp); > @@ -3342,15 +3363,15 @@ static void bnxt_free_one_rx_agg_ring(struct bnxt *bp, struct bnxt_rx_ring_info > > for (i = 0; i < max_idx; i++) { > struct bnxt_sw_rx_agg_bd *rx_agg_buf = &rxr->rx_agg_ring[i]; > - struct page *page = rx_agg_buf->page; > + netmem_ref netmem = rx_agg_buf->netmem; > > - if (!page) > + if (!netmem) > continue; > > - rx_agg_buf->page = NULL; > + rx_agg_buf->netmem = 0; > __clear_bit(i, rxr->rx_agg_bmap); > > - page_pool_recycle_direct(rxr->page_pool, page); > + page_pool_put_full_netmem(rxr->page_pool, netmem, true); > } > } > > @@ -3608,9 +3629,11 @@ static void bnxt_free_rx_rings(struct bnxt *bp) > > static int bnxt_alloc_rx_page_pool(struct bnxt *bp, > struct bnxt_rx_ring_info *rxr, > + int queue_idx, > int numa_node) > { > struct page_pool_params pp = { 0 }; > + struct netdev_rx_queue *rxq; > > pp.pool_size = bp->rx_agg_ring_size; > if (BNXT_RX_PAGE_MODE(bp)) > @@ -3621,8 +3644,15 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, > pp.dev = &bp->pdev->dev; > pp.dma_dir = bp->rx_dir; > pp.max_len = PAGE_SIZE; > - pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; > + pp.order = 0; > + > + rxq = __netif_get_rx_queue(bp->dev, queue_idx); > + if (rxq->mp_params.mp_priv) > + pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_ALLOW_UNREADABLE_NETMEM; This is not the intended use of PP_FLAG_ALLOW_UNREADABLE_NETMEM. The driver should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when it's able to handle unreadable netmem, it should not worry about whether rxq->mp_params.mp_priv is set or not. You should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when HDS is enabled. Let core figure out if mp_params.mp_priv is enabled. All the driver needs to report is whether it's configured to be able to handle unreadable netmem (which practically means HDS is enabled). 
> + else > + pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; > > + pp.queue_idx = queue_idx; > rxr->page_pool = page_pool_create(&pp); > if (IS_ERR(rxr->page_pool)) { > int err = PTR_ERR(rxr->page_pool); > @@ -3655,7 +3685,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp) > cpu_node = cpu_to_node(cpu); > netdev_dbg(bp->dev, "Allocating page pool for rx_ring[%d] on numa_node: %d\n", > i, cpu_node); > - rc = bnxt_alloc_rx_page_pool(bp, rxr, cpu_node); > + rc = bnxt_alloc_rx_page_pool(bp, rxr, i, cpu_node); > if (rc) > return rc; > > @@ -4154,7 +4184,7 @@ static void bnxt_alloc_one_rx_ring_page(struct bnxt *bp, > > prod = rxr->rx_agg_prod; > for (i = 0; i < bp->rx_agg_ring_size; i++) { > - if (bnxt_alloc_rx_page(bp, rxr, prod, GFP_KERNEL)) { > + if (bnxt_alloc_rx_netmem(bp, rxr, prod, GFP_KERNEL)) { > netdev_warn(bp->dev, "init'ed rx ring %d with %d/%d pages only\n", > ring_nr, i, bp->rx_ring_size); > break; > @@ -15063,7 +15093,7 @@ static int bnxt_queue_mem_alloc(struct net_device *dev, void *qmem, int idx) > clone->rx_sw_agg_prod = 0; > clone->rx_next_cons = 0; > > - rc = bnxt_alloc_rx_page_pool(bp, clone, rxr->page_pool->p.nid); > + rc = bnxt_alloc_rx_page_pool(bp, clone, idx, rxr->page_pool->p.nid); > if (rc) > return rc; > > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h > index 48f390519c35..3cf57a3c7664 100644 > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h > @@ -895,7 +895,7 @@ struct bnxt_sw_rx_bd { > }; > > struct bnxt_sw_rx_agg_bd { > - struct page *page; > + netmem_ref netmem; > unsigned int offset; > dma_addr_t mapping; > }; > -- > 2.34.1 >
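For reference, a minimal sketch of the per-fragment step in __bnxt_rx_agg_pages() using only netmem-aware helpers, along the lines suggested above, so the driver never calls netmem_to_page(). The variables (skb, shinfo, frag, cons_rx_buf, frag_len, i) are the ones already present in the quoted function; this is a sketch of the suggestion, not the eventual v4 code.

```c
if (skb) {
	/* skb path: the helper attaches the frag, updates
	 * skb->len/data_len/truesize, and marks the skb unreadable when
	 * the netmem is a net_iov
	 */
	skb_add_rx_frag_netmem(skb, i, cons_rx_buf->netmem,
			       cons_rx_buf->offset, frag_len,
			       BNXT_RX_PAGE_SIZE);
} else {
	/* XDP path: XDP frames never carry unreadable netmem, but the
	 * netmem variant of the fill helper still avoids netmem_to_page()
	 */
	skb_frag_fill_netmem_desc(frag, cons_rx_buf->netmem,
				  cons_rx_buf->offset, frag_len);
	shinfo->nr_frags = i + 1;
}
```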
On Fri, Oct 4, 2024 at 3:43 AM Mina Almasry <almasrymina@google.com> wrote: > Hi Mina, Thanks a lot for your review! > On Thu, Oct 3, 2024 at 9:07 AM Taehee Yoo <ap420073@gmail.com> wrote: > > > > Currently, bnxt_en driver satisfies the requirements of Device memory > > TCP, which is tcp-data-split. > > So, it implements Device memory TCP for bnxt_en driver. > > > > From now on, the aggregation ring handles netmem_ref instead of page > > regardless of the on/off of netmem. > > So, for the aggregation ring, memory will be handled with the netmem > > page_pool API instead of generic page_pool API. > > > > If Devmem is enabled, netmem_ref is used as-is and if Devmem is not > > enabled, netmem_ref will be converted to page and that is used. > > > > Driver recognizes whether the devmem is set or unset based on the > > mp_params.mp_priv is not NULL. > > Only if devmem is set, it passes PP_FLAG_ALLOW_UNREADABLE_NETMEM. > > > > Signed-off-by: Taehee Yoo <ap420073@gmail.com> > > --- > > > > v3: > > - Patch added > > > > drivers/net/ethernet/broadcom/Kconfig | 1 + > > drivers/net/ethernet/broadcom/bnxt/bnxt.c | 98 +++++++++++++++-------- > > drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 +- > > 3 files changed, 66 insertions(+), 35 deletions(-) > > > > diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig > > index 75ca3ddda1f5..f37ff12d4746 100644 > > --- a/drivers/net/ethernet/broadcom/Kconfig > > +++ b/drivers/net/ethernet/broadcom/Kconfig > > @@ -211,6 +211,7 @@ config BNXT > > select FW_LOADER > > select LIBCRC32C > > select NET_DEVLINK > > + select NET_DEVMEM > > select PAGE_POOL > > select DIMLIB > > select AUXILIARY_BUS > > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c > > index 872b15842b11..64e07d247f97 100644 > > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c > > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c > > @@ -55,6 +55,7 @@ > > #include <net/page_pool/helpers.h> > > #include <linux/align.h> > > #include <net/netdev_queues.h> > > +#include <net/netdev_rx_queue.h> > > > > #include "bnxt_hsi.h" > > #include "bnxt.h" > > @@ -863,6 +864,22 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int budget) > > bnapi->events &= ~BNXT_TX_CMP_EVENT; > > } > > > > +static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping, > > + struct bnxt_rx_ring_info *rxr, > > + unsigned int *offset, > > + gfp_t gfp) > > +{ > > + netmem_ref netmem; > > + > > + netmem = page_pool_alloc_netmem(rxr->page_pool, GFP_ATOMIC); > > + if (!netmem) > > + return 0; > > + *offset = 0; > > + > > + *mapping = page_pool_get_dma_addr_netmem(netmem) + *offset; > > + return netmem; > > +} > > + > > static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping, > > struct bnxt_rx_ring_info *rxr, > > unsigned int *offset, > > @@ -972,21 +989,21 @@ static inline u16 bnxt_find_next_agg_idx(struct bnxt_rx_ring_info *rxr, u16 idx) > > return next; > > } > > > > -static inline int bnxt_alloc_rx_page(struct bnxt *bp, > > - struct bnxt_rx_ring_info *rxr, > > - u16 prod, gfp_t gfp) > > +static inline int bnxt_alloc_rx_netmem(struct bnxt *bp, > > + struct bnxt_rx_ring_info *rxr, > > + u16 prod, gfp_t gfp) > > { > > struct rx_bd *rxbd = > > &rxr->rx_agg_desc_ring[RX_AGG_RING(bp, prod)][RX_IDX(prod)]; > > struct bnxt_sw_rx_agg_bd *rx_agg_buf; > > - struct page *page; > > + netmem_ref netmem; > > dma_addr_t mapping; > > u16 sw_prod = rxr->rx_sw_agg_prod; > > unsigned int offset = 0; > > > > - page 
= __bnxt_alloc_rx_page(bp, &mapping, rxr, &offset, gfp); > > + netmem = __bnxt_alloc_rx_netmem(bp, &mapping, rxr, &offset, gfp); > > Does __bnxt_alloc_rx_page become dead code after this change? Or is it > still used for something? __bnxt_alloc_rx_page() is still used. > > > > > - if (!page) > > + if (!netmem) > > return -ENOMEM; > > > > if (unlikely(test_bit(sw_prod, rxr->rx_agg_bmap))) > > @@ -996,7 +1013,7 @@ static inline int bnxt_alloc_rx_page(struct bnxt *bp, > > rx_agg_buf = &rxr->rx_agg_ring[sw_prod]; > > rxr->rx_sw_agg_prod = RING_RX_AGG(bp, NEXT_RX_AGG(sw_prod)); > > > > - rx_agg_buf->page = page; > > + rx_agg_buf->netmem = netmem; > > rx_agg_buf->offset = offset; > > rx_agg_buf->mapping = mapping; > > rxbd->rx_bd_haddr = cpu_to_le64(mapping); > > @@ -1044,7 +1061,7 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_cp_ring_info *cpr, u16 idx, > > struct rx_agg_cmp *agg; > > struct bnxt_sw_rx_agg_bd *cons_rx_buf, *prod_rx_buf; > > struct rx_bd *prod_bd; > > - struct page *page; > > + netmem_ref netmem; > > > > if (p5_tpa) > > agg = bnxt_get_tpa_agg_p5(bp, rxr, idx, start + i); > > @@ -1061,11 +1078,11 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_cp_ring_info *cpr, u16 idx, > > cons_rx_buf = &rxr->rx_agg_ring[cons]; > > > > /* It is possible for sw_prod to be equal to cons, so > > - * set cons_rx_buf->page to NULL first.If I misunderstand about > > + * set cons_rx_buf->netmem to 0 first. > > */ > > - page = cons_rx_buf->page; > > - cons_rx_buf->page = NULL; > > - prod_rx_buf->page = page; > > + netmem = cons_rx_buf->netmem; > > + cons_rx_buf->netmem = 0; > > + prod_rx_buf->netmem = netmem; > > prod_rx_buf->offset = cons_rx_buf->offset; > > > > prod_rx_buf->mapping = cons_rx_buf->mapping; > > @@ -1192,6 +1209,7 @@ static struct sk_buff *bnxt_rx_skb(struct bnxt *bp, > > > > static u32 __bnxt_rx_agg_pages(struct bnxt *bp, > > struct bnxt_cp_ring_info *cpr, > > + struct sk_buff *skb, > > struct skb_shared_info *shinfo, > > u16 idx, u32 agg_bufs, bool tpa, > > struct xdp_buff *xdp) > > @@ -1211,7 +1229,7 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, > > u16 cons, frag_len; > > struct rx_agg_cmp *agg; > > struct bnxt_sw_rx_agg_bd *cons_rx_buf; > > - struct page *page; > > + netmem_ref netmem; > > dma_addr_t mapping; > > > > if (p5_tpa) > > @@ -1223,9 +1241,15 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, > > RX_AGG_CMP_LEN) >> RX_AGG_CMP_LEN_SHIFT; > > > > cons_rx_buf = &rxr->rx_agg_ring[cons]; > > - skb_frag_fill_page_desc(frag, cons_rx_buf->page, > > - cons_rx_buf->offset, frag_len); > > - shinfo->nr_frags = i + 1; > > + if (skb) { > > + skb_add_rx_frag_netmem(skb, i, cons_rx_buf->netmem, > > + cons_rx_buf->offset, frag_len, > > + BNXT_RX_PAGE_SIZE); > > + } else { > > + skb_frag_fill_page_desc(frag, netmem_to_page(cons_rx_buf->netmem), > > + cons_rx_buf->offset, frag_len); > > Our intention with the whole netmem design is that drivers should > never have to call netmem_to_page(). I.e. the driver should use netmem > unaware of whether it's page or non-page underneath, to minimize > complexity driver needs to handle. > > This netmem_to_page() call can be removed by using > skb_frag_fill_netmem_desc() instead of the page variant. But, more > improtantly, why did the code change here? The code before calls > skb_frag_fill_page_desc, but the new code sometimes will > skb_frag_fill_netmem_desc() and sometimes will skb_add_rx_frag_netmem. > I'm not sure why that logic changed. The reason why skb_add_rx_frag_netmem() is used here is to set skb->unreadable flag. 
the skb_frag_fill_netmem_desc() doesn't set skb->unreadable because it doesn't handle skb, it only handles frag. As far as I know, skb->unreadable should be set to true for devmem TCP, am I misunderstood? I tested that don't using skb_add_rx_frag_netmem() here, and it immediately fails. The "if (skb)" branch will be hit only when devmem TCP path. Normal packet and XDP path will hit "else" branch. I will use skb_frag_fill_netmem_desc() instead of skb_frag_fill_page_desc() in the "else" branch. With this change, as you said, there is no netmem_to_page() in bnxt_en driver, Thanks! > > > + shinfo->nr_frags = i + 1; > > + } > > __clear_bit(cons, rxr->rx_agg_bmap); > > > > /* It is possible for bnxt_alloc_rx_page() to allocate > > @@ -1233,15 +1257,15 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, > > * need to clear the cons entry now. > > */ > > mapping = cons_rx_buf->mapping; > > - page = cons_rx_buf->page; > > - cons_rx_buf->page = NULL; > > + netmem = cons_rx_buf->netmem; > > + cons_rx_buf->netmem = 0; > > > > - if (xdp && page_is_pfmemalloc(page)) > > + if (xdp && page_is_pfmemalloc(netmem_to_page(netmem))) > > Similarly, add netmem_is_pfmemalloc to netmem.h, instead of doing a > netmem_to_page() call here I think. Thanks, I will add netmem_is_pfmemalloc() to netmem.h in a v4 patch. > > > xdp_buff_set_frag_pfmemalloc(xdp); > > > > - if (bnxt_alloc_rx_page(bp, rxr, prod, GFP_ATOMIC) != 0) { > > + if (bnxt_alloc_rx_netmem(bp, rxr, prod, GFP_ATOMIC) != 0) { > > --shinfo->nr_frags; > > - cons_rx_buf->page = page; > > + cons_rx_buf->netmem = netmem; > > > > /* Update prod since possibly some pages have been > > * allocated already. > > @@ -1269,7 +1293,7 @@ static struct sk_buff *bnxt_rx_agg_pages_skb(struct bnxt *bp, > > struct skb_shared_info *shinfo = skb_shinfo(skb); > > u32 total_frag_len = 0; > > > > - total_frag_len = __bnxt_rx_agg_pages(bp, cpr, shinfo, idx, > > + total_frag_len = __bnxt_rx_agg_pages(bp, cpr, skb, shinfo, idx, > > agg_bufs, tpa, NULL); > > if (!total_frag_len) { > > skb_mark_for_recycle(skb); > > @@ -1277,9 +1301,6 @@ static struct sk_buff *bnxt_rx_agg_pages_skb(struct bnxt *bp, > > return NULL; > > } > > > > - skb->data_len += total_frag_len; > > - skb->len += total_frag_len; > > - skb->truesize += BNXT_RX_PAGE_SIZE * agg_bufs; > > return skb; > > } > > > > @@ -1294,7 +1315,7 @@ static u32 bnxt_rx_agg_pages_xdp(struct bnxt *bp, > > if (!xdp_buff_has_frags(xdp)) > > shinfo->nr_frags = 0; > > > > - total_frag_len = __bnxt_rx_agg_pages(bp, cpr, shinfo, > > + total_frag_len = __bnxt_rx_agg_pages(bp, cpr, NULL, shinfo, > > idx, agg_bufs, tpa, xdp); > > if (total_frag_len) { > > xdp_buff_set_frags_flag(xdp); > > @@ -3342,15 +3363,15 @@ static void bnxt_free_one_rx_agg_ring(struct bnxt *bp, struct bnxt_rx_ring_info > > > > for (i = 0; i < max_idx; i++) { > > struct bnxt_sw_rx_agg_bd *rx_agg_buf = &rxr->rx_agg_ring[i]; > > - struct page *page = rx_agg_buf->page; > > + netmem_ref netmem = rx_agg_buf->netmem; > > > > - if (!page) > > + if (!netmem) > > continue; > > > > - rx_agg_buf->page = NULL; > > + rx_agg_buf->netmem = 0; > > __clear_bit(i, rxr->rx_agg_bmap); > > > > - page_pool_recycle_direct(rxr->page_pool, page); > > + page_pool_put_full_netmem(rxr->page_pool, netmem, true); > > } > > } > > > > @@ -3608,9 +3629,11 @@ static void bnxt_free_rx_rings(struct bnxt *bp) > > > > static int bnxt_alloc_rx_page_pool(struct bnxt *bp, > > struct bnxt_rx_ring_info *rxr, > > + int queue_idx, > > int numa_node) > > { > > struct page_pool_params pp = { 0 }; > > + struct 
netdev_rx_queue *rxq; > > > > pp.pool_size = bp->rx_agg_ring_size; > > if (BNXT_RX_PAGE_MODE(bp)) > > @@ -3621,8 +3644,15 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, > > pp.dev = &bp->pdev->dev; > > pp.dma_dir = bp->rx_dir; > > pp.max_len = PAGE_SIZE; > > - pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; > > + pp.order = 0; > > + > > + rxq = __netif_get_rx_queue(bp->dev, queue_idx); > > + if (rxq->mp_params.mp_priv) > > + pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_ALLOW_UNREADABLE_NETMEM; > > This is not the intended use of PP_FLAG_ALLOW_UNREADABLE_NETMEM. > > The driver should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when it's able > to handle unreadable netmem, it should not worry about whether > rxq->mp_params.mp_priv is set or not. > > You should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when HDS is enabled. > Let core figure out if mp_params.mp_priv is enabled. All the driver > needs to report is whether it's configured to be able to handle > unreadable netmem (which practically means HDS is enabled). The reason why the branch exists here is the PP_FLAG_ALLOW_UNREADABLE_NETMEM flag can't be used with PP_FLAG_DMA_SYNC_DEV. 228 if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) { 229 /* In order to request DMA-sync-for-device the page 230 * needs to be mapped 231 */ 232 if (!(pool->slow.flags & PP_FLAG_DMA_MAP)) 233 return -EINVAL; 234 235 if (!pool->p.max_len) 236 return -EINVAL; 237 238 pool->dma_sync = true; //here 239 240 /* pool->p.offset has to be set according to the address 241 * offset used by the DMA engine to start copying rx data 242 */ 243 } If PP_FLAG_DMA_SYNC_DEV is set, page->dma_sync is set to true. 347 int mp_dmabuf_devmem_init(struct page_pool *pool) 348 { 349 struct net_devmem_dmabuf_binding *binding = pool->mp_priv; 350 351 if (!binding) 352 return -EINVAL; 353 354 if (!pool->dma_map) 355 return -EOPNOTSUPP; 356 357 if (pool->dma_sync) //here 358 return -EOPNOTSUPP; 359 360 if (pool->p.order != 0) 361 return -E2BIG; 362 363 net_devmem_dmabuf_binding_get(binding); 364 return 0; 365 } In the mp_dmabuf_devmem_init(), it fails when pool->dma_sync is true. tcp-data-split can be used for normal cases, not only devmem TCP case. If we enable tcp-data-split and disable devmem TCP, page_pool doesn't have PP_FLAG_DMA_SYNC_DEV. So I think mp_params.mp_priv is still useful. 
Thanks a lot, Taehee Yoo > > > + else > > + pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; > > > > + pp.queue_idx = queue_idx; > > rxr->page_pool = page_pool_create(&pp); > > if (IS_ERR(rxr->page_pool)) { > > int err = PTR_ERR(rxr->page_pool); > > @@ -3655,7 +3685,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp) > > cpu_node = cpu_to_node(cpu); > > netdev_dbg(bp->dev, "Allocating page pool for rx_ring[%d] on numa_node: %d\n", > > i, cpu_node); > > - rc = bnxt_alloc_rx_page_pool(bp, rxr, cpu_node); > > + rc = bnxt_alloc_rx_page_pool(bp, rxr, i, cpu_node); > > if (rc) > > return rc; > > > > @@ -4154,7 +4184,7 @@ static void bnxt_alloc_one_rx_ring_page(struct bnxt *bp, > > > > prod = rxr->rx_agg_prod; > > for (i = 0; i < bp->rx_agg_ring_size; i++) { > > - if (bnxt_alloc_rx_page(bp, rxr, prod, GFP_KERNEL)) { > > + if (bnxt_alloc_rx_netmem(bp, rxr, prod, GFP_KERNEL)) { > > netdev_warn(bp->dev, "init'ed rx ring %d with %d/%d pages only\n", > > ring_nr, i, bp->rx_ring_size); > > break; > > @@ -15063,7 +15093,7 @@ static int bnxt_queue_mem_alloc(struct net_device *dev, void *qmem, int idx) > > clone->rx_sw_agg_prod = 0; > > clone->rx_next_cons = 0; > > > > - rc = bnxt_alloc_rx_page_pool(bp, clone, rxr->page_pool->p.nid); > > + rc = bnxt_alloc_rx_page_pool(bp, clone, idx, rxr->page_pool->p.nid); > > if (rc) > > return rc; > > > > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h > > index 48f390519c35..3cf57a3c7664 100644 > > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h > > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h > > @@ -895,7 +895,7 @@ struct bnxt_sw_rx_bd { > > }; > > > > struct bnxt_sw_rx_agg_bd { > > - struct page *page; > > + netmem_ref netmem; > > unsigned int offset; > > dma_addr_t mapping; > > }; > > -- > > 2.34.1 > > > > > -- > Thanks, > Mina
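Since the reply above agrees to add a netmem_is_pfmemalloc() helper to netmem.h in v4, here is a sketch of what such a helper could look like. The name follows the review comment, but the exact form and placement are assumptions until the v4 patch lands.

```c
/* possible helper for include/net/netmem.h (assumed shape) */
static inline bool netmem_is_pfmemalloc(netmem_ref netmem)
{
	/* net_iov-backed netmem (e.g. a dmabuf chunk) never comes from the
	 * pfmemalloc reserves, so only real pages need the check
	 */
	if (netmem_is_net_iov(netmem))
		return false;

	return page_is_pfmemalloc(netmem_to_page(netmem));
}
```

The driver check then reads "if (xdp && netmem_is_pfmemalloc(netmem))", leaving no netmem_to_page() call in bnxt.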
Hi Taehee,
kernel test robot noticed the following build warnings:
[auto build test WARNING on net-next/main]
url: https://github.com/intel-lab-lkp/linux/commits/Taehee-Yoo/bnxt_en-add-support-for-rx-copybreak-ethtool-command/20241004-000934
base: net-next/main
patch link: https://lore.kernel.org/r/20241003160620.1521626-8-ap420073%40gmail.com
patch subject: [PATCH net-next v3 7/7] bnxt_en: add support for device memory tcp
config: x86_64-kismet-CONFIG_NET_DEVMEM-CONFIG_BNXT-0-0 (https://download.01.org/0day-ci/archive/20241005/202410051156.r68SYo4V-lkp@intel.com/config)
reproduce: (https://download.01.org/0day-ci/archive/20241005/202410051156.r68SYo4V-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202410051156.r68SYo4V-lkp@intel.com/
kismet warnings: (new ones prefixed by >>)
>> kismet: WARNING: unmet direct dependencies detected for NET_DEVMEM when selected by BNXT
WARNING: unmet direct dependencies detected for NET_DEVMEM
Depends on [n]: NET [=y] && DMA_SHARED_BUFFER [=n] && GENERIC_ALLOCATOR [=y] && PAGE_POOL [=y]
Selected by [y]:
- BNXT [=y] && NETDEVICES [=y] && ETHERNET [=y] && NET_VENDOR_BROADCOM [=y] && PCI [=y] && PTP_1588_CLOCK_OPTIONAL [=y]
On 2024-10-03 09:06, Taehee Yoo wrote: > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c > index 872b15842b11..64e07d247f97 100644 > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c > @@ -55,6 +55,7 @@ > #include <net/page_pool/helpers.h> > #include <linux/align.h> > #include <net/netdev_queues.h> > +#include <net/netdev_rx_queue.h> > > #include "bnxt_hsi.h" > #include "bnxt.h" > @@ -863,6 +864,22 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int budget) > bnapi->events &= ~BNXT_TX_CMP_EVENT; > } > > +static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping, > + struct bnxt_rx_ring_info *rxr, > + unsigned int *offset, > + gfp_t gfp) gfp is unused > +{ > + netmem_ref netmem; > + > + netmem = page_pool_alloc_netmem(rxr->page_pool, GFP_ATOMIC); > + if (!netmem) > + return 0; > + *offset = 0; > + > + *mapping = page_pool_get_dma_addr_netmem(netmem) + *offset; offset is always 0 > + return netmem; > +} > + > static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping, > struct bnxt_rx_ring_info *rxr, > unsigned int *offset, [...] > @@ -1192,6 +1209,7 @@ static struct sk_buff *bnxt_rx_skb(struct bnxt *bp, > > static u32 __bnxt_rx_agg_pages(struct bnxt *bp, > struct bnxt_cp_ring_info *cpr, > + struct sk_buff *skb, > struct skb_shared_info *shinfo, > u16 idx, u32 agg_bufs, bool tpa, > struct xdp_buff *xdp) > @@ -1211,7 +1229,7 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, > u16 cons, frag_len; > struct rx_agg_cmp *agg; > struct bnxt_sw_rx_agg_bd *cons_rx_buf; > - struct page *page; > + netmem_ref netmem; > dma_addr_t mapping; > > if (p5_tpa) > @@ -1223,9 +1241,15 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, > RX_AGG_CMP_LEN) >> RX_AGG_CMP_LEN_SHIFT; > > cons_rx_buf = &rxr->rx_agg_ring[cons]; > - skb_frag_fill_page_desc(frag, cons_rx_buf->page, > - cons_rx_buf->offset, frag_len); > - shinfo->nr_frags = i + 1; > + if (skb) { > + skb_add_rx_frag_netmem(skb, i, cons_rx_buf->netmem, > + cons_rx_buf->offset, frag_len, > + BNXT_RX_PAGE_SIZE); > + } else { > + skb_frag_fill_page_desc(frag, netmem_to_page(cons_rx_buf->netmem), > + cons_rx_buf->offset, frag_len); > + shinfo->nr_frags = i + 1; > + } I feel like this function needs a refactor at some point to split out the skb and xdp paths. > __clear_bit(cons, rxr->rx_agg_bmap); > > /* It is possible for bnxt_alloc_rx_page() to allocate [...] 
> @@ -3608,9 +3629,11 @@ static void bnxt_free_rx_rings(struct bnxt *bp) > > static int bnxt_alloc_rx_page_pool(struct bnxt *bp, > struct bnxt_rx_ring_info *rxr, > + int queue_idx, To save a parameter, the index is available already in rxr->bnapi->index > int numa_node) > { > struct page_pool_params pp = { 0 }; > + struct netdev_rx_queue *rxq; > > pp.pool_size = bp->rx_agg_ring_size; > if (BNXT_RX_PAGE_MODE(bp)) > @@ -3621,8 +3644,15 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, > pp.dev = &bp->pdev->dev; > pp.dma_dir = bp->rx_dir; > pp.max_len = PAGE_SIZE; > - pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; > + pp.order = 0; > + > + rxq = __netif_get_rx_queue(bp->dev, queue_idx); > + if (rxq->mp_params.mp_priv) > + pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_ALLOW_UNREADABLE_NETMEM; > + else > + pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; > > + pp.queue_idx = queue_idx; > rxr->page_pool = page_pool_create(&pp); > if (IS_ERR(rxr->page_pool)) { > int err = PTR_ERR(rxr->page_pool); > @@ -3655,7 +3685,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp) > cpu_node = cpu_to_node(cpu); > netdev_dbg(bp->dev, "Allocating page pool for rx_ring[%d] on numa_node: %d\n", > i, cpu_node); > - rc = bnxt_alloc_rx_page_pool(bp, rxr, cpu_node); > + rc = bnxt_alloc_rx_page_pool(bp, rxr, i, cpu_node); > if (rc) > return rc; >
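Following the two comments above (the gfp argument is unused and the offset is always zero), a sketch of how __bnxt_alloc_rx_netmem() might look once they are addressed: the caller's gfp is passed through instead of hard-coding GFP_ATOMIC, and the always-zero offset out-parameter is dropped. This is a sketch, not the posted v3 code.

```c
static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping,
					 struct bnxt_rx_ring_info *rxr,
					 gfp_t gfp)
{
	netmem_ref netmem;

	netmem = page_pool_alloc_netmem(rxr->page_pool, gfp);
	if (!netmem)
		return 0;

	/* aggregation buffers always start at offset 0 within the page */
	*mapping = page_pool_get_dma_addr_netmem(netmem);
	return netmem;
}
```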
On 2024-10-04 03:34, Taehee Yoo wrote: > On Fri, Oct 4, 2024 at 3:43 AM Mina Almasry <almasrymina@google.com> wrote: >>> @@ -3608,9 +3629,11 @@ static void bnxt_free_rx_rings(struct bnxt *bp) >>> >>> static int bnxt_alloc_rx_page_pool(struct bnxt *bp, >>> struct bnxt_rx_ring_info *rxr, >>> + int queue_idx, >>> int numa_node) >>> { >>> struct page_pool_params pp = { 0 }; >>> + struct netdev_rx_queue *rxq; >>> >>> pp.pool_size = bp->rx_agg_ring_size; >>> if (BNXT_RX_PAGE_MODE(bp)) >>> @@ -3621,8 +3644,15 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, >>> pp.dev = &bp->pdev->dev; >>> pp.dma_dir = bp->rx_dir; >>> pp.max_len = PAGE_SIZE; >>> - pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; >>> + pp.order = 0; >>> + >>> + rxq = __netif_get_rx_queue(bp->dev, queue_idx); >>> + if (rxq->mp_params.mp_priv) >>> + pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_ALLOW_UNREADABLE_NETMEM; >> >> This is not the intended use of PP_FLAG_ALLOW_UNREADABLE_NETMEM. >> >> The driver should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when it's able >> to handle unreadable netmem, it should not worry about whether >> rxq->mp_params.mp_priv is set or not. >> >> You should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when HDS is enabled. >> Let core figure out if mp_params.mp_priv is enabled. All the driver >> needs to report is whether it's configured to be able to handle >> unreadable netmem (which practically means HDS is enabled). > > The reason why the branch exists here is the PP_FLAG_ALLOW_UNREADABLE_NETMEM > flag can't be used with PP_FLAG_DMA_SYNC_DEV. > > 228 if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) { > 229 /* In order to request DMA-sync-for-device the page > 230 * needs to be mapped > 231 */ > 232 if (!(pool->slow.flags & PP_FLAG_DMA_MAP)) > 233 return -EINVAL; > 234 > 235 if (!pool->p.max_len) > 236 return -EINVAL; > 237 > 238 pool->dma_sync = true; //here > 239 > 240 /* pool->p.offset has to be set according to the address > 241 * offset used by the DMA engine to start copying rx data > 242 */ > 243 } > > If PP_FLAG_DMA_SYNC_DEV is set, page->dma_sync is set to true. > > 347 int mp_dmabuf_devmem_init(struct page_pool *pool) > 348 { > 349 struct net_devmem_dmabuf_binding *binding = pool->mp_priv; > 350 > 351 if (!binding) > 352 return -EINVAL; > 353 > 354 if (!pool->dma_map) > 355 return -EOPNOTSUPP; > 356 > 357 if (pool->dma_sync) //here > 358 return -EOPNOTSUPP; > 359 > 360 if (pool->p.order != 0) > 361 return -E2BIG; > 362 > 363 net_devmem_dmabuf_binding_get(binding); > 364 return 0; > 365 } > > In the mp_dmabuf_devmem_init(), it fails when pool->dma_sync is true. This won't work for io_uring zero copy into user memory. We need all PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | PP_FLAG_ALLOW_UNREADABLE_NETMEM set. I agree with Mina that the driver should not be poking at the mp_priv fields. How about setting all the flags and then letting the mp->init() figure it out? mp_dmabuf_devmem_init() is called within page_pool_init() so as long as it resets dma_sync if set I don't see any issues. > > tcp-data-split can be used for normal cases, not only devmem TCP case. > If we enable tcp-data-split and disable devmem TCP, page_pool doesn't > have PP_FLAG_DMA_SYNC_DEV. > So I think mp_params.mp_priv is still useful. > > Thanks a lot, > Taehee Yoo > >> >> >> -- >> Thanks, >> Mina
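Putting the two suggestions together (take the queue index from rxr->bnapi->index rather than a new parameter, and set all the flags unconditionally, letting the core or the memory provider sort out dma_sync), a sketch of what bnxt_alloc_rx_page_pool() could look like. It relies on the core-side change discussed later in the thread to drop the sync-for-device duty when a provider is bound; the parts not visible in the quoted diff (nid, napi, netdev, pool_size adjustment) are carried over from the existing driver function and are assumptions here.

```c
static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
				   struct bnxt_rx_ring_info *rxr,
				   int numa_node)
{
	struct page_pool_params pp = { 0 };

	pp.pool_size = bp->rx_agg_ring_size;
	if (BNXT_RX_PAGE_MODE(bp))
		pp.pool_size += bp->rx_ring_size;
	pp.nid = numa_node;
	pp.napi = &rxr->bnapi->napi;
	pp.netdev = bp->dev;
	pp.dev = &bp->pdev->dev;
	pp.dma_dir = bp->rx_dir;
	pp.max_len = PAGE_SIZE;
	/* request everything; core clears DMA_SYNC_DEV if a memory
	 * provider takes over syncing (assumed core behaviour)
	 */
	pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV |
		   PP_FLAG_ALLOW_UNREADABLE_NETMEM;
	pp.queue_idx = rxr->bnapi->index;

	rxr->page_pool = page_pool_create(&pp);
	if (IS_ERR(rxr->page_pool)) {
		int err = PTR_ERR(rxr->page_pool);

		rxr->page_pool = NULL;
		return err;
	}
	return 0;
}
```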
On Tue, Oct 8, 2024 at 11:45 AM David Wei <dw@davidwei.uk> wrote: > Hi David, Thanks a lot for your review! > On 2024-10-03 09:06, Taehee Yoo wrote: > > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c > > index 872b15842b11..64e07d247f97 100644 > > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c > > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c > > @@ -55,6 +55,7 @@ > > #include <net/page_pool/helpers.h> > > #include <linux/align.h> > > #include <net/netdev_queues.h> > > +#include <net/netdev_rx_queue.h> > > > > #include "bnxt_hsi.h" > > #include "bnxt.h" > > @@ -863,6 +864,22 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int budget) > > bnapi->events &= ~BNXT_TX_CMP_EVENT; > > } > > > > +static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping, > > + struct bnxt_rx_ring_info *rxr, > > + unsigned int *offset, > > + gfp_t gfp) > > gfp is unused I will remove unnecessary gfp parameter in v4. > > > +{ > > + netmem_ref netmem; > > + > > + netmem = page_pool_alloc_netmem(rxr->page_pool, GFP_ATOMIC); > > + if (!netmem) > > + return 0; > > + *offset = 0; > > + > > + *mapping = page_pool_get_dma_addr_netmem(netmem) + *offset; > > offset is always 0 Okay, I will remove this too in v4. > > > + return netmem; > > +} > > + > > static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping, > > struct bnxt_rx_ring_info *rxr, > > unsigned int *offset, > > [...] > > > @@ -1192,6 +1209,7 @@ static struct sk_buff *bnxt_rx_skb(struct bnxt *bp, > > > > static u32 __bnxt_rx_agg_pages(struct bnxt *bp, > > struct bnxt_cp_ring_info *cpr, > > + struct sk_buff *skb, > > struct skb_shared_info *shinfo, > > u16 idx, u32 agg_bufs, bool tpa, > > struct xdp_buff *xdp) > > @@ -1211,7 +1229,7 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, > > u16 cons, frag_len; > > struct rx_agg_cmp *agg; > > struct bnxt_sw_rx_agg_bd *cons_rx_buf; > > - struct page *page; > > + netmem_ref netmem; > > dma_addr_t mapping; > > > > if (p5_tpa) > > @@ -1223,9 +1241,15 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, > > RX_AGG_CMP_LEN) >> RX_AGG_CMP_LEN_SHIFT; > > > > cons_rx_buf = &rxr->rx_agg_ring[cons]; > > - skb_frag_fill_page_desc(frag, cons_rx_buf->page, > > - cons_rx_buf->offset, frag_len); > > - shinfo->nr_frags = i + 1; > > + if (skb) { > > + skb_add_rx_frag_netmem(skb, i, cons_rx_buf->netmem, > > + cons_rx_buf->offset, frag_len, > > + BNXT_RX_PAGE_SIZE); > > + } else { > > + skb_frag_fill_page_desc(frag, netmem_to_page(cons_rx_buf->netmem), > > + cons_rx_buf->offset, frag_len); > > + shinfo->nr_frags = i + 1; > > + } > > I feel like this function needs a refactor at some point to split out > the skb and xdp paths. Okay, I will add __bnxt_rx_agg_netmem() in v4 patch. > > > __clear_bit(cons, rxr->rx_agg_bmap); > > > > /* It is possible for bnxt_alloc_rx_page() to allocate > > [...] > > > @@ -3608,9 +3629,11 @@ static void bnxt_free_rx_rings(struct bnxt *bp) > > > > static int bnxt_alloc_rx_page_pool(struct bnxt *bp, > > struct bnxt_rx_ring_info *rxr, > > + int queue_idx, > > To save a parameter, the index is available already in rxr->bnapi->index Okay, I also remove the queue_idx parameter in v4. 
> > > int numa_node) > > { > > struct page_pool_params pp = { 0 }; > > + struct netdev_rx_queue *rxq; > > > > pp.pool_size = bp->rx_agg_ring_size; > > if (BNXT_RX_PAGE_MODE(bp)) > > @@ -3621,8 +3644,15 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, > > pp.dev = &bp->pdev->dev; > > pp.dma_dir = bp->rx_dir; > > pp.max_len = PAGE_SIZE; > > - pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; > > + pp.order = 0; > > + > > + rxq = __netif_get_rx_queue(bp->dev, queue_idx); > > + if (rxq->mp_params.mp_priv) > > + pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_ALLOW_UNREADABLE_NETMEM; > > + else > > + pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; > > > > + pp.queue_idx = queue_idx; > > rxr->page_pool = page_pool_create(&pp); > > if (IS_ERR(rxr->page_pool)) { > > int err = PTR_ERR(rxr->page_pool); > > @@ -3655,7 +3685,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp) > > cpu_node = cpu_to_node(cpu); > > netdev_dbg(bp->dev, "Allocating page pool for rx_ring[%d] on numa_node: %d\n", > > i, cpu_node); > > - rc = bnxt_alloc_rx_page_pool(bp, rxr, cpu_node); > > + rc = bnxt_alloc_rx_page_pool(bp, rxr, i, cpu_node); > > if (rc) > > return rc; > > Thanks a lot for catching things, I will send v4 if there is no problem after some tests. Thanks! Taehee Yoo
On Tue, Oct 8, 2024 at 12:54 PM Taehee Yoo <ap420073@gmail.com> wrote: > > On Tue, Oct 8, 2024 at 11:45 AM David Wei <dw@davidwei.uk> wrote: > > > > Hi David, > Thanks a lot for your review! > > > On 2024-10-03 09:06, Taehee Yoo wrote: > > > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c > > > index 872b15842b11..64e07d247f97 100644 > > > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c > > > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c > > > @@ -55,6 +55,7 @@ > > > #include <net/page_pool/helpers.h> > > > #include <linux/align.h> > > > #include <net/netdev_queues.h> > > > +#include <net/netdev_rx_queue.h> > > > > > > #include "bnxt_hsi.h" > > > #include "bnxt.h" > > > @@ -863,6 +864,22 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int budget) > > > bnapi->events &= ~BNXT_TX_CMP_EVENT; > > > } > > > > > > +static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping, > > > + struct bnxt_rx_ring_info *rxr, > > > + unsigned int *offset, > > > + gfp_t gfp) > > > > gfp is unused > > I will remove unnecessary gfp parameter in v4. Oh sorry, I will use gfp parameter, not remove it. > > > > > > +{ > > > + netmem_ref netmem; > > > + > > > + netmem = page_pool_alloc_netmem(rxr->page_pool, GFP_ATOMIC); > > > + if (!netmem) > > > + return 0; > > > + *offset = 0; > > > + > > > + *mapping = page_pool_get_dma_addr_netmem(netmem) + *offset; > > > > offset is always 0 > > Okay, I will remove this too in v4. > > > > > > + return netmem; > > > +} > > > + > > > static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping, > > > struct bnxt_rx_ring_info *rxr, > > > unsigned int *offset, > > > > [...] > > > > > @@ -1192,6 +1209,7 @@ static struct sk_buff *bnxt_rx_skb(struct bnxt *bp, > > > > > > static u32 __bnxt_rx_agg_pages(struct bnxt *bp, > > > struct bnxt_cp_ring_info *cpr, > > > + struct sk_buff *skb, > > > struct skb_shared_info *shinfo, > > > u16 idx, u32 agg_bufs, bool tpa, > > > struct xdp_buff *xdp) > > > @@ -1211,7 +1229,7 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, > > > u16 cons, frag_len; > > > struct rx_agg_cmp *agg; > > > struct bnxt_sw_rx_agg_bd *cons_rx_buf; > > > - struct page *page; > > > + netmem_ref netmem; > > > dma_addr_t mapping; > > > > > > if (p5_tpa) > > > @@ -1223,9 +1241,15 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, > > > RX_AGG_CMP_LEN) >> RX_AGG_CMP_LEN_SHIFT; > > > > > > cons_rx_buf = &rxr->rx_agg_ring[cons]; > > > - skb_frag_fill_page_desc(frag, cons_rx_buf->page, > > > - cons_rx_buf->offset, frag_len); > > > - shinfo->nr_frags = i + 1; > > > + if (skb) { > > > + skb_add_rx_frag_netmem(skb, i, cons_rx_buf->netmem, > > > + cons_rx_buf->offset, frag_len, > > > + BNXT_RX_PAGE_SIZE); > > > + } else { > > > + skb_frag_fill_page_desc(frag, netmem_to_page(cons_rx_buf->netmem), > > > + cons_rx_buf->offset, frag_len); > > > + shinfo->nr_frags = i + 1; > > > + } > > > > I feel like this function needs a refactor at some point to split out > > the skb and xdp paths. > > Okay, I will add __bnxt_rx_agg_netmem() in v4 patch. > > > > > > __clear_bit(cons, rxr->rx_agg_bmap); > > > > > > /* It is possible for bnxt_alloc_rx_page() to allocate > > > > [...] 
> > > > > @@ -3608,9 +3629,11 @@ static void bnxt_free_rx_rings(struct bnxt *bp) > > > > > > static int bnxt_alloc_rx_page_pool(struct bnxt *bp, > > > struct bnxt_rx_ring_info *rxr, > > > + int queue_idx, > > > > To save a parameter, the index is available already in rxr->bnapi->index > > Okay, I also remove the queue_idx parameter in v4. > > > > > > int numa_node) > > > { > > > struct page_pool_params pp = { 0 }; > > > + struct netdev_rx_queue *rxq; > > > > > > pp.pool_size = bp->rx_agg_ring_size; > > > if (BNXT_RX_PAGE_MODE(bp)) > > > @@ -3621,8 +3644,15 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, > > > pp.dev = &bp->pdev->dev; > > > pp.dma_dir = bp->rx_dir; > > > pp.max_len = PAGE_SIZE; > > > - pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; > > > + pp.order = 0; > > > + > > > + rxq = __netif_get_rx_queue(bp->dev, queue_idx); > > > + if (rxq->mp_params.mp_priv) > > > + pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_ALLOW_UNREADABLE_NETMEM; > > > + else > > > + pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; > > > > > > + pp.queue_idx = queue_idx; > > > rxr->page_pool = page_pool_create(&pp); > > > if (IS_ERR(rxr->page_pool)) { > > > int err = PTR_ERR(rxr->page_pool); > > > @@ -3655,7 +3685,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp) > > > cpu_node = cpu_to_node(cpu); > > > netdev_dbg(bp->dev, "Allocating page pool for rx_ring[%d] on numa_node: %d\n", > > > i, cpu_node); > > > - rc = bnxt_alloc_rx_page_pool(bp, rxr, cpu_node); > > > + rc = bnxt_alloc_rx_page_pool(bp, rxr, i, cpu_node); > > > if (rc) > > > return rc; > > > > > Thanks a lot for catching things, > I will send v4 if there is no problem after some tests. > > Thanks! > Taehee Yoo
On Fri, 4 Oct 2024 19:34:45 +0900 Taehee Yoo wrote: > > Our intention with the whole netmem design is that drivers should > > never have to call netmem_to_page(). I.e. the driver should use netmem > > unaware of whether it's page or non-page underneath, to minimize > > complexity driver needs to handle. > > > > This netmem_to_page() call can be removed by using > > skb_frag_fill_netmem_desc() instead of the page variant. But, more > > improtantly, why did the code change here? The code before calls > > skb_frag_fill_page_desc, but the new code sometimes will > > skb_frag_fill_netmem_desc() and sometimes will skb_add_rx_frag_netmem. > > I'm not sure why that logic changed. > > The reason why skb_add_rx_frag_netmem() is used here is to set > skb->unreadable flag. the skb_frag_fill_netmem_desc() doesn't set > skb->unreadable because it doesn't handle skb, it only handles frag. > As far as I know, skb->unreadable should be set to true for devmem > TCP, am I misunderstood? > I tested that don't using skb_add_rx_frag_netmem() here, and it > immediately fails. Yes, but netmem_ref can be either a net_iov or a normal page, and skb_add_rx_frag_netmem() and similar helpers should automatically set skb->unreadable or not. IOW you should be able to always use netmem-aware APIs, no? > > This is not the intended use of PP_FLAG_ALLOW_UNREADABLE_NETMEM. > > > > The driver should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when it's able > > to handle unreadable netmem, it should not worry about whether > > rxq->mp_params.mp_priv is set or not. > > > > You should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when HDS is enabled. > > Let core figure out if mp_params.mp_priv is enabled. All the driver > > needs to report is whether it's configured to be able to handle > > unreadable netmem (which practically means HDS is enabled). > > The reason why the branch exists here is the PP_FLAG_ALLOW_UNREADABLE_NETMEM > flag can't be used with PP_FLAG_DMA_SYNC_DEV. Hm. Isn't the existing check the wrong way around? Is the driver supposed to sync the buffers for device before passing them down?
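To illustrate the point that the netmem-aware skb helpers can take care of skb->unreadable themselves, a simplified sketch of that behaviour (not the actual net/core/skbuff.c implementation, and the function name here is illustrative only): the helper inspects the netmem_ref and the caller never needs to know what backs it.

```c
static void example_add_rx_frag_netmem(struct sk_buff *skb, int i,
					netmem_ref netmem, int off, int size,
					unsigned int truesize)
{
	skb_frag_fill_netmem_desc(&skb_shinfo(skb)->frags[i], netmem,
				  off, size);
	skb_shinfo(skb)->nr_frags = i + 1;

	/* net_iov (e.g. dmabuf) frags have no kernel mapping */
	if (netmem_is_net_iov(netmem))
		skb->unreadable = true;

	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
```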
On Tue, Oct 8, 2024 at 11:57 AM David Wei <dw@davidwei.uk> wrote: > > On 2024-10-04 03:34, Taehee Yoo wrote: > > On Fri, Oct 4, 2024 at 3:43 AM Mina Almasry <almasrymina@google.com> wrote: > >>> @@ -3608,9 +3629,11 @@ static void bnxt_free_rx_rings(struct bnxt *bp) > >>> > >>> static int bnxt_alloc_rx_page_pool(struct bnxt *bp, > >>> struct bnxt_rx_ring_info *rxr, > >>> + int queue_idx, > >>> int numa_node) > >>> { > >>> struct page_pool_params pp = { 0 }; > >>> + struct netdev_rx_queue *rxq; > >>> > >>> pp.pool_size = bp->rx_agg_ring_size; > >>> if (BNXT_RX_PAGE_MODE(bp)) > >>> @@ -3621,8 +3644,15 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, > >>> pp.dev = &bp->pdev->dev; > >>> pp.dma_dir = bp->rx_dir; > >>> pp.max_len = PAGE_SIZE; > >>> - pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; > >>> + pp.order = 0; > >>> + > >>> + rxq = __netif_get_rx_queue(bp->dev, queue_idx); > >>> + if (rxq->mp_params.mp_priv) > >>> + pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_ALLOW_UNREADABLE_NETMEM; > >> > >> This is not the intended use of PP_FLAG_ALLOW_UNREADABLE_NETMEM. > >> > >> The driver should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when it's able > >> to handle unreadable netmem, it should not worry about whether > >> rxq->mp_params.mp_priv is set or not. > >> > >> You should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when HDS is enabled. > >> Let core figure out if mp_params.mp_priv is enabled. All the driver > >> needs to report is whether it's configured to be able to handle > >> unreadable netmem (which practically means HDS is enabled). > > > > The reason why the branch exists here is the PP_FLAG_ALLOW_UNREADABLE_NETMEM > > flag can't be used with PP_FLAG_DMA_SYNC_DEV. > > > > 228 if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) { > > 229 /* In order to request DMA-sync-for-device the page > > 230 * needs to be mapped > > 231 */ > > 232 if (!(pool->slow.flags & PP_FLAG_DMA_MAP)) > > 233 return -EINVAL; > > 234 > > 235 if (!pool->p.max_len) > > 236 return -EINVAL; > > 237 > > 238 pool->dma_sync = true; //here > > 239 > > 240 /* pool->p.offset has to be set according to the address > > 241 * offset used by the DMA engine to start copying rx data > > 242 */ > > 243 } > > > > If PP_FLAG_DMA_SYNC_DEV is set, page->dma_sync is set to true. > > > > 347 int mp_dmabuf_devmem_init(struct page_pool *pool) > > 348 { > > 349 struct net_devmem_dmabuf_binding *binding = pool->mp_priv; > > 350 > > 351 if (!binding) > > 352 return -EINVAL; > > 353 > > 354 if (!pool->dma_map) > > 355 return -EOPNOTSUPP; > > 356 > > 357 if (pool->dma_sync) //here > > 358 return -EOPNOTSUPP; > > 359 > > 360 if (pool->p.order != 0) > > 361 return -E2BIG; > > 362 > > 363 net_devmem_dmabuf_binding_get(binding); > > 364 return 0; > > 365 } > > > > In the mp_dmabuf_devmem_init(), it fails when pool->dma_sync is true. > > This won't work for io_uring zero copy into user memory. We need all > PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | PP_FLAG_ALLOW_UNREADABLE_NETMEM > set. > > I agree with Mina that the driver should not be poking at the mp_priv > fields. How about setting all the flags and then letting the mp->init() > figure it out? mp_dmabuf_devmem_init() is called within page_pool_init() > so as long as it resets dma_sync if set I don't see any issues. > Ah, I haven't thought the failure of PP_FLAG_DMA_SYNC_DEV for dmabuf may be wrong. IIUC this flag indicates sync between device and CPU. But device memory TCP is not related to sync between device and CPU. So, I think we need to remove this condition check code in the core. 
What do you think about it? > > > > tcp-data-split can be used for normal cases, not only devmem TCP case. > > If we enable tcp-data-split and disable devmem TCP, page_pool doesn't > > have PP_FLAG_DMA_SYNC_DEV. > > So I think mp_params.mp_priv is still useful. > > > > Thanks a lot, > > Taehee Yoo > > > >> > >> > >> -- > >> Thanks, > >> Mina
On Wed, Oct 9, 2024 at 4:50 AM Jakub Kicinski <kuba@kernel.org> wrote: > > On Fri, 4 Oct 2024 19:34:45 +0900 Taehee Yoo wrote: > > > Our intention with the whole netmem design is that drivers should > > > never have to call netmem_to_page(). I.e. the driver should use netmem > > > unaware of whether it's page or non-page underneath, to minimize > > > complexity driver needs to handle. > > > > > > This netmem_to_page() call can be removed by using > > > skb_frag_fill_netmem_desc() instead of the page variant. But, more > > > improtantly, why did the code change here? The code before calls > > > skb_frag_fill_page_desc, but the new code sometimes will > > > skb_frag_fill_netmem_desc() and sometimes will skb_add_rx_frag_netmem. > > > I'm not sure why that logic changed. > > > > The reason why skb_add_rx_frag_netmem() is used here is to set > > skb->unreadable flag. the skb_frag_fill_netmem_desc() doesn't set > > skb->unreadable because it doesn't handle skb, it only handles frag. > > As far as I know, skb->unreadable should be set to true for devmem > > TCP, am I misunderstood? > > I tested that don't using skb_add_rx_frag_netmem() here, and it > > immediately fails. > > Yes, but netmem_ref can be either a net_iov or a normal page, > and skb_add_rx_frag_netmem() and similar helpers should automatically > set skb->unreadable or not. > > IOW you should be able to always use netmem-aware APIs, no? I'm not sure the update skb->unreadable flag is possible because frag API like skb_add_rx_frag_netmem(), receives only frag, not skb. How about an additional API to update skb->unreadable flag? skb_update_unreadable() or skb_update_netmem()? > > > > This is not the intended use of PP_FLAG_ALLOW_UNREADABLE_NETMEM. > > > > > > The driver should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when it's able > > > to handle unreadable netmem, it should not worry about whether > > > rxq->mp_params.mp_priv is set or not. > > > > > > You should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when HDS is enabled. > > > Let core figure out if mp_params.mp_priv is enabled. All the driver > > > needs to report is whether it's configured to be able to handle > > > unreadable netmem (which practically means HDS is enabled). > > > > The reason why the branch exists here is the PP_FLAG_ALLOW_UNREADABLE_NETMEM > > flag can't be used with PP_FLAG_DMA_SYNC_DEV. > > Hm. Isn't the existing check the wrong way around? Is the driver > supposed to sync the buffers for device before passing them down? I haven't thought the failure of PP_FLAG_DMA_SYNC_DEV for dmabuf may be wrong. I think device memory TCP is not related to this flag. So device memory TCP core API should not return failure when PP_FLAG_DMA_SYNC_DEV flag is set. How about removing this condition check code in device memory TCP core?
On Thu, 10 Oct 2024 00:37:49 +0900 Taehee Yoo wrote: > > Yes, but netmem_ref can be either a net_iov or a normal page, > > and skb_add_rx_frag_netmem() and similar helpers should automatically > > set skb->unreadable or not. > > > > IOW you should be able to always use netmem-aware APIs, no? > > I'm not sure the update skb->unreadable flag is possible because > frag API like skb_add_rx_frag_netmem(), receives only frag, not skb. > How about an additional API to update skb->unreadable flag? > skb_update_unreadable() or skb_update_netmem()? Ah, the case where we don't get skb is because we're just building XDP frame at that stage. And XDP can't be netmem. In that case switching to skb_frag_fill_netmem_desc() should be enough. > > > The reason why the branch exists here is the PP_FLAG_ALLOW_UNREADABLE_NETMEM > > > flag can't be used with PP_FLAG_DMA_SYNC_DEV. > > > > Hm. Isn't the existing check the wrong way around? Is the driver > > supposed to sync the buffers for device before passing them down? > > I haven't thought the failure of PP_FLAG_DMA_SYNC_DEV > for dmabuf may be wrong. > I think device memory TCP is not related to this flag. > So device memory TCP core API should not return failure when > PP_FLAG_DMA_SYNC_DEV flag is set. > How about removing this condition check code in device memory TCP core? I think we need to invert the check.. Mina, WDYT? diff --git a/net/core/devmem.c b/net/core/devmem.c index 11b91c12ee11..c5cace3f9831 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -331,12 +331,6 @@ int mp_dmabuf_devmem_init(struct page_pool *pool) if (!binding) return -EINVAL; - if (!pool->dma_map) - return -EOPNOTSUPP; - - if (pool->dma_sync) - return -EOPNOTSUPP; - if (pool->p.order != 0) return -E2BIG; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index a813d30d2135..c8dbbf262de3 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -287,6 +287,12 @@ static int page_pool_init(struct page_pool *pool, } if (pool->mp_priv) { + if (!pool->dma_map || !pool->dma_sync) + return -EOPNOTSUPP; + + /* Memory provider is responsible for syncing the pages. */ + pool->dma_sync = 0; + err = mp_dmabuf_devmem_init(pool); if (err) { pr_warn("%s() mem-provider init failed %d\n", __func__,
On Wed, Oct 9, 2024 at 5:01 PM Jakub Kicinski <kuba@kernel.org> wrote: > > On Thu, 10 Oct 2024 00:37:49 +0900 Taehee Yoo wrote: > > > Yes, but netmem_ref can be either a net_iov or a normal page, > > > and skb_add_rx_frag_netmem() and similar helpers should automatically > > > set skb->unreadable or not. > > > > > > IOW you should be able to always use netmem-aware APIs, no? > > > > I'm not sure the update skb->unreadable flag is possible because > > frag API like skb_add_rx_frag_netmem(), receives only frag, not skb. > > How about an additional API to update skb->unreadable flag? > > skb_update_unreadable() or skb_update_netmem()? > > Ah, the case where we don't get skb is because we're just building XDP > frame at that stage. And XDP can't be netmem. > > In that case switching to skb_frag_fill_netmem_desc() should be enough. > > > > > The reason why the branch exists here is the PP_FLAG_ALLOW_UNREADABLE_NETMEM > > > > flag can't be used with PP_FLAG_DMA_SYNC_DEV. > > > > > > Hm. Isn't the existing check the wrong way around? Is the driver > > > supposed to sync the buffers for device before passing them down? > > > > I haven't thought the failure of PP_FLAG_DMA_SYNC_DEV > > for dmabuf may be wrong. > > I think device memory TCP is not related to this flag. > > So device memory TCP core API should not return failure when > > PP_FLAG_DMA_SYNC_DEV flag is set. > > How about removing this condition check code in device memory TCP core? > > I think we need to invert the check.. > Mina, WDYT? > On a closer look, my feeling is similar to Taehee, PP_FLAG_DMA_SYNC_DEV should be orthogonal to memory providers. The memory providers allocate the memory and provide the dma-addr, but need not dma-sync the dma-addr, right? The driver can sync the dma-addr if it wants and the driver can delegate the syncing to the pp via PP_FLAG_DMA_SYNC_DEV if it wants. AFAICT I think the check should be removed, not inverted, but I could be missing something.
On Thu, 10 Oct 2024 10:44:38 -0700 Mina Almasry wrote: > > > I haven't thought the failure of PP_FLAG_DMA_SYNC_DEV > > > for dmabuf may be wrong. > > > I think device memory TCP is not related to this flag. > > > So device memory TCP core API should not return failure when > > > PP_FLAG_DMA_SYNC_DEV flag is set. > > > How about removing this condition check code in device memory TCP core? > > > > I think we need to invert the check.. > > Mina, WDYT? > > On a closer look, my feeling is similar to Taehee, > PP_FLAG_DMA_SYNC_DEV should be orthogonal to memory providers. The > memory providers allocate the memory and provide the dma-addr, but > need not dma-sync the dma-addr, right? The driver can sync the > dma-addr if it wants and the driver can delegate the syncing to the pp > via PP_FLAG_DMA_SYNC_DEV if it wants. AFAICT I think the check should > be removed, not inverted, but I could be missing something. I don't know much about dmabuf but it hinges on the question whether doing DMA sync for device on a dmabuf address is : - a good thing - a noop - a bad thing If it's a good thing or a noop - agreed. Similar question for the sync for CPU. I agree that intuitively it should be all fine. But the fact that dmabuf has a bespoke API for accessing the memory by the CPU makes me worried that there may be assumptions about these addresses not getting randomly fed into the normal DMA API..
On Thu, Oct 10, 2024 at 6:34 PM Jakub Kicinski <kuba@kernel.org> wrote: > > On Thu, 10 Oct 2024 10:44:38 -0700 Mina Almasry wrote: > > > > I haven't thought the failure of PP_FLAG_DMA_SYNC_DEV > > > > for dmabuf may be wrong. > > > > I think device memory TCP is not related to this flag. > > > > So device memory TCP core API should not return failure when > > > > PP_FLAG_DMA_SYNC_DEV flag is set. > > > > How about removing this condition check code in device memory TCP core? > > > > > > I think we need to invert the check.. > > > Mina, WDYT? > > > > On a closer look, my feeling is similar to Taehee, > > PP_FLAG_DMA_SYNC_DEV should be orthogonal to memory providers. The > > memory providers allocate the memory and provide the dma-addr, but > > need not dma-sync the dma-addr, right? The driver can sync the > > dma-addr if it wants and the driver can delegate the syncing to the pp > > via PP_FLAG_DMA_SYNC_DEV if it wants. AFAICT I think the check should > > be removed, not inverted, but I could be missing something. > > I don't know much about dmabuf but it hinges on the question whether > doing DMA sync for device on a dmabuf address is : > - a good thing > - a noop > - a bad thing > > If it's a good thing or a noop - agreed. > > Similar question for the sync for CPU. > > I agree that intuitively it should be all fine. But the fact that dmabuf > has a bespoke API for accessing the memory by the CPU makes me worried > that there may be assumptions about these addresses not getting > randomly fed into the normal DMA API.. Sorry I'm also a bit unsure what is the right thing to do here. The code that we've been running in GVE does a dma-sync for cpu unconditionally on RX for dma-buf and non-dmabuf dma-addrs and we haven't been seeing issues. It never does dma-sync for device. My first question is why is dma-sync for device needed on RX path at all for some drivers in the first place? For incoming (non-dmabuf) data, the data is written by the device and read by the cpu, so sync for cpu is really what's needed. Is the sync for device for XDP? Or is it that buffers should be dma-syncd for device before they are re-posted to the NIC? Christian/Jason, sorry quick question: are dma_sync_single_for_{device|cpu} needed or wanted when the dma-addrs come from a dma-buf? Or these dma-addrs to be treated like any other with the normal dma_sync_for_{device|cpu} rules? -- Thanks, Mina
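For context on why RX drivers sync for device at all: with PP_FLAG_DMA_SYNC_DEV the driver delegates the "sync before re-posting the buffer to the NIC" step to the page pool. A simplified sketch of that step (paraphrased from net/core/page_pool.c, not a verbatim copy; the function name here is made up) looks roughly like this:

/* Simplified sketch of the sync the page pool performs on behalf of
 * drivers that set PP_FLAG_DMA_SYNC_DEV, before a buffer is handed
 * back to the NIC.
 */
static void sketch_dma_sync_for_device(const struct page_pool *pool,
				       netmem_ref netmem, u32 dma_sync_size)
{
	dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);

	dma_sync_size = min(dma_sync_size, pool->p.max_len);
	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
					 pool->p.offset, dma_sync_size,
					 pool->p.dma_dir);
}

It is this device-direction sync on recycled buffers that becomes questionable once the dma-addr comes from a dmabuf, which is what the rest of the thread discusses.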
On Fri, Oct 11, 2024 at 10:33:43AM -0700, Mina Almasry wrote: > On Thu, Oct 10, 2024 at 6:34 PM Jakub Kicinski <kuba@kernel.org> wrote: > > > > On Thu, 10 Oct 2024 10:44:38 -0700 Mina Almasry wrote: > > > > > I haven't thought the failure of PP_FLAG_DMA_SYNC_DEV > > > > > for dmabuf may be wrong. > > > > > I think device memory TCP is not related to this flag. > > > > > So device memory TCP core API should not return failure when > > > > > PP_FLAG_DMA_SYNC_DEV flag is set. > > > > > How about removing this condition check code in device memory TCP core? > > > > > > > > I think we need to invert the check.. > > > > Mina, WDYT? > > > > > > On a closer look, my feeling is similar to Taehee, > > > PP_FLAG_DMA_SYNC_DEV should be orthogonal to memory providers. The > > > memory providers allocate the memory and provide the dma-addr, but > > > need not dma-sync the dma-addr, right? The driver can sync the > > > dma-addr if it wants and the driver can delegate the syncing to the pp > > > via PP_FLAG_DMA_SYNC_DEV if it wants. AFAICT I think the check should > > > be removed, not inverted, but I could be missing something. > > > > I don't know much about dmabuf but it hinges on the question whether > > doing DMA sync for device on a dmabuf address is : > > - a good thing > > - a noop > > - a bad thing > > > > If it's a good thing or a noop - agreed. > > > > Similar question for the sync for CPU. > > > > I agree that intuitively it should be all fine. But the fact that dmabuf > > has a bespoke API for accessing the memory by the CPU makes me worried > > that there may be assumptions about these addresses not getting > > randomly fed into the normal DMA API.. > > Sorry I'm also a bit unsure what is the right thing to do here. The > code that we've been running in GVE does a dma-sync for cpu > unconditionally on RX for dma-buf and non-dmabuf dma-addrs and we > haven't been seeing issues. It never does dma-sync for device. > > My first question is why is dma-sync for device needed on RX path at > all for some drivers in the first place? For incoming (non-dmabuf) > data, the data is written by the device and read by the cpu, so sync > for cpu is really what's needed. Is the sync for device for XDP? Or is > it that buffers should be dma-syncd for device before they are > re-posted to the NIC? > > Christian/Jason, sorry quick question: are > dma_sync_single_for_{device|cpu} needed or wanted when the dma-addrs > come from a dma-buf? Or these dma-addrs to be treated like any other > with the normal dma_sync_for_{device|cpu} rules? Um, I think because dma-buf hacks things up and generates illegal scatterlist entries with weird dma_map_resource() addresses for the typical P2P case the dma sync API should not be used on those things. However, there is no way to know if the dma-buf has does this, and there are valid case where the scatterlist is not ill formed and the sync is necessary. We are getting soo close to being able to start fixing these API issues in dmabuf, I hope next cylce we can begin.. Fingers crossed. From a CPU architecture perspective you do not need to cache flush PCI MMIO BAR memory, and perhaps doing so be might be problematic on some arches (???). But you do need to flush normal cachable CPU memory if that is in the DMA buf. Jason
On Sat, Oct 12, 2024 at 2:42 AM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > On Fri, Oct 11, 2024 at 10:33:43AM -0700, Mina Almasry wrote: > > On Thu, Oct 10, 2024 at 6:34 PM Jakub Kicinski <kuba@kernel.org> wrote: > > > > > > On Thu, 10 Oct 2024 10:44:38 -0700 Mina Almasry wrote: > > > > > > I haven't thought the failure of PP_FLAG_DMA_SYNC_DEV > > > > > > for dmabuf may be wrong. > > > > > > I think device memory TCP is not related to this flag. > > > > > > So device memory TCP core API should not return failure when > > > > > > PP_FLAG_DMA_SYNC_DEV flag is set. > > > > > > How about removing this condition check code in device memory TCP core? > > > > > > > > > > I think we need to invert the check.. > > > > > Mina, WDYT? > > > > > > > > On a closer look, my feeling is similar to Taehee, > > > > PP_FLAG_DMA_SYNC_DEV should be orthogonal to memory providers. The > > > > memory providers allocate the memory and provide the dma-addr, but > > > > need not dma-sync the dma-addr, right? The driver can sync the > > > > dma-addr if it wants and the driver can delegate the syncing to the pp > > > > via PP_FLAG_DMA_SYNC_DEV if it wants. AFAICT I think the check should > > > > be removed, not inverted, but I could be missing something. > > > > > > I don't know much about dmabuf but it hinges on the question whether > > > doing DMA sync for device on a dmabuf address is : > > > - a good thing > > > - a noop > > > - a bad thing > > > > > > If it's a good thing or a noop - agreed. > > > > > > Similar question for the sync for CPU. > > > > > > I agree that intuitively it should be all fine. But the fact that dmabuf > > > has a bespoke API for accessing the memory by the CPU makes me worried > > > that there may be assumptions about these addresses not getting > > > randomly fed into the normal DMA API.. > > > > Sorry I'm also a bit unsure what is the right thing to do here. The > > code that we've been running in GVE does a dma-sync for cpu > > unconditionally on RX for dma-buf and non-dmabuf dma-addrs and we > > haven't been seeing issues. It never does dma-sync for device. > > > > My first question is why is dma-sync for device needed on RX path at > > all for some drivers in the first place? For incoming (non-dmabuf) > > data, the data is written by the device and read by the cpu, so sync > > for cpu is really what's needed. Is the sync for device for XDP? Or is > > it that buffers should be dma-syncd for device before they are > > re-posted to the NIC? > > > > Christian/Jason, sorry quick question: are > > dma_sync_single_for_{device|cpu} needed or wanted when the dma-addrs > > come from a dma-buf? Or these dma-addrs to be treated like any other > > with the normal dma_sync_for_{device|cpu} rules? > > Um, I think because dma-buf hacks things up and generates illegal > scatterlist entries with weird dma_map_resource() addresses for the > typical P2P case the dma sync API should not be used on those things. > > However, there is no way to know if the dma-buf has does this, and > there are valid case where the scatterlist is not ill formed and the > sync is necessary. > > We are getting soo close to being able to start fixing these API > issues in dmabuf, I hope next cylce we can begin.. Fingers crossed. > > From a CPU architecture perspective you do not need to cache flush PCI > MMIO BAR memory, and perhaps doing so be might be problematic on some > arches (???). But you do need to flush normal cachable CPU memory if > that is in the DMA buf. > Thanks Jason. 
In that case I agree with Jakub we should take in his change here: https://lore.kernel.org/netdev/20241009170102.1980ed1d@kernel.org/ With this change the driver would delegate dma_sync_for_device to the page_pool, and the page_pool will skip it altogether for the dma-buf memory provider. -- Thanks, Mina
On Tue, 15 Oct 2024 01:38:20 +0300 Mina Almasry wrote: > Thanks Jason. In that case I agree with Jakub we should take in his change here: > > https://lore.kernel.org/netdev/20241009170102.1980ed1d@kernel.org/ > > With this change the driver would delegate dma_sync_for_device to the > page_pool, and the page_pool will skip it altogether for the dma-buf > memory provider. And we need a wrapper for a sync for CPU which will skip if the page comes from an unreadable pool?
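A minimal sketch of what such a wrapper could look like — the name and placement are assumptions, not an existing API — skipping the sync entirely for unreadable net_iov memory and otherwise falling through to the normal DMA API:

/* Hypothetical helper (name assumed): sync a netmem buffer for CPU access
 * unless it is a net_iov, e.g. dmabuf-backed devmem that must not go
 * through the normal DMA sync API.
 */
static inline void
sketch_page_pool_dma_sync_netmem_for_cpu(const struct page_pool *pool,
					 netmem_ref netmem,
					 u32 offset, u32 dma_sync_size)
{
	if (netmem_is_net_iov(netmem))
		return;

	dma_sync_single_range_for_cpu(pool->p.dev,
				      page_pool_get_dma_addr_netmem(netmem),
				      offset + pool->p.offset, dma_sync_size,
				      page_pool_get_dma_dir(pool));
}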
On Tue, Oct 15, 2024 at 3:16 AM Jakub Kicinski <kuba@kernel.org> wrote: > > On Tue, 15 Oct 2024 01:38:20 +0300 Mina Almasry wrote: > > Thanks Jason. In that case I agree with Jakub we should take in his change here: > > > > https://lore.kernel.org/netdev/20241009170102.1980ed1d@kernel.org/ > > > > With this change the driver would delegate dma_sync_for_device to the > > page_pool, and the page_pool will skip it altogether for the dma-buf > > memory provider. > > And we need a wrapper for a sync for CPU which will skip if the page > comes from an unreadable pool? This is where it gets a bit tricky, no? Our production code does a dma_sync_for_cpu but no dma_sync_for_device. That has been working reliably for us with GPU dmabufs and udmabuf, but we haven't of course tested every dma-buf. I'm comfortable enforcing the 'no dma_sync_for_device' now since it brings upstream in line with our well tested setup. I'm not sure I'm 100% comfortable enforcing the 'no dma_sync_for_cpu' now since it's a deviation. The dma_sync_for_cpu is very very likely a no-op since we don't really access the data from cpu ever with devmem TCP, but who knows. Is it possible to give me a couple of weeks to make this change locally and run it through some testing to see if it breaks anything? But if you or Jason think that enforcing the 'no dma_buf_sync_for_cpu' now is critical, no problem. We can also provide this patch, and seek to revert it or fix it up properly later in the event it turns out it causes issues. Note that io_uring provider, or other non-dmabuf providers may need to do a dma-sync, but that bridge can be crossed in David's patchset.
On Tue, Oct 15, 2024 at 04:10:44AM +0300, Mina Almasry wrote: > On Tue, Oct 15, 2024 at 3:16 AM Jakub Kicinski <kuba@kernel.org> wrote: > > > > On Tue, 15 Oct 2024 01:38:20 +0300 Mina Almasry wrote: > > > Thanks Jason. In that case I agree with Jakub we should take in his change here: > > > > > > https://lore.kernel.org/netdev/20241009170102.1980ed1d@kernel.org/ > > > > > > With this change the driver would delegate dma_sync_for_device to the > > > page_pool, and the page_pool will skip it altogether for the dma-buf > > > memory provider. > > > > And we need a wrapper for a sync for CPU which will skip if the page > > comes from an unreadable pool? > > This is where it gets a bit tricky, no? > > Our production code does a dma_sync_for_cpu but no > dma_sync_for_device. That has been working reliably for us with GPU Those functions are all NOP on systems you are testing on. The question is what is correct to do on systems where it is not a NOP, and none of this is really right, as I explained.. > But if you or Jason think that enforcing the 'no dma_buf_sync_for_cpu' > now is critical, no problem. We can also provide this patch, and seek > to revert it or fix it up properly later in the event it turns out it > causes issues. What is important is you organize things going forward to be able to do this properly, which means the required sync type is dependent on the actual page being synced and you will eventually somehow learn which is required from the dmabuf. Most likely nobody will ever run this code on system where dma_sync is not a NOP, but we should still use the DMA API properly and things should make architectural sense. Jason
On 10/14/24 23:38, Mina Almasry wrote: > On Sat, Oct 12, 2024 at 2:42 AM Jason Gunthorpe <jgg@ziepe.ca> wrote: >> >> On Fri, Oct 11, 2024 at 10:33:43AM -0700, Mina Almasry wrote: >>> On Thu, Oct 10, 2024 at 6:34 PM Jakub Kicinski <kuba@kernel.org> wrote: >>>> >>>> On Thu, 10 Oct 2024 10:44:38 -0700 Mina Almasry wrote: >>>>>>> I haven't thought the failure of PP_FLAG_DMA_SYNC_DEV >>>>>>> for dmabuf may be wrong. >>>>>>> I think device memory TCP is not related to this flag. >>>>>>> So device memory TCP core API should not return failure when >>>>>>> PP_FLAG_DMA_SYNC_DEV flag is set. >>>>>>> How about removing this condition check code in device memory TCP core? >>>>>> >>>>>> I think we need to invert the check.. >>>>>> Mina, WDYT? >>>>> >>>>> On a closer look, my feeling is similar to Taehee, >>>>> PP_FLAG_DMA_SYNC_DEV should be orthogonal to memory providers. The >>>>> memory providers allocate the memory and provide the dma-addr, but >>>>> need not dma-sync the dma-addr, right? The driver can sync the >>>>> dma-addr if it wants and the driver can delegate the syncing to the pp >>>>> via PP_FLAG_DMA_SYNC_DEV if it wants. AFAICT I think the check should >>>>> be removed, not inverted, but I could be missing something. >>>> >>>> I don't know much about dmabuf but it hinges on the question whether >>>> doing DMA sync for device on a dmabuf address is : >>>> - a good thing >>>> - a noop >>>> - a bad thing >>>> >>>> If it's a good thing or a noop - agreed. >>>> >>>> Similar question for the sync for CPU. >>>> >>>> I agree that intuitively it should be all fine. But the fact that dmabuf >>>> has a bespoke API for accessing the memory by the CPU makes me worried >>>> that there may be assumptions about these addresses not getting >>>> randomly fed into the normal DMA API.. >>> >>> Sorry I'm also a bit unsure what is the right thing to do here. The >>> code that we've been running in GVE does a dma-sync for cpu >>> unconditionally on RX for dma-buf and non-dmabuf dma-addrs and we >>> haven't been seeing issues. It never does dma-sync for device. >>> >>> My first question is why is dma-sync for device needed on RX path at >>> all for some drivers in the first place? For incoming (non-dmabuf) >>> data, the data is written by the device and read by the cpu, so sync >>> for cpu is really what's needed. Is the sync for device for XDP? Or is >>> it that buffers should be dma-syncd for device before they are >>> re-posted to the NIC? >>> >>> Christian/Jason, sorry quick question: are >>> dma_sync_single_for_{device|cpu} needed or wanted when the dma-addrs >>> come from a dma-buf? Or these dma-addrs to be treated like any other >>> with the normal dma_sync_for_{device|cpu} rules? >> >> Um, I think because dma-buf hacks things up and generates illegal >> scatterlist entries with weird dma_map_resource() addresses for the >> typical P2P case the dma sync API should not be used on those things. >> >> However, there is no way to know if the dma-buf has does this, and >> there are valid case where the scatterlist is not ill formed and the >> sync is necessary. >> >> We are getting soo close to being able to start fixing these API >> issues in dmabuf, I hope next cylce we can begin.. Fingers crossed. >> >> From a CPU architecture perspective you do not need to cache flush PCI >> MMIO BAR memory, and perhaps doing so be might be problematic on some >> arches (???). But you do need to flush normal cachable CPU memory if >> that is in the DMA buf. >> > > Thanks Jason. 
In that case I agree with Jakub we should take in his change here: > > https://lore.kernel.org/netdev/20241009170102.1980ed1d@kernel.org/ > > With this change the driver would delegate dma_sync_for_device to the > page_pool, and the page_pool will skip it altogether for the dma-buf > memory provider. Requiring ->dma_map should be common to all providers as page pool shouldn't be dipping into net_iovs figuring out how to map them. However, looking at this discussion it seems that the ->dma_sync concern is devmem specific and should be discarded by pp providers using dmabufs, i.e. in devmem.c:mp_dmabuf_devmem_init().
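A sketch of that alternative, assuming the current shape of mp_dmabuf_devmem_init() in net/core/devmem.c (the binding check and refcount are as in the existing function; dropping ->dma_sync here instead of rejecting it is the proposed change):

int mp_dmabuf_devmem_init(struct page_pool *pool)
{
	struct net_devmem_dmabuf_binding *binding = pool->mp_priv;

	if (!binding)
		return -EINVAL;

	/* Mapping remains a hard requirement for any provider. */
	if (!pool->dma_map)
		return -EOPNOTSUPP;

	if (pool->p.order != 0)
		return -E2BIG;

	/* dmabuf addresses must not be fed through the normal DMA sync API,
	 * so drop the driver's PP_FLAG_DMA_SYNC_DEV request here rather than
	 * failing pool creation.
	 */
	pool->dma_sync = false;

	net_devmem_dmabuf_binding_get(binding);
	return 0;
}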
On 2024-10-15 07:29, Pavel Begunkov wrote: > On 10/14/24 23:38, Mina Almasry wrote: >> On Sat, Oct 12, 2024 at 2:42 AM Jason Gunthorpe <jgg@ziepe.ca> wrote: >>> >>> On Fri, Oct 11, 2024 at 10:33:43AM -0700, Mina Almasry wrote: >>>> On Thu, Oct 10, 2024 at 6:34 PM Jakub Kicinski <kuba@kernel.org> wrote: >>>>> >>>>> On Thu, 10 Oct 2024 10:44:38 -0700 Mina Almasry wrote: >>>>>>>> I haven't thought the failure of PP_FLAG_DMA_SYNC_DEV >>>>>>>> for dmabuf may be wrong. >>>>>>>> I think device memory TCP is not related to this flag. >>>>>>>> So device memory TCP core API should not return failure when >>>>>>>> PP_FLAG_DMA_SYNC_DEV flag is set. >>>>>>>> How about removing this condition check code in device memory TCP core? >>>>>>> >>>>>>> I think we need to invert the check.. >>>>>>> Mina, WDYT? >>>>>> >>>>>> On a closer look, my feeling is similar to Taehee, >>>>>> PP_FLAG_DMA_SYNC_DEV should be orthogonal to memory providers. The >>>>>> memory providers allocate the memory and provide the dma-addr, but >>>>>> need not dma-sync the dma-addr, right? The driver can sync the >>>>>> dma-addr if it wants and the driver can delegate the syncing to the pp >>>>>> via PP_FLAG_DMA_SYNC_DEV if it wants. AFAICT I think the check should >>>>>> be removed, not inverted, but I could be missing something. >>>>> >>>>> I don't know much about dmabuf but it hinges on the question whether >>>>> doing DMA sync for device on a dmabuf address is : >>>>> - a good thing >>>>> - a noop >>>>> - a bad thing >>>>> >>>>> If it's a good thing or a noop - agreed. >>>>> >>>>> Similar question for the sync for CPU. >>>>> >>>>> I agree that intuitively it should be all fine. But the fact that dmabuf >>>>> has a bespoke API for accessing the memory by the CPU makes me worried >>>>> that there may be assumptions about these addresses not getting >>>>> randomly fed into the normal DMA API.. >>>> >>>> Sorry I'm also a bit unsure what is the right thing to do here. The >>>> code that we've been running in GVE does a dma-sync for cpu >>>> unconditionally on RX for dma-buf and non-dmabuf dma-addrs and we >>>> haven't been seeing issues. It never does dma-sync for device. >>>> >>>> My first question is why is dma-sync for device needed on RX path at >>>> all for some drivers in the first place? For incoming (non-dmabuf) >>>> data, the data is written by the device and read by the cpu, so sync >>>> for cpu is really what's needed. Is the sync for device for XDP? Or is >>>> it that buffers should be dma-syncd for device before they are >>>> re-posted to the NIC? >>>> >>>> Christian/Jason, sorry quick question: are >>>> dma_sync_single_for_{device|cpu} needed or wanted when the dma-addrs >>>> come from a dma-buf? Or these dma-addrs to be treated like any other >>>> with the normal dma_sync_for_{device|cpu} rules? >>> >>> Um, I think because dma-buf hacks things up and generates illegal >>> scatterlist entries with weird dma_map_resource() addresses for the >>> typical P2P case the dma sync API should not be used on those things. >>> >>> However, there is no way to know if the dma-buf has does this, and >>> there are valid case where the scatterlist is not ill formed and the >>> sync is necessary. >>> >>> We are getting soo close to being able to start fixing these API >>> issues in dmabuf, I hope next cylce we can begin.. Fingers crossed. >>> >>> From a CPU architecture perspective you do not need to cache flush PCI >>> MMIO BAR memory, and perhaps doing so be might be problematic on some >>> arches (???). 
But you do need to flush normal cachable CPU memory if >>> that is in the DMA buf. >>> >> >> Thanks Jason. In that case I agree with Jakub we should take in his change here: >> >> https://lore.kernel.org/netdev/20241009170102.1980ed1d@kernel.org/ >> >> With this change the driver would delegate dma_sync_for_device to the >> page_pool, and the page_pool will skip it altogether for the dma-buf >> memory provider. > > Requiring ->dma_map should be common to all providers as page pool > shouldn't be dipping to net_iovs figuring out how to map them. However, > looking at this discussion seems that the ->dma_sync concern is devmem > specific and should be discarded by pp providers using dmabufs, i.e. in > devmem.c:mp_dmabuf_devmem_init(). Yes, that's my preference as well, see my earlier reply. >
On Tue, Oct 15, 2024 at 3:44 PM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > On Tue, Oct 15, 2024 at 04:10:44AM +0300, Mina Almasry wrote: > > On Tue, Oct 15, 2024 at 3:16 AM Jakub Kicinski <kuba@kernel.org> wrote: > > > > > > On Tue, 15 Oct 2024 01:38:20 +0300 Mina Almasry wrote: > > > > Thanks Jason. In that case I agree with Jakub we should take in his change here: > > > > > > > > https://lore.kernel.org/netdev/20241009170102.1980ed1d@kernel.org/ > > > > > > > > With this change the driver would delegate dma_sync_for_device to the > > > > page_pool, and the page_pool will skip it altogether for the dma-buf > > > > memory provider. > > > > > > And we need a wrapper for a sync for CPU which will skip if the page > > > comes from an unreadable pool? > > > > This is where it gets a bit tricky, no? > > > > Our production code does a dma_sync_for_cpu but no > > dma_sync_for_device. That has been working reliably for us with GPU > > Those functions are all NOP on systems you are testing on. > OK, thanks. This is what I wanted to confirm. If you already know this here then there is no need to wait for me to confirm. > The question is what is correct to do on systems where it is not a > NOP, and none of this is really right, as I explained.. > > > But if you or Jason think that enforcing the 'no dma_buf_sync_for_cpu' > > now is critical, no problem. We can also provide this patch, and seek > > to revert it or fix it up properly later in the event it turns out it > > causes issues. > > What is important is you organize things going forward to be able to > do this properly, which means the required sync type is dependent on > the actual page being synced and you will eventually somehow learn > which is required from the dmabuf. > > Most likely nobody will ever run this code on system where dma_sync is > not a NOP, but we should still use the DMA API properly and things > should make architectural sense. > Makes sense. OK, we can do what Jakub suggested in the thread earlier. I.e. likely some wrapper which skips the dma_sync_for_cpu if the netmem is unreadable. -- Thanks, Mina
On Fri, Oct 18, 2024 at 5:25 PM Mina Almasry <almasrymina@google.com> wrote: > > On Tue, Oct 15, 2024 at 3:44 PM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > > On Tue, Oct 15, 2024 at 04:10:44AM +0300, Mina Almasry wrote: > > > On Tue, Oct 15, 2024 at 3:16 AM Jakub Kicinski <kuba@kernel.org> wrote: > > > > > > > > On Tue, 15 Oct 2024 01:38:20 +0300 Mina Almasry wrote: > > > > > Thanks Jason. In that case I agree with Jakub we should take in his change here: > > > > > > > > > > https://lore.kernel.org/netdev/20241009170102.1980ed1d@kernel.org/ > > > > > > > > > > With this change the driver would delegate dma_sync_for_device to the > > > > > page_pool, and the page_pool will skip it altogether for the dma-buf > > > > > memory provider. > > > > > > > > And we need a wrapper for a sync for CPU which will skip if the page > > > > comes from an unreadable pool? > > > > > > This is where it gets a bit tricky, no? > > > > > > Our production code does a dma_sync_for_cpu but no > > > dma_sync_for_device. That has been working reliably for us with GPU > > > > Those functions are all NOP on systems you are testing on. > > > > OK, thanks. This is what I wanted to confirm. If you already know this > here then there is no need to wait for me to confirm. > > > The question is what is correct to do on systems where it is not a > > NOP, and none of this is really right, as I explained.. > > > > > But if you or Jason think that enforcing the 'no dma_buf_sync_for_cpu' > > > now is critical, no problem. We can also provide this patch, and seek > > > to revert it or fix it up properly later in the event it turns out it > > > causes issues. > > > > What is important is you organize things going forward to be able to > > do this properly, which means the required sync type is dependent on > > the actual page being synced and you will eventually somehow learn > > which is required from the dmabuf. > > > > Most likely nobody will ever run this code on system where dma_sync is > > not a NOP, but we should still use the DMA API properly and things > > should make architectural sense. > > > > Makes sense. OK, we can do what Jakub suggested in the thread earlier. > I.e. likely some wrapper which skips the dma_sync_for_cpu if the > netmem is unreadable. > Thanks a lot for confirmation about it. I will pass the PP_FLAG_ALLOW_UNREADABLE_NETMEM flag regardless of enabling/disabling devmem TCP in a v4 patch. The page_pool core logic will handle flags properly. I think patches for changes of page_pool are worked on by Mina, so I will not include changes for page_pool in a v4 patch. If you think I missed something, please let me know :) Thanks a lot! Taehee Yoo
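A rough sketch of what that v4 shape could look like in bnxt_alloc_rx_page_pool() — this is an assumption about the plan above, not the actual v4 code, and error handling plus several pool parameters are abbreviated:

static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
				   struct bnxt_rx_ring_info *rxr,
				   int queue_idx, int numa_node)
{
	struct page_pool_params pp = { 0 };

	pp.pool_size = bp->rx_agg_ring_size;
	pp.nid = numa_node;
	pp.dev = &bp->pdev->dev;
	pp.dma_dir = bp->rx_dir;
	pp.max_len = PAGE_SIZE;
	/* ... other parameters as in the posted patch ... */

	/* Always advertise both flags; the page_pool / memory-provider core
	 * decides whether DMA syncing and unreadable netmem apply to the
	 * memory actually bound to this queue.
	 */
	pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV |
		   PP_FLAG_ALLOW_UNREADABLE_NETMEM;
	pp.queue_idx = queue_idx;

	rxr->page_pool = page_pool_create(&pp);
	return PTR_ERR_OR_ZERO(rxr->page_pool);
}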
diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig index 75ca3ddda1f5..f37ff12d4746 100644 --- a/drivers/net/ethernet/broadcom/Kconfig +++ b/drivers/net/ethernet/broadcom/Kconfig @@ -211,6 +211,7 @@ config BNXT select FW_LOADER select LIBCRC32C select NET_DEVLINK + select NET_DEVMEM select PAGE_POOL select DIMLIB select AUXILIARY_BUS diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 872b15842b11..64e07d247f97 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -55,6 +55,7 @@ #include <net/page_pool/helpers.h> #include <linux/align.h> #include <net/netdev_queues.h> +#include <net/netdev_rx_queue.h> #include "bnxt_hsi.h" #include "bnxt.h" @@ -863,6 +864,22 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int budget) bnapi->events &= ~BNXT_TX_CMP_EVENT; } +static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping, + struct bnxt_rx_ring_info *rxr, + unsigned int *offset, + gfp_t gfp) +{ + netmem_ref netmem; + + netmem = page_pool_alloc_netmem(rxr->page_pool, GFP_ATOMIC); + if (!netmem) + return 0; + *offset = 0; + + *mapping = page_pool_get_dma_addr_netmem(netmem) + *offset; + return netmem; +} + static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping, struct bnxt_rx_ring_info *rxr, unsigned int *offset, @@ -972,21 +989,21 @@ static inline u16 bnxt_find_next_agg_idx(struct bnxt_rx_ring_info *rxr, u16 idx) return next; } -static inline int bnxt_alloc_rx_page(struct bnxt *bp, - struct bnxt_rx_ring_info *rxr, - u16 prod, gfp_t gfp) +static inline int bnxt_alloc_rx_netmem(struct bnxt *bp, + struct bnxt_rx_ring_info *rxr, + u16 prod, gfp_t gfp) { struct rx_bd *rxbd = &rxr->rx_agg_desc_ring[RX_AGG_RING(bp, prod)][RX_IDX(prod)]; struct bnxt_sw_rx_agg_bd *rx_agg_buf; - struct page *page; + netmem_ref netmem; dma_addr_t mapping; u16 sw_prod = rxr->rx_sw_agg_prod; unsigned int offset = 0; - page = __bnxt_alloc_rx_page(bp, &mapping, rxr, &offset, gfp); + netmem = __bnxt_alloc_rx_netmem(bp, &mapping, rxr, &offset, gfp); - if (!page) + if (!netmem) return -ENOMEM; if (unlikely(test_bit(sw_prod, rxr->rx_agg_bmap))) @@ -996,7 +1013,7 @@ static inline int bnxt_alloc_rx_page(struct bnxt *bp, rx_agg_buf = &rxr->rx_agg_ring[sw_prod]; rxr->rx_sw_agg_prod = RING_RX_AGG(bp, NEXT_RX_AGG(sw_prod)); - rx_agg_buf->page = page; + rx_agg_buf->netmem = netmem; rx_agg_buf->offset = offset; rx_agg_buf->mapping = mapping; rxbd->rx_bd_haddr = cpu_to_le64(mapping); @@ -1044,7 +1061,7 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_cp_ring_info *cpr, u16 idx, struct rx_agg_cmp *agg; struct bnxt_sw_rx_agg_bd *cons_rx_buf, *prod_rx_buf; struct rx_bd *prod_bd; - struct page *page; + netmem_ref netmem; if (p5_tpa) agg = bnxt_get_tpa_agg_p5(bp, rxr, idx, start + i); @@ -1061,11 +1078,11 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_cp_ring_info *cpr, u16 idx, cons_rx_buf = &rxr->rx_agg_ring[cons]; /* It is possible for sw_prod to be equal to cons, so - * set cons_rx_buf->page to NULL first. + * set cons_rx_buf->netmem to 0 first. 
*/ - page = cons_rx_buf->page; - cons_rx_buf->page = NULL; - prod_rx_buf->page = page; + netmem = cons_rx_buf->netmem; + cons_rx_buf->netmem = 0; + prod_rx_buf->netmem = netmem; prod_rx_buf->offset = cons_rx_buf->offset; prod_rx_buf->mapping = cons_rx_buf->mapping; @@ -1192,6 +1209,7 @@ static struct sk_buff *bnxt_rx_skb(struct bnxt *bp, static u32 __bnxt_rx_agg_pages(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, + struct sk_buff *skb, struct skb_shared_info *shinfo, u16 idx, u32 agg_bufs, bool tpa, struct xdp_buff *xdp) @@ -1211,7 +1229,7 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, u16 cons, frag_len; struct rx_agg_cmp *agg; struct bnxt_sw_rx_agg_bd *cons_rx_buf; - struct page *page; + netmem_ref netmem; dma_addr_t mapping; if (p5_tpa) @@ -1223,9 +1241,15 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, RX_AGG_CMP_LEN) >> RX_AGG_CMP_LEN_SHIFT; cons_rx_buf = &rxr->rx_agg_ring[cons]; - skb_frag_fill_page_desc(frag, cons_rx_buf->page, - cons_rx_buf->offset, frag_len); - shinfo->nr_frags = i + 1; + if (skb) { + skb_add_rx_frag_netmem(skb, i, cons_rx_buf->netmem, + cons_rx_buf->offset, frag_len, + BNXT_RX_PAGE_SIZE); + } else { + skb_frag_fill_page_desc(frag, netmem_to_page(cons_rx_buf->netmem), + cons_rx_buf->offset, frag_len); + shinfo->nr_frags = i + 1; + } __clear_bit(cons, rxr->rx_agg_bmap); /* It is possible for bnxt_alloc_rx_page() to allocate @@ -1233,15 +1257,15 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, * need to clear the cons entry now. */ mapping = cons_rx_buf->mapping; - page = cons_rx_buf->page; - cons_rx_buf->page = NULL; + netmem = cons_rx_buf->netmem; + cons_rx_buf->netmem = 0; - if (xdp && page_is_pfmemalloc(page)) + if (xdp && page_is_pfmemalloc(netmem_to_page(netmem))) xdp_buff_set_frag_pfmemalloc(xdp); - if (bnxt_alloc_rx_page(bp, rxr, prod, GFP_ATOMIC) != 0) { + if (bnxt_alloc_rx_netmem(bp, rxr, prod, GFP_ATOMIC) != 0) { --shinfo->nr_frags; - cons_rx_buf->page = page; + cons_rx_buf->netmem = netmem; /* Update prod since possibly some pages have been * allocated already. 
@@ -1269,7 +1293,7 @@ static struct sk_buff *bnxt_rx_agg_pages_skb(struct bnxt *bp, struct skb_shared_info *shinfo = skb_shinfo(skb); u32 total_frag_len = 0; - total_frag_len = __bnxt_rx_agg_pages(bp, cpr, shinfo, idx, + total_frag_len = __bnxt_rx_agg_pages(bp, cpr, skb, shinfo, idx, agg_bufs, tpa, NULL); if (!total_frag_len) { skb_mark_for_recycle(skb); @@ -1277,9 +1301,6 @@ static struct sk_buff *bnxt_rx_agg_pages_skb(struct bnxt *bp, return NULL; } - skb->data_len += total_frag_len; - skb->len += total_frag_len; - skb->truesize += BNXT_RX_PAGE_SIZE * agg_bufs; return skb; } @@ -1294,7 +1315,7 @@ static u32 bnxt_rx_agg_pages_xdp(struct bnxt *bp, if (!xdp_buff_has_frags(xdp)) shinfo->nr_frags = 0; - total_frag_len = __bnxt_rx_agg_pages(bp, cpr, shinfo, + total_frag_len = __bnxt_rx_agg_pages(bp, cpr, NULL, shinfo, idx, agg_bufs, tpa, xdp); if (total_frag_len) { xdp_buff_set_frags_flag(xdp); @@ -3342,15 +3363,15 @@ static void bnxt_free_one_rx_agg_ring(struct bnxt *bp, struct bnxt_rx_ring_info for (i = 0; i < max_idx; i++) { struct bnxt_sw_rx_agg_bd *rx_agg_buf = &rxr->rx_agg_ring[i]; - struct page *page = rx_agg_buf->page; + netmem_ref netmem = rx_agg_buf->netmem; - if (!page) + if (!netmem) continue; - rx_agg_buf->page = NULL; + rx_agg_buf->netmem = 0; __clear_bit(i, rxr->rx_agg_bmap); - page_pool_recycle_direct(rxr->page_pool, page); + page_pool_put_full_netmem(rxr->page_pool, netmem, true); } } @@ -3608,9 +3629,11 @@ static void bnxt_free_rx_rings(struct bnxt *bp) static int bnxt_alloc_rx_page_pool(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, + int queue_idx, int numa_node) { struct page_pool_params pp = { 0 }; + struct netdev_rx_queue *rxq; pp.pool_size = bp->rx_agg_ring_size; if (BNXT_RX_PAGE_MODE(bp)) @@ -3621,8 +3644,15 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, pp.dev = &bp->pdev->dev; pp.dma_dir = bp->rx_dir; pp.max_len = PAGE_SIZE; - pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; + pp.order = 0; + + rxq = __netif_get_rx_queue(bp->dev, queue_idx); + if (rxq->mp_params.mp_priv) + pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_ALLOW_UNREADABLE_NETMEM; + else + pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; + pp.queue_idx = queue_idx; rxr->page_pool = page_pool_create(&pp); if (IS_ERR(rxr->page_pool)) { int err = PTR_ERR(rxr->page_pool); @@ -3655,7 +3685,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp) cpu_node = cpu_to_node(cpu); netdev_dbg(bp->dev, "Allocating page pool for rx_ring[%d] on numa_node: %d\n", i, cpu_node); - rc = bnxt_alloc_rx_page_pool(bp, rxr, cpu_node); + rc = bnxt_alloc_rx_page_pool(bp, rxr, i, cpu_node); if (rc) return rc; @@ -4154,7 +4184,7 @@ static void bnxt_alloc_one_rx_ring_page(struct bnxt *bp, prod = rxr->rx_agg_prod; for (i = 0; i < bp->rx_agg_ring_size; i++) { - if (bnxt_alloc_rx_page(bp, rxr, prod, GFP_KERNEL)) { + if (bnxt_alloc_rx_netmem(bp, rxr, prod, GFP_KERNEL)) { netdev_warn(bp->dev, "init'ed rx ring %d with %d/%d pages only\n", ring_nr, i, bp->rx_ring_size); break; @@ -15063,7 +15093,7 @@ static int bnxt_queue_mem_alloc(struct net_device *dev, void *qmem, int idx) clone->rx_sw_agg_prod = 0; clone->rx_next_cons = 0; - rc = bnxt_alloc_rx_page_pool(bp, clone, rxr->page_pool->p.nid); + rc = bnxt_alloc_rx_page_pool(bp, clone, idx, rxr->page_pool->p.nid); if (rc) return rc; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 48f390519c35..3cf57a3c7664 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -895,7 +895,7 @@ struct 
bnxt_sw_rx_bd { }; struct bnxt_sw_rx_agg_bd { - struct page *page; + netmem_ref netmem; unsigned int offset; dma_addr_t mapping; };