
[RFC,net-next,11/11] net: enetc: add TX support for zero-copy XDP sockets

Message ID 20230206100837.451300-12-vladimir.oltean@nxp.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Series NXP ENETC AF_XDP zero-copy sockets

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 7 this patch: 7
netdev/cc_maintainers success CCed 11 of 11 maintainers
netdev/build_clang fail Errors and warnings before: 4 this patch: 4
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 7 this patch: 7
netdev/checkpatch warning WARNING: Duplicate signature
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Vladimir Oltean Feb. 6, 2023, 10:08 a.m. UTC
Schedule NAPI by hand from enetc_xsk_wakeup(), and send frames from the
XSK TX queue from NAPI context. Add them to the completion queue from
the enetc_clean_tx_ring() procedure which is common for all kinds of
traffic.

We reuse one of the TX rings for XDP (XDP_TX/XDP_REDIRECT) for XSK as
well. They are already cropped from the TX rings that the network stack
can use when XDP is enabled (with or without AF_XDP).

As for XDP_REDIRECT calling enetc's ndo_xdp_xmit, I'm not sure if that
can run simultaneously with enetc_poll() (on different CPUs, but towards
the same TXQ). I guess it probably can, but idk what to do about it.
The problem is that enetc_xdp_xmit() sends to
priv->xdp_tx_ring[smp_processor_id()], while enetc_xsk_xmit() and XDP_TX
send to priv->xdp_tx_ring[NAPI instance]. So when the NAPI instance runs
on a different CPU than the one it is numerically equal to, we should
have a lock that serializes XDP_REDIRECT with the others.
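
For illustration only, here is a minimal sketch of what that serialization
could look like, assuming a hypothetical per-ring spinlock (the xdp_tx_lock
member and the wrapper below are not part of this patch):

	/* Hypothetical: spinlock_t xdp_tx_lock would be added to struct
	 * enetc_bdr and initialized at ring setup time.
	 */
	static bool enetc_xdp_tx_locked(struct enetc_bdr *tx_ring,
					struct enetc_tx_swbd *xdp_tx_arr,
					int num_tx_swbd)
	{
		bool ok;

		/* Serialize ndo_xdp_xmit (which may run on any CPU) against
		 * the NAPI-context XDP_TX/XSK producers of the same ring.
		 */
		spin_lock_bh(&tx_ring->xdp_tx_lock);
		ok = enetc_xdp_tx(tx_ring, xdp_tx_arr, num_tx_swbd);
		if (ok)
			enetc_update_tx_ring_tail(tx_ring);
		spin_unlock_bh(&tx_ring->xdp_tx_lock);

		return ok;
	}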

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
 drivers/net/ethernet/freescale/enetc/enetc.c | 102 ++++++++++++++++++-
 drivers/net/ethernet/freescale/enetc/enetc.h |   2 +
 2 files changed, 99 insertions(+), 5 deletions(-)

Comments

Vladimir Oltean Feb. 6, 2023, 10:19 a.m. UTC | #1
Yikes, ran git format-patch one patch too soon. There's also a hidden,
bonus patch "12/11" below. Doesn't affect first 11 patches in any way,
though. Here it is.

From f7f10232622309d66a7a1ae1932d0c081936d546 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 25 Oct 2022 02:32:24 +0300
Subject: [RFC PATCH net-next 12/12] net: enetc: add support for XDP_TX
 with zero-copy XDP sockets

Add support for the case when the BPF program attached to a ring with an
XSK pool returns the XDP_TX verdict. The frame needs to go back on the
interface it came from.

No dma_map or dma_sync_for_device is necessary, just a small impedance
matching logic with the XDP_TX procedure we have in place for non-XSK,
since the data structures are different (xdp_buff vs xdp_frame; cannot
have multi-buffer with XSK).

In the TX confirmation routine, just release the RX buffer (as opposed
to non-XSK XDP_TX). Recycling might be possible, but I haven't
experimented with it.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
 drivers/net/ethernet/freescale/enetc/enetc.c | 35 ++++++++++++++++++--
 drivers/net/ethernet/freescale/enetc/enetc.h |  1 +
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c
index bc0db788afc7..06aaf0334dc3 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc.c
@@ -855,7 +855,9 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget,
 				tx_win_drop++;
 		}
 
-		if (tx_swbd->is_xsk)
+		if (tx_swbd->is_xsk && tx_swbd->is_xdp_tx)
+			xsk_buff_free(tx_swbd->xsk_buff);
+		else if (tx_swbd->is_xsk)
 			(*xsk_confirmed)++;
 		else if (tx_swbd->is_xdp_tx)
 			enetc_recycle_xdp_tx_buff(tx_ring, tx_swbd);
@@ -1661,6 +1663,21 @@ static int enetc_rx_swbd_to_xdp_tx_swbd(struct enetc_tx_swbd *xdp_tx_arr,
 	return n;
 }
 
+static bool enetc_xsk_xdp_tx(struct enetc_bdr *tx_ring,
+			     struct xdp_buff *xsk_buff)
+{
+	struct enetc_tx_swbd tx_swbd = {
+		.dma = xsk_buff_xdp_get_dma(xsk_buff),
+		.len = xdp_get_buff_len(xsk_buff),
+		.is_xdp_tx = true,
+		.is_xsk = true,
+		.is_eof = true,
+		.xsk_buff = xsk_buff,
+	};
+
+	return enetc_xdp_tx(tx_ring, &tx_swbd, 1);
+}
+
 static void enetc_xdp_drop(struct enetc_bdr *rx_ring, int rx_ring_first,
 			   int rx_ring_last)
 {
@@ -1839,10 +1856,12 @@ static int enetc_clean_rx_ring_xsk(struct enetc_bdr *rx_ring,
 				   struct bpf_prog *prog,
 				   struct xsk_buff_pool *pool)
 {
+	struct enetc_ndev_priv *priv = netdev_priv(rx_ring->ndev);
+	struct enetc_bdr *tx_ring = priv->xdp_tx_ring[rx_ring->index];
+	int xdp_redirect_frm_cnt = 0, xdp_tx_frm_cnt = 0;
 	struct net_device *ndev = rx_ring->ndev;
 	union enetc_rx_bd *rxbd, *orig_rxbd;
 	int rx_frm_cnt = 0, rx_byte_cnt = 0;
-	int xdp_redirect_frm_cnt = 0;
 	struct xdp_buff *xsk_buff;
 	int buffs_missing, err, i;
 	bool wakeup_xsk = false;
@@ -1895,6 +1914,15 @@ static int enetc_clean_rx_ring_xsk(struct enetc_bdr *rx_ring,
 			enetc_xsk_buff_to_skb(xsk_buff, rx_ring, orig_rxbd,
 					      napi);
 			break;
+		case XDP_TX:
+			if (enetc_xsk_xdp_tx(tx_ring, xsk_buff)) {
+				xdp_tx_frm_cnt++;
+				tx_ring->stats.xdp_tx++;
+			} else {
+				xsk_buff_free(xsk_buff);
+				tx_ring->stats.xdp_tx_drops++;
+			}
+			break;
 		case XDP_REDIRECT:
 			err = xdp_do_redirect(ndev, xsk_buff, prog);
 			if (unlikely(err)) {
@@ -1919,6 +1947,9 @@ static int enetc_clean_rx_ring_xsk(struct enetc_bdr *rx_ring,
 	if (xdp_redirect_frm_cnt)
 		xdp_do_flush_map();
 
+	if (xdp_tx_frm_cnt)
+		enetc_update_tx_ring_tail(tx_ring);
+
 	if (xsk_uses_need_wakeup(pool)) {
 		if (wakeup_xsk)
 			xsk_set_rx_need_wakeup(pool);
diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h
index 403f40473b52..58df6c32cfb3 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc.h
+++ b/drivers/net/ethernet/freescale/enetc/enetc.h
@@ -24,6 +24,7 @@ struct enetc_tx_swbd {
 	union {
 		struct sk_buff *skb;
 		struct xdp_frame *xdp_frame;
+		struct xdp_buff *xsk_buff;
 	};
 	dma_addr_t dma;
 	struct page *page;	/* valid only if is_xdp_tx */
Maciej Fijalkowski Feb. 8, 2023, 4:37 p.m. UTC | #2
On Mon, Feb 06, 2023 at 12:08:37PM +0200, Vladimir Oltean wrote:

Hey Vladimir,

> Schedule NAPI by hand from enetc_xsk_wakeup(), and send frames from the
> XSK TX queue from NAPI context. Add them to the completion queue from
> the enetc_clean_tx_ring() procedure which is common for all kinds of
> traffic.
> 
> We reuse one of the TX rings for XDP (XDP_TX/XDP_REDIRECT) for XSK as
> well. They are already cropped from the TX rings that the network stack
> can use when XDP is enabled (with or without AF_XDP).
> 
> As for XDP_REDIRECT calling enetc's ndo_xdp_xmit, I'm not sure if that
> can run simultaneously with enetc_poll() (on different CPUs, but towards
> the same TXQ). I guess it probably can, but idk what to do about it.
> The problem is that enetc_xdp_xmit() sends to
> priv->xdp_tx_ring[smp_processor_id()], while enetc_xsk_xmit() and XDP_TX
> send to priv->xdp_tx_ring[NAPI instance]. So when the NAPI instance runs

Why not use cpu id on the latter then?

> on a different CPU than the one it is numerically equal to, we should
> have a lock that serializes XDP_REDIRECT with the others.
> 
> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
> ---
>  drivers/net/ethernet/freescale/enetc/enetc.c | 102 ++++++++++++++++++-
>  drivers/net/ethernet/freescale/enetc/enetc.h |   2 +
>  2 files changed, 99 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c
> index 3990c006c011..bc0db788afc7 100644
> --- a/drivers/net/ethernet/freescale/enetc/enetc.c
> +++ b/drivers/net/ethernet/freescale/enetc/enetc.c
> @@ -84,7 +84,7 @@ static void enetc_free_tx_swbd(struct enetc_bdr *tx_ring,
>  	struct xdp_frame *xdp_frame = enetc_tx_swbd_get_xdp_frame(tx_swbd);
>  	struct sk_buff *skb = enetc_tx_swbd_get_skb(tx_swbd);
>  
> -	if (tx_swbd->dma)
> +	if (!tx_swbd->is_xsk && tx_swbd->dma)
>  		enetc_unmap_tx_buff(tx_ring, tx_swbd);
>  
>  	if (xdp_frame) {
> @@ -817,7 +817,8 @@ static void enetc_recycle_xdp_tx_buff(struct enetc_bdr *tx_ring,
>  	rx_ring->xdp.xdp_tx_in_flight--;
>  }
>  
> -static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget)
> +static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget,
> +				int *xsk_confirmed)
>  {
>  	int tx_frm_cnt = 0, tx_byte_cnt = 0, tx_win_drop = 0;
>  	struct net_device *ndev = tx_ring->ndev;
> @@ -854,7 +855,9 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget)
>  				tx_win_drop++;
>  		}
>  
> -		if (tx_swbd->is_xdp_tx)
> +		if (tx_swbd->is_xsk)
> +			(*xsk_confirmed)++;
> +		else if (tx_swbd->is_xdp_tx)
>  			enetc_recycle_xdp_tx_buff(tx_ring, tx_swbd);
>  		else if (likely(tx_swbd->dma))
>  			enetc_unmap_tx_buff(tx_ring, tx_swbd);
> @@ -1465,6 +1468,58 @@ int enetc_xdp_xmit(struct net_device *ndev, int num_frames,
>  }
>  EXPORT_SYMBOL_GPL(enetc_xdp_xmit);
>  
> +static void enetc_xsk_map_tx_desc(struct enetc_tx_swbd *tx_swbd,
> +				  const struct xdp_desc *xsk_desc,
> +				  struct xsk_buff_pool *pool)
> +{
> +	dma_addr_t dma;
> +
> +	dma = xsk_buff_raw_get_dma(pool, xsk_desc->addr);
> +	xsk_buff_raw_dma_sync_for_device(pool, dma, xsk_desc->len);
> +
> +	tx_swbd->dma = dma;
> +	tx_swbd->len = xsk_desc->len;
> +	tx_swbd->is_xsk = true;
> +	tx_swbd->is_eof = true;
> +}
> +
> +static bool enetc_xsk_xmit(struct net_device *ndev, struct xsk_buff_pool *pool,
> +			   u32 queue_id)
> +{
> +	struct enetc_ndev_priv *priv = netdev_priv(ndev);
> +	struct xdp_desc *xsk_descs = pool->tx_descs;
> +	struct enetc_tx_swbd tx_swbd = {0};
> +	struct enetc_bdr *tx_ring;
> +	u32 budget, batch;
> +	int i, k;
> +
> +	tx_ring = priv->xdp_tx_ring[queue_id];
> +
> +	/* Shouldn't race with anyone because we are running in the softirq
> +	 * of the only CPU that sends packets to this TX ring
> +	 */
> +	budget = min(enetc_bd_unused(tx_ring) - 1, ENETC_XSK_TX_BATCH);
> +
> +	batch = xsk_tx_peek_release_desc_batch(pool, budget);
> +	if (!batch)
> +		return true;
> +
> +	i = tx_ring->next_to_use;
> +
> +	for (k = 0; k < batch; k++) {
> +		enetc_xsk_map_tx_desc(&tx_swbd, &xsk_descs[k], pool);
> +		enetc_xdp_map_tx_buff(tx_ring, i, &tx_swbd, tx_swbd.len);
> +		enetc_bdr_idx_inc(tx_ring, &i);
> +	}
> +
> +	tx_ring->next_to_use = i;
> +
> +	xsk_tx_release(pool);

xsk_tx_release() is not needed if you're using
xsk_tx_peek_release_desc_batch() above, it will do this for you at the end
of its job.
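
For reference, a minimal sketch of how the tail of enetc_xsk_xmit() would
look with that call dropped (everything else unchanged from the patch):

	tx_ring->next_to_use = i;

	/* xsk_tx_peek_release_desc_batch() has already released the consumed
	 * TX ring entries and signalled the socket, so no separate
	 * xsk_tx_release() is needed here.
	 */
	enetc_update_tx_ring_tail(tx_ring);

	return budget != batch;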

> +	enetc_update_tx_ring_tail(tx_ring);
> +
> +	return budget != batch;
> +}
> +
>  static void enetc_map_rx_buff_to_xdp(struct enetc_bdr *rx_ring, int i,
>  				     struct xdp_buff *xdp_buff, u16 size)
>  {
> @@ -1881,6 +1936,7 @@ static int enetc_poll(struct napi_struct *napi, int budget)
>  	struct enetc_bdr *rx_ring = &v->rx_ring;
>  	struct xsk_buff_pool *pool;
>  	struct bpf_prog *prog;
> +	int xsk_confirmed = 0;
>  	bool complete = true;
>  	int work_done;
>  	int i;
> @@ -1888,7 +1944,8 @@ static int enetc_poll(struct napi_struct *napi, int budget)
>  	enetc_lock_mdio();
>  
>  	for (i = 0; i < v->count_tx_rings; i++)
> -		if (!enetc_clean_tx_ring(&v->tx_ring[i], budget))
> +		if (!enetc_clean_tx_ring(&v->tx_ring[i], budget,
> +					 &xsk_confirmed))
>  			complete = false;
>  
>  	prog = rx_ring->xdp.prog;
> @@ -1901,6 +1958,17 @@ static int enetc_poll(struct napi_struct *napi, int budget)
>  	else
>  		work_done = enetc_clean_rx_ring(rx_ring, napi, budget);
>  
> +	if (pool) {
> +		if (xsk_confirmed)
> +			xsk_tx_completed(pool, xsk_confirmed);
> +
> +		if (xsk_uses_need_wakeup(pool))
> +			xsk_set_tx_need_wakeup(pool);
> +
> +		if (!enetc_xsk_xmit(rx_ring->ndev, pool, rx_ring->index))
> +			complete = false;
> +	}
> +
>  	if (work_done == budget)
>  		complete = false;
>  	if (work_done)
> @@ -3122,7 +3190,31 @@ static int enetc_setup_xsk_pool(struct net_device *ndev,
>  
>  int enetc_xsk_wakeup(struct net_device *ndev, u32 queue_id, u32 flags)
>  {
> -	/* xp_assign_dev() wants this; nothing needed for RX */
> +	struct enetc_ndev_priv *priv = netdev_priv(ndev);
> +	struct enetc_int_vector *v;
> +	struct enetc_bdr *rx_ring;
> +
> +	if (!netif_running(ndev) || !netif_carrier_ok(ndev))
> +		return -ENETDOWN;
> +
> +	if (queue_id >= priv->bdr_int_num)
> +		return -ERANGE;
> +
> +	v = priv->int_vector[queue_id];
> +	rx_ring = &v->rx_ring;
> +
> +	if (!rx_ring->xdp.xsk_pool || !rx_ring->xdp.prog)
> +		return -EINVAL;
> +
> +	/* No way to kick TX by triggering a hardirq right away =>
> +	 * raise softirq. This might schedule NAPI on a CPU different than the
> +	 * smp_affinity of its IRQ would suggest, but that would happen anyway
> +	 * if, say, we change that affinity under heavy traffic.
> +	 * So enetc_poll() has to be prepared for it anyway.
> +	 */
> +	if (!napi_if_scheduled_mark_missed(&v->napi))
> +		napi_schedule(&v->napi);
> +
>  	return 0;
>  }
>  
> diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h
> index e1a746e37c9a..403f40473b52 100644
> --- a/drivers/net/ethernet/freescale/enetc/enetc.h
> +++ b/drivers/net/ethernet/freescale/enetc/enetc.h
> @@ -36,6 +36,7 @@ struct enetc_tx_swbd {
>  	u8 is_eof:1;
>  	u8 is_xdp_tx:1;
>  	u8 is_xdp_redirect:1;
> +	u8 is_xsk:1;
>  	u8 qbv_en:1;
>  };
>  
> @@ -86,6 +87,7 @@ struct enetc_xdp_data {
>  #define ENETC_RX_RING_DEFAULT_SIZE	2048
>  #define ENETC_TX_RING_DEFAULT_SIZE	2048
>  #define ENETC_DEFAULT_TX_WORK		(ENETC_TX_RING_DEFAULT_SIZE / 2)
> +#define ENETC_XSK_TX_BATCH		ENETC_DEFAULT_TX_WORK
>  
>  struct enetc_bdr_resource {
>  	/* Input arguments saved for teardown */
> -- 
> 2.34.1
>
Maciej Fijalkowski Feb. 8, 2023, 4:38 p.m. UTC | #3
On Mon, Feb 06, 2023 at 12:19:21PM +0200, Vladimir Oltean wrote:
> Yikes, ran git format-patch one patch too soon. There's also a hidden,
> bonus patch "12/11" below. Doesn't affect first 11 patches in any way,
> though. Here it is.
> 
> From f7f10232622309d66a7a1ae1932d0c081936d546 Mon Sep 17 00:00:00 2001
> From: Vladimir Oltean <vladimir.oltean@nxp.com>
> Date: Tue, 25 Oct 2022 02:32:24 +0300
> Subject: [RFC PATCH net-next 12/12] net: enetc: add support for XDP_TX
>  with zero-copy XDP sockets
> 
> Add support for the case when the BPF program attached to a ring with an
> XSK pool returns the XDP_TX verdict. The frame needs to go back on the
> interface it came from.
> 
> No dma_map or dma_sync_for_device is necessary, just a small impedance
> matching logic with the XDP_TX procedure we have in place for non-XSK,
> since the data structures are different (xdp_buff vs xdp_frame; cannot
> have multi-buffer with XSK).
> 
> In the TX confirmation routine, just release the RX buffer (as opposed
> to non-XSK XDP_TX). Recycling might be possible, but I haven't
> experimented with it.

ugh premature response on 10/11.

> 
> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
> ---
>  drivers/net/ethernet/freescale/enetc/enetc.c | 35 ++++++++++++++++++--
>  drivers/net/ethernet/freescale/enetc/enetc.h |  1 +
>  2 files changed, 34 insertions(+), 2 deletions(-)
Vladimir Oltean Feb. 8, 2023, 5:08 p.m. UTC | #4
Hi Maciej,

On Wed, Feb 08, 2023 at 05:37:35PM +0100, Maciej Fijalkowski wrote:
> On Mon, Feb 06, 2023 at 12:08:37PM +0200, Vladimir Oltean wrote:
> 
> Hey Vladimir,
> 
> > Schedule NAPI by hand from enetc_xsk_wakeup(), and send frames from the
> > XSK TX queue from NAPI context. Add them to the completion queue from
> > the enetc_clean_tx_ring() procedure which is common for all kinds of
> > traffic.
> > 
> > We reuse one of the TX rings for XDP (XDP_TX/XDP_REDIRECT) for XSK as
> > well. They are already cropped from the TX rings that the network stack
> > can use when XDP is enabled (with or without AF_XDP).
> > 
> > As for XDP_REDIRECT calling enetc's ndo_xdp_xmit, I'm not sure if that
> > can run simultaneously with enetc_poll() (on different CPUs, but towards
> > the same TXQ). I guess it probably can, but idk what to do about it.
> > The problem is that enetc_xdp_xmit() sends to
> > priv->xdp_tx_ring[smp_processor_id()], while enetc_xsk_xmit() and XDP_TX
> > send to priv->xdp_tx_ring[NAPI instance]. So when the NAPI instance runs
> 
> Why not use cpu id on the latter then?

Hmm, because I want the sendto() syscall to trigger wakeup of the NAPI
that sends traffic to the proper queue_id, rather than to the queue_id
affine to the CPU from which the sendto() syscall was made?
Maciej Fijalkowski Feb. 8, 2023, 5:17 p.m. UTC | #5
On Wed, Feb 08, 2023 at 07:08:15PM +0200, Vladimir Oltean wrote:
> Hi Maciej,
> 
> On Wed, Feb 08, 2023 at 05:37:35PM +0100, Maciej Fijalkowski wrote:
> > On Mon, Feb 06, 2023 at 12:08:37PM +0200, Vladimir Oltean wrote:
> > 
> > Hey Vladimir,
> > 
> > > Schedule NAPI by hand from enetc_xsk_wakeup(), and send frames from the
> > > XSK TX queue from NAPI context. Add them to the completion queue from
> > > the enetc_clean_tx_ring() procedure which is common for all kinds of
> > > traffic.
> > > 
> > > We reuse one of the TX rings for XDP (XDP_TX/XDP_REDIRECT) for XSK as
> > > well. They are already cropped from the TX rings that the network stack
> > > can use when XDP is enabled (with or without AF_XDP).
> > > 
> > > As for XDP_REDIRECT calling enetc's ndo_xdp_xmit, I'm not sure if that
> > > can run simultaneously with enetc_poll() (on different CPUs, but towards
> > > the same TXQ). I guess it probably can, but idk what to do about it.
> > > The problem is that enetc_xdp_xmit() sends to
> > > priv->xdp_tx_ring[smp_processor_id()], while enetc_xsk_xmit() and XDP_TX
> > > send to priv->xdp_tx_ring[NAPI instance]. So when the NAPI instance runs
> > 
> > Why not use cpu id on the latter then?
> 
> Hmm, because I want the sendto() syscall to trigger wakeup of the NAPI
> that sends traffic to the proper queue_id, rather than to the queue_id
> affine to the CPU from which the sendto() syscall was made?

Ok, I was referring to the fact that using the CPU id on both sides would
address the concurrency. Regarding your need, what I did for ice was to
assign the xdp_ring to the rx_ring and, within ndo_xsk_wakeup(), pick the
rx_ring based on the queue_id that comes as an arg:

https://lore.kernel.org/bpf/20220822163257.2382487-3-anthony.l.nguyen@intel.com/
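
For readers less familiar with ice, the pattern described above is roughly
the following (a hedged sketch with hypothetical names, not code taken from
either driver):

	/* Hypothetical: each RX ring keeps a pointer to the XDP TX ring it
	 * feeds, assigned once at ring setup time. Both the NAPI poll loop
	 * and ndo_xsk_wakeup() then resolve the TX ring through the RX ring
	 * selected by queue_id, never through smp_processor_id().
	 */
	struct example_rx_ring {
		struct enetc_bdr *xdp_tx_ring;	/* paired at setup */
		struct napi_struct *napi;
	};

	static int example_xsk_wakeup(struct net_device *ndev, u32 queue_id,
				      u32 flags)
	{
		/* example_get_rx_ring() stands in for the driver's own
		 * queue_id -> RX ring lookup.
		 */
		struct example_rx_ring *rx_ring = example_get_rx_ring(ndev, queue_id);

		/* XSK TX frames for this pool are produced from rx_ring's NAPI
		 * context onto rx_ring->xdp_tx_ring, so waking that NAPI is
		 * enough, regardless of which CPU called sendto().
		 */
		if (!napi_if_scheduled_mark_missed(rx_ring->napi))
			napi_schedule(rx_ring->napi);

		return 0;
	}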
Vladimir Oltean March 20, 2023, 4:30 p.m. UTC | #6
Hi Maciej,

On Wed, Feb 08, 2023 at 06:17:52PM +0100, Maciej Fijalkowski wrote:
> Ok, I was referring to the fact that using the CPU id on both sides would
> address the concurrency. Regarding your need, what I did for ice was to
> assign the xdp_ring to the rx_ring and, within ndo_xsk_wakeup(), pick the
> rx_ring based on the queue_id that comes as an arg:
> 
> https://lore.kernel.org/bpf/20220822163257.2382487-3-anthony.l.nguyen@intel.com/

I looked at your response a few times over the past month and, I'm sorry to
say, I don't have enough background in the ice driver to really understand
what you're saying and how it helps in the given situation. If you could
rephrase or explain some more, that would be great.

Patch

diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c
index 3990c006c011..bc0db788afc7 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc.c
@@ -84,7 +84,7 @@  static void enetc_free_tx_swbd(struct enetc_bdr *tx_ring,
 	struct xdp_frame *xdp_frame = enetc_tx_swbd_get_xdp_frame(tx_swbd);
 	struct sk_buff *skb = enetc_tx_swbd_get_skb(tx_swbd);
 
-	if (tx_swbd->dma)
+	if (!tx_swbd->is_xsk && tx_swbd->dma)
 		enetc_unmap_tx_buff(tx_ring, tx_swbd);
 
 	if (xdp_frame) {
@@ -817,7 +817,8 @@  static void enetc_recycle_xdp_tx_buff(struct enetc_bdr *tx_ring,
 	rx_ring->xdp.xdp_tx_in_flight--;
 }
 
-static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget)
+static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget,
+				int *xsk_confirmed)
 {
 	int tx_frm_cnt = 0, tx_byte_cnt = 0, tx_win_drop = 0;
 	struct net_device *ndev = tx_ring->ndev;
@@ -854,7 +855,9 @@  static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget)
 				tx_win_drop++;
 		}
 
-		if (tx_swbd->is_xdp_tx)
+		if (tx_swbd->is_xsk)
+			(*xsk_confirmed)++;
+		else if (tx_swbd->is_xdp_tx)
 			enetc_recycle_xdp_tx_buff(tx_ring, tx_swbd);
 		else if (likely(tx_swbd->dma))
 			enetc_unmap_tx_buff(tx_ring, tx_swbd);
@@ -1465,6 +1468,58 @@  int enetc_xdp_xmit(struct net_device *ndev, int num_frames,
 }
 EXPORT_SYMBOL_GPL(enetc_xdp_xmit);
 
+static void enetc_xsk_map_tx_desc(struct enetc_tx_swbd *tx_swbd,
+				  const struct xdp_desc *xsk_desc,
+				  struct xsk_buff_pool *pool)
+{
+	dma_addr_t dma;
+
+	dma = xsk_buff_raw_get_dma(pool, xsk_desc->addr);
+	xsk_buff_raw_dma_sync_for_device(pool, dma, xsk_desc->len);
+
+	tx_swbd->dma = dma;
+	tx_swbd->len = xsk_desc->len;
+	tx_swbd->is_xsk = true;
+	tx_swbd->is_eof = true;
+}
+
+static bool enetc_xsk_xmit(struct net_device *ndev, struct xsk_buff_pool *pool,
+			   u32 queue_id)
+{
+	struct enetc_ndev_priv *priv = netdev_priv(ndev);
+	struct xdp_desc *xsk_descs = pool->tx_descs;
+	struct enetc_tx_swbd tx_swbd = {0};
+	struct enetc_bdr *tx_ring;
+	u32 budget, batch;
+	int i, k;
+
+	tx_ring = priv->xdp_tx_ring[queue_id];
+
+	/* Shouldn't race with anyone because we are running in the softirq
+	 * of the only CPU that sends packets to this TX ring
+	 */
+	budget = min(enetc_bd_unused(tx_ring) - 1, ENETC_XSK_TX_BATCH);
+
+	batch = xsk_tx_peek_release_desc_batch(pool, budget);
+	if (!batch)
+		return true;
+
+	i = tx_ring->next_to_use;
+
+	for (k = 0; k < batch; k++) {
+		enetc_xsk_map_tx_desc(&tx_swbd, &xsk_descs[k], pool);
+		enetc_xdp_map_tx_buff(tx_ring, i, &tx_swbd, tx_swbd.len);
+		enetc_bdr_idx_inc(tx_ring, &i);
+	}
+
+	tx_ring->next_to_use = i;
+
+	xsk_tx_release(pool);
+	enetc_update_tx_ring_tail(tx_ring);
+
+	return budget != batch;
+}
+
 static void enetc_map_rx_buff_to_xdp(struct enetc_bdr *rx_ring, int i,
 				     struct xdp_buff *xdp_buff, u16 size)
 {
@@ -1881,6 +1936,7 @@  static int enetc_poll(struct napi_struct *napi, int budget)
 	struct enetc_bdr *rx_ring = &v->rx_ring;
 	struct xsk_buff_pool *pool;
 	struct bpf_prog *prog;
+	int xsk_confirmed = 0;
 	bool complete = true;
 	int work_done;
 	int i;
@@ -1888,7 +1944,8 @@  static int enetc_poll(struct napi_struct *napi, int budget)
 	enetc_lock_mdio();
 
 	for (i = 0; i < v->count_tx_rings; i++)
-		if (!enetc_clean_tx_ring(&v->tx_ring[i], budget))
+		if (!enetc_clean_tx_ring(&v->tx_ring[i], budget,
+					 &xsk_confirmed))
 			complete = false;
 
 	prog = rx_ring->xdp.prog;
@@ -1901,6 +1958,17 @@  static int enetc_poll(struct napi_struct *napi, int budget)
 	else
 		work_done = enetc_clean_rx_ring(rx_ring, napi, budget);
 
+	if (pool) {
+		if (xsk_confirmed)
+			xsk_tx_completed(pool, xsk_confirmed);
+
+		if (xsk_uses_need_wakeup(pool))
+			xsk_set_tx_need_wakeup(pool);
+
+		if (!enetc_xsk_xmit(rx_ring->ndev, pool, rx_ring->index))
+			complete = false;
+	}
+
 	if (work_done == budget)
 		complete = false;
 	if (work_done)
@@ -3122,7 +3190,31 @@  static int enetc_setup_xsk_pool(struct net_device *ndev,
 
 int enetc_xsk_wakeup(struct net_device *ndev, u32 queue_id, u32 flags)
 {
-	/* xp_assign_dev() wants this; nothing needed for RX */
+	struct enetc_ndev_priv *priv = netdev_priv(ndev);
+	struct enetc_int_vector *v;
+	struct enetc_bdr *rx_ring;
+
+	if (!netif_running(ndev) || !netif_carrier_ok(ndev))
+		return -ENETDOWN;
+
+	if (queue_id >= priv->bdr_int_num)
+		return -ERANGE;
+
+	v = priv->int_vector[queue_id];
+	rx_ring = &v->rx_ring;
+
+	if (!rx_ring->xdp.xsk_pool || !rx_ring->xdp.prog)
+		return -EINVAL;
+
+	/* No way to kick TX by triggering a hardirq right away =>
+	 * raise softirq. This might schedule NAPI on a CPU different than the
+	 * smp_affinity of its IRQ would suggest, but that would happen anyway
+	 * if, say, we change that affinity under heavy traffic.
+	 * So enetc_poll() has to be prepared for it anyway.
+	 */
+	if (!napi_if_scheduled_mark_missed(&v->napi))
+		napi_schedule(&v->napi);
+
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h
index e1a746e37c9a..403f40473b52 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc.h
+++ b/drivers/net/ethernet/freescale/enetc/enetc.h
@@ -36,6 +36,7 @@  struct enetc_tx_swbd {
 	u8 is_eof:1;
 	u8 is_xdp_tx:1;
 	u8 is_xdp_redirect:1;
+	u8 is_xsk:1;
 	u8 qbv_en:1;
 };
 
@@ -86,6 +87,7 @@  struct enetc_xdp_data {
 #define ENETC_RX_RING_DEFAULT_SIZE	2048
 #define ENETC_TX_RING_DEFAULT_SIZE	2048
 #define ENETC_DEFAULT_TX_WORK		(ENETC_TX_RING_DEFAULT_SIZE / 2)
+#define ENETC_XSK_TX_BATCH		ENETC_DEFAULT_TX_WORK
 
 struct enetc_bdr_resource {
 	/* Input arguments saved for teardown */