diff mbox series

[net-next,07/10] net: enetc: use dedicated TX rings for XDP

Message ID 20210416212225.3576792-8-olteanv@gmail.com (mailing list archive)
State Accepted
Commit 7eab503b11ee1c4bb4a28866a6b029b0bbbeadfe
Delegated to: Netdev Maintainers
Headers show
Series Fixups for XDP on NXP ENETC | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for net-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 5 maintainers not CCed: hawk@kernel.org daniel@iogearbox.net bpf@vger.kernel.org ast@kernel.org john.fastabend@gmail.com
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 3 this patch: 3
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning WARNING: line length of 82 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 3 this patch: 3
netdev/header_inline success Link

Commit Message

Vladimir Oltean April 16, 2021, 9:22 p.m. UTC
From: Vladimir Oltean <vladimir.oltean@nxp.com>

It is possible for one CPU to perform TX hashing (see netdev_pick_tx)
between the 8 ENETC TX rings, and the TX hashing to select TX queue 1.

At the same time, it is possible for the other CPU to already use TX
ring 1 for XDP (either XDP_TX or XDP_REDIRECT). Since there is no mutual
exclusion between XDP and the network stack, we run into an issue
because the ENETC TX procedure is not reentrant.

The obvious approach would be to just make XDP take the lock of the
network stack's TX queue corresponding to the ring it's about to enqueue
in.

For XDP_REDIRECT, this is quite straightforward, a lock at the beginning
and end of enetc_xdp_xmit() should do the trick.

But for XDP_TX, it's a bit more complicated. For one, we do TX batching
all by ourselves for frames with the XDP_TX verdict. This is something
we would like to keep the way it is, for performance reasons. But
batching means that the network stack's lock should be kept from the
first enqueued XDP_TX frame and until we ring the doorbell. That is
mostly fine, except for cases when in the same NAPI loop we have mixed
XDP_TX and XDP_REDIRECT frames. So if enetc_xdp_xmit() gets called while
we are holding the lock from the RX NAPI, then bam, deadlock. The naive
answer could be 'just flush the XDP_TX frames first, then release the
network stack's TX queue lock, then call xdp_do_flush_map()'. But even
xdp_do_redirect() is capable of flushing the batched XDP_REDIRECT
frames, so unless we unlock/relock the TX queue around xdp_do_redirect(),
there simply isn't any clean way to protect XDP_TX from concurrent
network stack .ndo_start_xmit() on another CPU.

So we need to take a different approach, and that is to reserve two
rings for the sole use of XDP. We leave TX rings
0..ndev->real_num_tx_queues-1 to be handled by the network stack, and we
pick them from the end of the priv->tx_ring array.

We make an effort to keep the mapping done by enetc_alloc_msix() which
decides which CPU handles the TX completions of which TX ring in its
NAPI poll. So the XDP TX ring of CPU 0 is handled by TX ring 6, and the
XDP TX ring of CPU 1 is handled by TX ring 7.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
 drivers/net/ethernet/freescale/enetc/enetc.c | 46 +++++++++++++++++---
 drivers/net/ethernet/freescale/enetc/enetc.h |  1 +
 2 files changed, 40 insertions(+), 7 deletions(-)
diff mbox series

Patch

diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c
index c7b940979314..56190d861bb9 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc.c
@@ -9,6 +9,26 @@ 
 #include <linux/ptp_classify.h>
 #include <net/pkt_sched.h>
 
+static int enetc_num_stack_tx_queues(struct enetc_ndev_priv *priv)
+{
+	int num_tx_rings = priv->num_tx_rings;
+	int i;
+
+	for (i = 0; i < priv->num_rx_rings; i++)
+		if (priv->rx_ring[i]->xdp.prog)
+			return num_tx_rings - num_possible_cpus();
+
+	return num_tx_rings;
+}
+
+static struct enetc_bdr *enetc_rx_ring_from_xdp_tx_ring(struct enetc_ndev_priv *priv,
+							struct enetc_bdr *tx_ring)
+{
+	int index = &priv->tx_ring[tx_ring->index] - priv->xdp_tx_ring;
+
+	return priv->rx_ring[index];
+}
+
 static struct sk_buff *enetc_tx_swbd_get_skb(struct enetc_tx_swbd *tx_swbd)
 {
 	if (tx_swbd->is_xdp_tx || tx_swbd->is_xdp_redirect)
@@ -468,7 +488,6 @@  static void enetc_recycle_xdp_tx_buff(struct enetc_bdr *tx_ring,
 				      struct enetc_tx_swbd *tx_swbd)
 {
 	struct enetc_ndev_priv *priv = netdev_priv(tx_ring->ndev);
-	struct enetc_bdr *rx_ring = priv->rx_ring[tx_ring->index];
 	struct enetc_rx_swbd rx_swbd = {
 		.dma = tx_swbd->dma,
 		.page = tx_swbd->page,
@@ -476,6 +495,9 @@  static void enetc_recycle_xdp_tx_buff(struct enetc_bdr *tx_ring,
 		.dir = tx_swbd->dir,
 		.len = tx_swbd->len,
 	};
+	struct enetc_bdr *rx_ring;
+
+	rx_ring = enetc_rx_ring_from_xdp_tx_ring(priv, tx_ring);
 
 	if (likely(enetc_swbd_unused(rx_ring))) {
 		enetc_reuse_page(rx_ring, &rx_swbd);
@@ -1059,7 +1081,7 @@  int enetc_xdp_xmit(struct net_device *ndev, int num_frames,
 	int xdp_tx_bd_cnt, i, k;
 	int xdp_tx_frm_cnt = 0;
 
-	tx_ring = priv->tx_ring[smp_processor_id()];
+	tx_ring = priv->xdp_tx_ring[smp_processor_id()];
 
 	prefetchw(ENETC_TXBD(*tx_ring, tx_ring->next_to_use));
 
@@ -1221,8 +1243,8 @@  static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring,
 	int xdp_tx_bd_cnt, xdp_tx_frm_cnt = 0, xdp_redirect_frm_cnt = 0;
 	struct enetc_tx_swbd xdp_tx_arr[ENETC_MAX_SKB_FRAGS] = {0};
 	struct enetc_ndev_priv *priv = netdev_priv(rx_ring->ndev);
-	struct enetc_bdr *tx_ring = priv->tx_ring[rx_ring->index];
 	int rx_frm_cnt = 0, rx_byte_cnt = 0;
+	struct enetc_bdr *tx_ring;
 	int cleaned_cnt, i;
 	u32 xdp_act;
 
@@ -1280,6 +1302,7 @@  static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring,
 			napi_gro_receive(napi, skb);
 			break;
 		case XDP_TX:
+			tx_ring = priv->xdp_tx_ring[rx_ring->index];
 			xdp_tx_bd_cnt = enetc_rx_swbd_to_xdp_tx_swbd(xdp_tx_arr,
 								     rx_ring,
 								     orig_i, i);
@@ -2022,6 +2045,7 @@  void enetc_start(struct net_device *ndev)
 int enetc_open(struct net_device *ndev)
 {
 	struct enetc_ndev_priv *priv = netdev_priv(ndev);
+	int num_stack_tx_queues;
 	int err;
 
 	err = enetc_setup_irqs(priv);
@@ -2040,7 +2064,9 @@  int enetc_open(struct net_device *ndev)
 	if (err)
 		goto err_alloc_rx;
 
-	err = netif_set_real_num_tx_queues(ndev, priv->num_tx_rings);
+	num_stack_tx_queues = enetc_num_stack_tx_queues(priv);
+
+	err = netif_set_real_num_tx_queues(ndev, num_stack_tx_queues);
 	if (err)
 		goto err_set_queues;
 
@@ -2113,15 +2139,17 @@  static int enetc_setup_tc_mqprio(struct net_device *ndev, void *type_data)
 	struct enetc_ndev_priv *priv = netdev_priv(ndev);
 	struct tc_mqprio_qopt *mqprio = type_data;
 	struct enetc_bdr *tx_ring;
+	int num_stack_tx_queues;
 	u8 num_tc;
 	int i;
 
+	num_stack_tx_queues = enetc_num_stack_tx_queues(priv);
 	mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
 	num_tc = mqprio->num_tc;
 
 	if (!num_tc) {
 		netdev_reset_tc(ndev);
-		netif_set_real_num_tx_queues(ndev, priv->num_tx_rings);
+		netif_set_real_num_tx_queues(ndev, num_stack_tx_queues);
 
 		/* Reset all ring priorities to 0 */
 		for (i = 0; i < priv->num_tx_rings; i++) {
@@ -2133,7 +2161,7 @@  static int enetc_setup_tc_mqprio(struct net_device *ndev, void *type_data)
 	}
 
 	/* Check if we have enough BD rings available to accommodate all TCs */
-	if (num_tc > priv->num_tx_rings) {
+	if (num_tc > num_stack_tx_queues) {
 		netdev_err(ndev, "Max %d traffic classes supported\n",
 			   priv->num_tx_rings);
 		return -EINVAL;
@@ -2421,8 +2449,9 @@  int enetc_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd)
 int enetc_alloc_msix(struct enetc_ndev_priv *priv)
 {
 	struct pci_dev *pdev = priv->si->pdev;
-	int v_tx_rings;
+	int first_xdp_tx_ring;
 	int i, n, err, nvec;
+	int v_tx_rings;
 
 	nvec = ENETC_BDR_INT_BASE_IDX + priv->bdr_int_num;
 	/* allocate MSIX for both messaging and Rx/Tx interrupts */
@@ -2497,6 +2526,9 @@  int enetc_alloc_msix(struct enetc_ndev_priv *priv)
 		}
 	}
 
+	first_xdp_tx_ring = priv->num_tx_rings - num_possible_cpus();
+	priv->xdp_tx_ring = &priv->tx_ring[first_xdp_tx_ring];
+
 	return 0;
 
 fail:
diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h
index 6f818e33e03b..3de71669e317 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc.h
+++ b/drivers/net/ethernet/freescale/enetc/enetc.h
@@ -317,6 +317,7 @@  struct enetc_ndev_priv {
 
 	u32 speed; /* store speed for compare update pspeed */
 
+	struct enetc_bdr **xdp_tx_ring;
 	struct enetc_bdr *tx_ring[16];
 	struct enetc_bdr *rx_ring[16];