diff mbox series

[RFC,Optimizing,veth,xsk,performance,05/10] veth: use send queue tx napi to xmit xsk tx desc

Message ID 20230803140441.53596-6-huangjie.albert@bytedance.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series [RFC,Optimizing,veth,xsk,performance,01/10] veth: Implement ethtool's get_ringparam() callback | expand

Checks

Context Check Description
netdev/series_format warning Target tree name not specified in the subject
netdev/tree_selection success Guessed tree name to be net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 1328 this patch: 1331
netdev/cc_maintainers success CCed 10 of 10 maintainers
netdev/build_clang success Errors and warnings before: 1351 this patch: 1351
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 1351 this patch: 1354
netdev/checkpatch fail CHECK: Please don't use multiple blank lines ERROR: space required before the open parenthesis '(' WARNING: Block comments should align the * on each line WARNING: Missing a blank line after declarations WARNING: Missing commit description - Add an appropriate one WARNING: braces {} are not necessary for single statement blocks WARNING: line length of 108 exceeds 80 columns WARNING: line length of 113 exceeds 80 columns WARNING: line length of 121 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns WARNING: networking block comments don't use an empty /* line, use /* Comment...
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

黄杰 Aug. 3, 2023, 2:04 p.m. UTC
Signed-off-by: huangjie.albert <huangjie.albert@bytedance.com>
---
 drivers/net/veth.c | 265 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 264 insertions(+), 1 deletion(-)

Comments

Simon Horman Aug. 4, 2023, 8:59 p.m. UTC | #1
On Thu, Aug 03, 2023 at 10:04:31PM +0800, huangjie.albert wrote:

Please include a patch description.

> Signed-off-by: huangjie.albert <huangjie.albert@bytedance.com>

Please consider formatting this as:

	... Albert Huang <huangjie.albert@bytedance.com>

> ---
>  drivers/net/veth.c | 265 ++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 264 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> index 63c3ebe4c5d0..944761807ca4 100644
> --- a/drivers/net/veth.c
> +++ b/drivers/net/veth.c
> @@ -27,6 +27,8 @@
>  #include <linux/bpf_trace.h>
>  #include <linux/net_tstamp.h>
>  #include <net/page_pool.h>
> +#include <net/xdp_sock_drv.h>
> +#include <net/xdp.h>
>  
>  #define DRV_NAME	"veth"
>  #define DRV_VERSION	"1.0"

> @@ -1061,6 +1063,176 @@ static int veth_poll(struct napi_struct *napi, int budget)
>  	return done;
>  }
>  
> +static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool, int budget)
> +{
> +	struct veth_priv *priv, *peer_priv;
> +	struct net_device *dev, *peer_dev;
> +	struct veth_rq *peer_rq;
> +	struct veth_stats peer_stats = {};
> +	struct veth_stats stats = {};
> +	struct veth_xdp_tx_bq bq;
> +	struct xdp_desc desc;
> +	void *xdpf;
> +	int done = 0;

Please try to use reverse xmas tree ordering - longest line to shortest -
for local variable declarations in new Networking code.

https://github.com/ecree-solarflare/xmastree is your friend here.

> +
> +	bq.count = 0;
> +	dev = sq->dev;
> +	priv = netdev_priv(dev);
> +	peer_dev = priv->peer;

Sparse seems a bit unhappy about this.

  .../veth.c:1081:18: warning: incorrect type in assignment (different address spaces)
  .../veth.c:1081:18:    expected struct net_device *peer_dev
  .../veth.c:1081:18:    got struct net_device [noderef] __rcu *peer

Looking over existing code in this file, perhaps this is appropriate:

	peer_dev = rtnl_dereference(priv->peer);

Likewise in a few other places in this patch.

...
diff mbox series

Patch

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 63c3ebe4c5d0..944761807ca4 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -27,6 +27,8 @@ 
 #include <linux/bpf_trace.h>
 #include <linux/net_tstamp.h>
 #include <net/page_pool.h>
+#include <net/xdp_sock_drv.h>
+#include <net/xdp.h>
 
 #define DRV_NAME	"veth"
 #define DRV_VERSION	"1.0"
@@ -1061,6 +1063,176 @@  static int veth_poll(struct napi_struct *napi, int budget)
 	return done;
 }
 
+/* Copy up to @budget AF_XDP TX descriptors of @sq into fresh pages and hand
+ * them to the peer rq with the same queue index. The caller must hold
+ * rcu_read_lock(). Returns the number of frames handed to the peer.
+ */
+static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool, int budget)
+{
+	struct veth_priv *priv, *peer_priv;
+	struct net_device *dev, *peer_dev;
+	struct veth_stats peer_stats = {};
+	struct veth_stats stats = {};
+	struct veth_xdp_tx_bq bq;
+	struct veth_rq *peer_rq;
+	struct xdp_desc desc;
+	int done = 0;
+	void *xdpf;
+
+	bq.count = 0;
+	dev = sq->dev;
+	priv = netdev_priv(dev);
+	/* priv->peer is an __rcu pointer and may be NULL */
+	peer_dev = rcu_dereference(priv->peer);
+	if (unlikely(!peer_dev))
+		return 0;
+	peer_priv = netdev_priv(peer_dev);
+
+	/* todo: queue index must set before this */
+	peer_rq = &peer_priv->rq[sq->queue_index];
+
+	/* set xsk wake up flag, to do: where to disable */
+	if (xsk_uses_need_wakeup(xsk_pool))
+		xsk_set_tx_need_wakeup(xsk_pool);
+
+	while (budget-- > 0) {
+		unsigned int truesize = 0;
+		struct xdp_frame *p_frame;
+		struct page *page;
+		void *new_addr;
+		void *addr;
+
+		/* get a desc from xsk pool */
+		if (!xsk_tx_peek_desc(xsk_pool, &desc))
+			break;
+
+		/* desc.addr is an offset, convert it to a virtual address */
+		addr = xsk_buff_raw_get_data(xsk_pool, desc.addr);
+
+		/* frame header + payload + shared info must fit in one page */
+		truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) +
+			   desc.len + sizeof(struct xdp_frame);
+		if (truesize > PAGE_SIZE) {
+			stats.xdp_drops++;
+			xsk_tx_completed_addr(xsk_pool, desc.addr);
+			continue;
+		}
+
+		page = dev_alloc_page();
+		if (!page) {
+			/* no page: complete the desc and count it as a drop */
+			xsk_tx_completed_addr(xsk_pool, desc.addr);
+			stats.xdp_drops++;
+			break;
+		}
+		new_addr = page_to_virt(page);
+
+		p_frame = new_addr;
+		new_addr += sizeof(struct xdp_frame);
+		p_frame->data = new_addr;
+		p_frame->len = desc.len;
+
+		/* frame_sz is set to the page size because struct
+		 * skb_shared_info is large: if an skb is built in
+		 * veth_xdp_rcv_one, skb->tail may otherwise get larger than
+		 * skb->end, which could trigger a skb_panic
+		 */
+		p_frame->headroom = 0;
+		p_frame->metasize = 0;
+		p_frame->frame_sz = PAGE_SIZE;
+		p_frame->flags = 0;
+		p_frame->mem.type = MEM_TYPE_PAGE_SHARED;
+		memcpy(p_frame->data, addr, p_frame->len);
+		xsk_tx_completed_addr(xsk_pool, desc.addr);
+
+		/* if the peer has an xdp prog, just send the frame to it */
+		p_frame = veth_xdp_rcv_one(peer_rq, p_frame, &bq, &peer_stats);
+		/* frame handed back to us: convert it to an skb to xmit */
+		if (p_frame) {
+			xdpf = p_frame;
+			veth_xdp_rcv_bulk_skb(peer_rq, &xdpf, 1, &bq, &peer_stats);
+		}
+
+		stats.xdp_bytes += desc.len;
+
+		done++;
+	}
+
+	/* release: move the consumer and wake up the producer whenever
+	 * descriptors were taken from the ring, including dropped ones
+	 */
+	if (done || stats.xdp_drops)
+		xsk_tx_release(xsk_pool);
+	if (done)
+		napi_schedule(&peer_rq->xdp_napi);
+
+	/* just for peer rq */
+	if (peer_stats.xdp_tx > 0)
+		veth_xdp_flush(peer_rq, &bq);
+	if (peer_stats.xdp_redirect > 0)
+		xdp_do_flush();
+
+	/* update peer rq stats, or maybe we do not need to do this */
+	u64_stats_update_begin(&peer_rq->stats.syncp);
+	peer_rq->stats.vs.xdp_redirect += peer_stats.xdp_redirect;
+	peer_rq->stats.vs.xdp_packets += done;
+	peer_rq->stats.vs.xdp_bytes += stats.xdp_bytes;
+	peer_rq->stats.vs.xdp_drops += peer_stats.xdp_drops;
+	peer_rq->stats.vs.rx_drops += peer_stats.rx_drops;
+	peer_rq->stats.vs.xdp_tx += peer_stats.xdp_tx;
+	u64_stats_update_end(&peer_rq->stats.syncp);
+
+	/* update sq stats */
+	u64_stats_update_begin(&sq->stats.syncp);
+	sq->stats.vs.xdp_packets += done;
+	sq->stats.vs.xdp_bytes += stats.xdp_bytes;
+	sq->stats.vs.xdp_drops += stats.xdp_drops;
+	u64_stats_update_end(&sq->stats.syncp);
+
+	return done;
+}
+
+/* NAPI poll handler of a send queue: xmit the pending xsk tx descriptors */
+static int veth_poll_tx(struct napi_struct *napi, int budget)
+{
+	struct veth_sq *sq = container_of(napi, struct veth_sq, xdp_napi);
+	struct xsk_buff_pool *xsk_pool;
+	int sent = 0;
+
+	xdp_set_return_frame_no_direct();
+
+	sq->xsk.last_cpu = smp_processor_id();
+
+	/* the pool pointer is read under rcu; without a pool nothing is sent */
+	rcu_read_lock();
+	xsk_pool = rcu_dereference(sq->xsk.pool);
+	if (xsk_pool)
+		sent = veth_xsk_tx_xmit(sq, xsk_pool, budget);
+	rcu_read_unlock();
+
+	/* budget not used up, so complete this napi round */
+	if (sent < budget)
+		napi_complete_done(napi, sent);
+
+	xdp_clear_return_frame_no_direct();
+
+	return sent;
+}
+
+
+/* Add and enable the tx napi of each send queue; always returns 0 */
+static int veth_napi_add_tx(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	int qid;
+
+	for (qid = 0; qid < dev->real_num_rx_queues; qid++) {
+		netif_napi_add(dev, &priv->sq[qid].xdp_napi, veth_poll_tx);
+		napi_enable(&priv->sq[qid].xdp_napi);
+	}
+
+	return 0;
+}
+
 static int veth_create_page_pool(struct veth_rq *rq)
 {
 	struct page_pool_params pp_params = {
@@ -1153,6 +1325,19 @@  static void veth_napi_del_range(struct net_device *dev, int start, int end)
 	}
 }
 
+/* Disable and delete every tx napi, then wait one RCU grace period */
+static void veth_napi_del_tx(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	int i;
+
+	for (i = 0; i < dev->real_num_rx_queues; i++) {
+		napi_disable(&priv->sq[i].xdp_napi);
+		__netif_napi_del(&priv->sq[i].xdp_napi);
+	}
+	synchronize_net();
+}
+
 static void veth_napi_del(struct net_device *dev)
 {
 	veth_napi_del_range(dev, 0, dev->real_num_rx_queues);
@@ -1360,7 +1545,7 @@  static void veth_set_xdp_features(struct net_device *dev)
 		struct veth_priv *priv_peer = netdev_priv(peer);
 		xdp_features_t val = NETDEV_XDP_ACT_BASIC |
 				     NETDEV_XDP_ACT_REDIRECT |
-				     NETDEV_XDP_ACT_RX_SG;
+				     NETDEV_XDP_ACT_RX_SG | NETDEV_XDP_ACT_XSK_ZEROCOPY;
 
 		if (priv_peer->_xdp_prog || veth_gro_requested(peer))
 			val |= NETDEV_XDP_ACT_NDO_XMIT |
@@ -1737,11 +1922,89 @@  static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 	return err;
 }
 
+/* Attach @pool to send queue @qid of @dev; runs under rtnl_lock.
+ * Returns 0 on success or a negative errno.
+ */
+static int veth_xsk_pool_enable(struct net_device *dev, struct xsk_buff_pool *pool, u16 qid)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	struct veth_priv *peer_priv;
+	struct net_device *peer_dev;
+	int err;
+
+	if (qid >= dev->real_num_tx_queues)
+		return -EINVAL;
+
+	/* priv->peer is an __rcu pointer, read here under rtnl_lock */
+	peer_dev = rtnl_dereference(priv->peer);
+	if (!peer_dev)
+		return -EINVAL;
+
+	/* no dma, so skip the dma check in xsk zero copy */
+	pool->dma_check_skip = true;
+
+	peer_priv = netdev_priv(peer_dev);
+	/* Enable napi on the peer unless it has an xdp prog or gro
+	 * requested (presumably its napi is already running then).
+	 * to do: we need to check whether this side already has xdp
+	 * enabled, maybe it does not have an xdp prog
+	 */
+	if (!peer_priv->_xdp_prog && !veth_gro_requested(peer_dev)) {
+		err = veth_napi_enable(peer_dev);
+		if (err)
+			return err;
+	}
+
+	/* rtnl_lock is held here, so rcu_assign_pointer is safe */
+	rcu_assign_pointer(priv->sq[qid].xsk.pool, pool);
+
+	/* NOTE(review): this adds the napi of every queue on each enable */
+	return veth_napi_add_tx(dev);
+}
+
+/* Detach the xsk pool from send queue @qid of @dev and remove the tx napi.
+ * Returns 0, or -EINVAL for a bad @qid or a missing peer.
+ */
+static int veth_xsk_pool_disable(struct net_device *dev, u16 qid)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	struct veth_priv *peer_priv;
+	struct net_device *peer_dev;
+
+	if (qid >= dev->real_num_tx_queues)
+		return -EINVAL;
+
+	/* priv->peer is __rcu; NOTE(review): assumes rtnl_lock is held */
+	peer_dev = rtnl_dereference(priv->peer);
+	if (!peer_dev)
+		return -EINVAL;
+
+	peer_priv = netdev_priv(peer_dev);
+	/* to do: this may be failed */
+	if (!peer_priv->_xdp_prog && !veth_gro_requested(peer_dev))
+		veth_napi_del(peer_dev);
+
+	veth_napi_del_tx(dev);
+	rcu_assign_pointer(priv->sq[qid].xsk.pool, NULL);
+
+	return 0;
+}
+
+/* XDP_SETUP_XSK_POOL handler: a pool enables the queue, NULL disables it */
+static int veth_xsk_pool_setup(struct net_device *dev, struct netdev_bpf *xdp)
+{
+	struct xsk_buff_pool *pool = xdp->xsk.pool;
+
+	return pool ? veth_xsk_pool_enable(dev, pool, xdp->xsk.queue_id) :
+		      veth_xsk_pool_disable(dev, xdp->xsk.queue_id);
+}
+
 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 {
 	switch (xdp->command) {
 	case XDP_SETUP_PROG:
 		return veth_xdp_set(dev, xdp->prog, xdp->extack);
+	case XDP_SETUP_XSK_POOL:
+		return veth_xsk_pool_setup(dev, xdp);
 	default:
 		return -EINVAL;
 	}