diff mbox series

[RFC,Optimizing,veth,xsk,performance,09/10] veth: support zero copy for af xdp

Message ID 20230803140441.53596-10-huangjie.albert@bytedance.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series [RFC,Optimizing,veth,xsk,performance,01/10] veth: Implement ethtool's get_ringparam() callback | expand

Checks

Context Check Description
netdev/series_format warning Target tree name not specified in the subject
netdev/tree_selection success Guessed tree name to be net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1331 this patch: 1331
netdev/cc_maintainers success CCed 10 of 10 maintainers
netdev/build_clang success Errors and warnings before: 1351 this patch: 1351
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1354 this patch: 1354
netdev/checkpatch fail CHECK: Alignment should match open parenthesis CHECK: Unnecessary parentheses around 'xsk_headroom < sizeof(struct xdp_frame)' ERROR: space required before the open parenthesis '(' ERROR: that open brace { should be on the previous line WARNING: 'Signed-off-by:' is the preferred signature form WARNING: Block comments should align the * on each line WARNING: line length of 102 exceeds 80 columns WARNING: line length of 107 exceeds 80 columns WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns WARNING: line length of 97 exceeds 80 columns WARNING: line length of 98 exceeds 80 columns WARNING: line length of 99 exceeds 80 columns WARNING: networking block comments don't use an empty /* line, use /* Comment... WARNING: printk() should include KERN_<LEVEL> facility level WARNING: unnecessary cast may hide bugs, see http://c-faq.com/malloc/mallocnocast.html
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline fail Was 0 now: 1

Commit Message

黄杰 Aug. 3, 2023, 2:04 p.m. UTC
The following conditions must be satisfied to achieve zero copy:
1. The tx desc has enough headroom to store the xdp_frame and skb_shared_info.
2. The memory region pointed to by the tx desc lies within a single page.

Tested zero copy with libxdp.
Performance:
                      | MSS (bytes) | Packet rate (PPS)
AF_XDP                | 1300        | 480K
AF_XDP with zero copy | 1300        | 540K

Signed-off-by: huangjie.albert <huangjie.albert@bytedance.com>
---
 drivers/net/veth.c | 207 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 178 insertions(+), 29 deletions(-)

Comments

Simon Horman Aug. 4, 2023, 9:05 p.m. UTC | #1
On Thu, Aug 03, 2023 at 10:04:35PM +0800, huangjie.albert wrote:

...

> +static struct sk_buff *veth_build_skb_zerocopy(struct net_device *dev, struct xsk_buff_pool *pool,
> +					      struct xdp_desc *desc)
> +{
> +	struct veth_seg_info *seg_info;
> +	struct sk_buff *skb;
> +	struct page *page;
> +	void *hard_start;
> +	u32 len, ts;
> +	void *buffer;
> +	int headroom;
> +	u64 addr;
> +	u32 index;
> +
> +	addr = desc->addr;
> +	len = desc->len;
> +	buffer = xsk_buff_raw_get_data(pool, addr);
> +	ts = pool->unaligned ? len : pool->chunk_size;
> +
> +	headroom = offset_in_page(buffer);
> +
> +	/* offset in umem pool buffer */
> +	addr = buffer - pool->addrs;
> +
> +	/* get the page of the desc */
> +	page = pool->umem->pgs[addr >> PAGE_SHIFT];
> +
> +	/* in order to avoid to get freed by kfree_skb */
> +	get_page(page);
> +
> +	hard_start = page_to_virt(page);
> +
> +	skb = veth_build_skb(hard_start, headroom, len, ts);
> +	seg_info = (struct veth_seg_info *)kmalloc(struct_size(seg_info, desc, MAX_SKB_FRAGS), GFP_KERNEL);

There is no need to explicitly cast the return value of kmalloc,
as it returns void *.

	seg_info = kmalloc(struct_size(seg_info, desc, MAX_SKB_FRAGS),
			   GFP_KERNEL);

...
diff mbox series

Patch

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 600225e27e9e..e4f1a8345f42 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -103,6 +103,11 @@  struct veth_xdp_tx_bq {
 	unsigned int count;
 };
 
+struct veth_seg_info {	/* completion record attached to a zero-copy xsk tx skb */
+	u32 segs;	/* number of valid entries in desc[] */
+	u64 desc[] ____cacheline_aligned_in_smp;	/* umem addresses passed to xsk_tx_completed_addr() when the skb is destructed */
+};
+
 /*
  * ethtool interface
  */
@@ -645,6 +650,100 @@  static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp,
 	return 0;
 }
 
+/* Wrap an existing buffer in an skb without copying it.
+ *
+ * @head:     start of the buffer handed to build_skb()
+ * @headroom: bytes to reserve in front of the packet data
+ * @len:      bytes of packet data already present after the headroom
+ * @buflen:   buffer size passed to build_skb()
+ *
+ * Returns the new skb, or NULL when build_skb() fails.
+ */
+static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
+				      int buflen)
+{
+	struct sk_buff *new_skb = build_skb(head, buflen);
+
+	if (new_skb) {
+		skb_reserve(new_skb, headroom);
+		skb_put(new_skb, len);
+	}
+
+	return new_skb;
+}
+
+/* skb destructor for zero-copy AF_XDP tx skbs.
+ *
+ * Hands every umem address recorded in the skb's veth_seg_info to
+ * xsk_tx_completed_addr() under the pool's cq_lock, then frees the record
+ * and clears both destructor arguments stored in the skb shared info.
+ */
+static void veth_xsk_destruct_skb(struct sk_buff *skb)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	struct veth_seg_info *seg_info = shinfo->destructor_arg;
+	struct xsk_buff_pool *pool = shinfo->destructor_arg_xsk_pool;
+	unsigned long flags;
+	u32 index;
+
+	/* release cq */
+	spin_lock_irqsave(&pool->cq_lock, flags);
+	/* desc[] already stores u64 addresses; pass them through unchanged.
+	 * Casting via (long) would truncate them on 32-bit builds.
+	 */
+	for (index = 0; index < seg_info->segs; index++)
+		xsk_tx_completed_addr(pool, seg_info->desc[index]);
+	spin_unlock_irqrestore(&pool->cq_lock, flags);
+
+	kfree(seg_info);
+	shinfo->destructor_arg = NULL;
+	shinfo->destructor_arg_xsk_pool = NULL;
+}
+
+/* Build an skb directly on top of the umem page backing an AF_XDP tx
+ * descriptor, without copying the payload.
+ *
+ * @dev:  device the skb is attributed to (skb->dev and eth_type_trans())
+ * @pool: xsk buffer pool that @desc belongs to
+ * @desc: tx descriptor to transmit
+ *
+ * A veth_seg_info recording desc->addr is attached to the skb together with
+ * veth_xsk_destruct_skb() as destructor, so the descriptor is completed when
+ * the skb is freed.
+ *
+ * Returns the skb, or NULL on allocation failure. On failure no page
+ * reference is left held and nothing has been completed for @desc; the
+ * caller remains responsible for completing the descriptor.
+ */
+static struct sk_buff *veth_build_skb_zerocopy(struct net_device *dev, struct xsk_buff_pool *pool,
+					      struct xdp_desc *desc)
+{
+	struct veth_seg_info *seg_info;
+	struct sk_buff *skb;
+	struct page *page;
+	void *hard_start;
+	u32 len, ts;
+	void *buffer;
+	int headroom;
+	u64 addr;
+	u32 index;
+
+	addr = desc->addr;
+	len = desc->len;
+	buffer = xsk_buff_raw_get_data(pool, addr);
+	ts = pool->unaligned ? len : pool->chunk_size;
+
+	headroom = offset_in_page(buffer);
+
+	/* offset in umem pool buffer */
+	addr = buffer - pool->addrs;
+
+	/* get the page of the desc */
+	page = pool->umem->pgs[addr >> PAGE_SHIFT];
+
+	/* Allocate the completion record before touching the page so that a
+	 * failure here needs no unwinding. GFP_ATOMIC because the visible
+	 * caller feeds the skb straight to napi_gro_receive(), so this
+	 * presumably runs in softirq context - TODO confirm.
+	 */
+	seg_info = kmalloc(struct_size(seg_info, desc, MAX_SKB_FRAGS),
+			   GFP_ATOMIC);
+	if (!seg_info)
+		return NULL;
+
+	/* in order to avoid to get freed by kfree_skb */
+	get_page(page);
+
+	hard_start = page_to_virt(page);
+
+	/* NOTE(review): buflen is ts while head is the start of the page and
+	 * headroom is the offset inside that page; verify that
+	 * headroom + len always fits in ts for chunks that do not start at
+	 * the page boundary.
+	 */
+	skb = veth_build_skb(hard_start, headroom, len, ts);
+	if (!skb) {
+		/* drop the reference taken above and the unused record */
+		put_page(page);
+		kfree(seg_info);
+		return NULL;
+	}
+
+	/* later we will support gso for this */
+	index = skb_shinfo(skb)->gso_segs;
+	seg_info->desc[index] = desc->addr;
+	seg_info->segs = ++index;
+
+	skb->truesize += ts;
+	skb->dev = dev;
+	skb_shinfo(skb)->destructor_arg = seg_info;
+	skb_shinfo(skb)->destructor_arg_xsk_pool = pool;
+	skb->destructor = veth_xsk_destruct_skb;
+
+	/* set the mac header */
+	skb->protocol = eth_type_trans(skb, dev);
+
+	/* to do, add skb to sock. may be there is no need to do for this
+	 * refcount_add(ts, &xs->sk.sk_wmem_alloc);
+	 */
+	return skb;
+}
+
 static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
 					  struct xdp_frame *frame,
 					  struct veth_xdp_tx_bq *bq,
@@ -1063,6 +1162,20 @@  static int veth_poll(struct napi_struct *napi, int budget)
 	return done;
 }
 
+/* Return true when the @len bytes starting at @buffer do not cross a page
+ * boundary, i.e. they fit between @buffer's offset in its page and the end
+ * of that page.
+ */
+static bool buffer_in_page(void *buffer, u32 len)
+{
+	u32 offset = offset_in_page(buffer);
+
+	return PAGE_SIZE - offset >= len;
+}
+
 static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool, int budget)
 {
 	struct veth_priv *priv, *peer_priv;
@@ -1073,6 +1186,9 @@  static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
 	struct veth_xdp_tx_bq bq;
 	struct xdp_desc desc;
 	void *xdpf;
+	struct sk_buff *skb = NULL;
+	bool zc = xsk_pool->umem->zc;
+	u32 xsk_headroom = xsk_pool->headroom;
 	int done = 0;
 
 	bq.count = 0;
@@ -1102,12 +1218,6 @@  static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
 			break;
 		}
 
-		/*
-		* Get a xmit addr
-		* desc.addr is a offset, so we should to convert to real virtual address
-		*/
-		addr = xsk_buff_raw_get_data(xsk_pool, desc.addr);
-
 		/* can not hold all data in a page */
 		truesize =  SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + desc.len + sizeof(struct xdp_frame);
 		if (truesize > PAGE_SIZE) {
@@ -1116,16 +1226,39 @@  static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
 			continue;
 		}
 
-		page = dev_alloc_page();
-		if (!page) {
-			/*
-			* error , release xdp frame and increase drops
-			*/
-			xsk_tx_completed_addr(xsk_pool, desc.addr);
-			stats.xdp_drops++;
-			break;
+		/*
+		* Get a xmit addr
+		* desc.addr is a offset, so we should to convert to real virtual address
+		*/
+		addr = xsk_buff_raw_get_data(xsk_pool, desc.addr);
+
+		/*
+		 * in order to support zero copy, headroom must have enough space to hold xdp_frame
+		 */
+		if (zc && (xsk_headroom < sizeof(struct xdp_frame)))
+			zc = false;
+
+		/*
+		 * if desc not contain in a page, also do not support zero copy
+		*/
+		if (!buffer_in_page(addr, desc.len))
+			zc = false;
+
+		if (zc) {
+			/* headroom is reserved for xdp_frame */
+			new_addr = addr - sizeof(struct xdp_frame);
+		} else {
+			page = dev_alloc_page();
+			if (!page) {
+				/*
+				* error , release xdp frame and increase drops
+				*/
+				xsk_tx_completed_addr(xsk_pool, desc.addr);
+				stats.xdp_drops++;
+				break;
+			}
+			new_addr = page_to_virt(page);
 		}
-		new_addr = page_to_virt(page);
 
 		p_frame = new_addr;
 		new_addr += sizeof(struct xdp_frame);
@@ -1137,19 +1270,37 @@  static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
 		 */
 		p_frame->headroom = 0;
 		p_frame->metasize = 0;
-		p_frame->frame_sz = PAGE_SIZE;
 		p_frame->flags = 0;
-		p_frame->mem.type = MEM_TYPE_PAGE_SHARED;
-		memcpy(p_frame->data, addr, p_frame->len);
-		xsk_tx_completed_addr(xsk_pool, desc.addr);
-
-		/* if peer have xdp prog, if it has ,just send to peer */
-		p_frame = veth_xdp_rcv_one(peer_rq, p_frame, &bq, &peer_stats);
-		/* if no xdp with this queue, convert to skb to xmit*/
-		if (p_frame) {
-			xdpf = p_frame;
-			veth_xdp_rcv_bulk_skb(peer_rq, &xdpf, 1, &bq, &peer_stats);
-			p_frame = NULL;
+
+		if (zc) {
+			p_frame->frame_sz = xsk_pool->frame_len;
+			/* to do: if there is a xdp, how to recycle the tx desc */
+			p_frame->mem.type = MEM_TYPE_XSK_BUFF_POOL_TX;
+			/* no need to copy address for af+xdp */
+			p_frame = veth_xdp_rcv_one(peer_rq, p_frame, &bq, &peer_stats);
+			if (p_frame) {
+				skb = veth_build_skb_zerocopy(peer_dev, xsk_pool, &desc);
+				if (skb) {
+					napi_gro_receive(&peer_rq->xdp_napi, skb);
+					skb = NULL;
+				} else {
+					xsk_tx_completed_addr(xsk_pool, desc.addr);
+				}
+			}
+		} else {
+			p_frame->frame_sz = PAGE_SIZE;
+			p_frame->mem.type = MEM_TYPE_PAGE_SHARED;
+			memcpy(p_frame->data, addr, p_frame->len);
+			xsk_tx_completed_addr(xsk_pool, desc.addr);
+
+			/* if peer have xdp prog, if it has ,just send to peer */
+			p_frame = veth_xdp_rcv_one(peer_rq, p_frame, &bq, &peer_stats);
+			/* if no xdp with this queue, convert to skb to xmit*/
+			if (p_frame) {
+				xdpf = p_frame;
+				veth_xdp_rcv_bulk_skb(peer_rq, &xdpf, 1, &bq, &peer_stats);
+				p_frame = NULL;
+			}
 		}
 
 		stats.xdp_bytes += desc.len;
@@ -1163,8 +1314,6 @@  static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
 		xsk_tx_release(xsk_pool);
 	}
 
-
-
 	/* just for peer rq */
 	if (peer_stats.xdp_tx > 0)
 		veth_xdp_flush(peer_rq, &bq);