[bpf-next,v3,3/3] xsk: build skb by page

Message ID 340f1dfa40416dd966a56e08507daba82d633088.1611236588.git.xuanzhuo@linux.alibaba.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series: xsk: build skb by page

Checks

Context Check Description
netdev/cover_letter success
netdev/fixes_present success
netdev/patch_count success
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success
netdev/cc_maintainers success CCed 16 of 16 maintainers
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 1 this patch: 1
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 135 lines checked
netdev/build_allmodconfig_warn success Errors and warnings before: 1 this patch: 1
netdev/header_inline success
netdev/stable success Stable not CCed

Commit Message

Xuan Zhuo Jan. 21, 2021, 1:47 p.m. UTC
This patch constructs the skb directly from umem pages to avoid the
overhead of copying the data.

The fast path depends on IFF_TX_SKB_NO_LINEAR: only when the network
device sets IFF_TX_SKB_NO_LINEAR in its priv_flags is the skb built
directly from pages. If the device does not support this feature, the
data is still copied into a linear skb.

---------------- Performance Testing ------------

The test environment is Aliyun ECS server.
Test cmd:
```
xdpsock -i eth0 -t  -S -s <msg size>
```
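
Here -t puts xdpsock in transmit-only mode, -S forces the generic/SKB
transmit path that this patch modifies, and -s sets the size of the
transmitted packets.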

Test result data (packets per second):

size    64      512     1024    1500
copy    1916747 1775988 1600203 1440054
page    1974058 1953655 1945463 1904478
percent 3.0%    10.0%   21.58%  32.3%
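
The percent row is the gain of the page path over the copy path; for
example, at 1500 bytes: (1904478 - 1440054) / 1440054 ≈ 32.3%.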

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
---
 net/xdp/xsk.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 86 insertions(+), 18 deletions(-)

Comments

Magnus Karlsson Jan. 21, 2021, 3:17 p.m. UTC | #1
On Thu, Jan 21, 2021 at 2:51 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> This patch is used to construct skb based on page to save memory copy
> overhead.
>
> This function is implemented based on IFF_TX_SKB_NO_LINEAR. Only the
> network card priv_flags supports IFF_TX_SKB_NO_LINEAR will use page to
> directly construct skb. If this feature is not supported, it is still
> necessary to copy data to construct skb.
>
> ---------------- Performance Testing ------------
>
> The test environment is Aliyun ECS server.
> Test cmd:
> ```
> xdpsock -i eth0 -t  -S -s <msg size>
> ```
>
> Test result data:
>
> size    64      512     1024    1500
> copy    1916747 1775988 1600203 1440054
> page    1974058 1953655 1945463 1904478
> percent 3.0%    10.0%   21.58%  32.3%
>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> ---
>  net/xdp/xsk.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 86 insertions(+), 18 deletions(-)

Applied, compiled and tried it out on my NIC that does not support
IFF_TX_SKB_NO_LINEAR and it works fine. Thank you Xuan for all your
efforts. Appreciated.

Now it would be nice if we could get some physical NIC drivers to
support this too. Some probably already do and can just set the bit,
while others need some modifications to support this.
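
For a driver that can already transmit skbs with an empty linear area,
opting in can be as small as setting the flag before registration. A
minimal, hypothetical sketch (the function name is illustrative; the
only real requirement is that the xmit path never assumes
skb_headlen() > 0):

```
/* Hypothetical driver setup snippet: declare that this device
 * accepts skbs whose payload lives entirely in frags (no linear
 * data). Safe only if the xmit path never reads skb->data directly.
 */
static void example_setup(struct net_device *netdev)
{
	netdev->priv_flags |= IFF_TX_SKB_NO_LINEAR;
}
```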

Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>

> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 4a83117..38af7f1 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -430,6 +430,87 @@ static void xsk_destruct_skb(struct sk_buff *skb)
>         sock_wfree(skb);
>  }
>
> +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> +                                             struct xdp_desc *desc)
> +{
> +       u32 len, offset, copy, copied;
> +       struct sk_buff *skb;
> +       struct page *page;
> +       void *buffer;
> +       int err, i;
> +       u64 addr;
> +
> +       skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
> +       if (unlikely(!skb))
> +               return ERR_PTR(err);
> +
> +       addr = desc->addr;
> +       len = desc->len;
> +
> +       buffer = xsk_buff_raw_get_data(xs->pool, addr);
> +       offset = offset_in_page(buffer);
> +       addr = buffer - xs->pool->addrs;
> +
> +       for (copied = 0, i = 0; copied < len; i++) {
> +               page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
> +
> +               get_page(page);
> +
> +               copy = min_t(u32, PAGE_SIZE - offset, len - copied);
> +
> +               skb_fill_page_desc(skb, i, page, offset, copy);
> +
> +               copied += copy;
> +               addr += copy;
> +               offset = 0;
> +       }
> +
> +       skb->len += len;
> +       skb->data_len += len;
> +       skb->truesize += len;
> +
> +       refcount_add(len, &xs->sk.sk_wmem_alloc);
> +
> +       return skb;
> +}
> +
> +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> +                                    struct xdp_desc *desc)
> +{
> +       struct sk_buff *skb;
> +
> +       if (xs->dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
> +               skb = xsk_build_skb_zerocopy(xs, desc);
> +               if (IS_ERR(skb))
> +                       return skb;
> +       } else {
> +               void *buffer;
> +               u32 len;
> +               int err;
> +
> +               len = desc->len;
> +               skb = sock_alloc_send_skb(&xs->sk, len, 1, &err);
> +               if (unlikely(!skb))
> +                       return ERR_PTR(err);
> +
> +               skb_put(skb, len);
> +               buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
> +               err = skb_store_bits(skb, 0, buffer, len);
> +               if (unlikely(err)) {
> +                       kfree_skb(skb);
> +                       return ERR_PTR(err);
> +               }
> +       }
> +
> +       skb->dev = xs->dev;
> +       skb->priority = xs->sk.sk_priority;
> +       skb->mark = xs->sk.sk_mark;
> +       skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
> +       skb->destructor = xsk_destruct_skb;
> +
> +       return skb;
> +}
> +
>  static int xsk_generic_xmit(struct sock *sk)
>  {
>         struct xdp_sock *xs = xdp_sk(sk);
> @@ -446,43 +527,30 @@ static int xsk_generic_xmit(struct sock *sk)
>                 goto out;
>
>         while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
> -               char *buffer;
> -               u64 addr;
> -               u32 len;
> -
>                 if (max_batch-- == 0) {
>                         err = -EAGAIN;
>                         goto out;
>                 }
>
> -               len = desc.len;
> -               skb = sock_alloc_send_skb(sk, len, 1, &err);
> -               if (unlikely(!skb))
> +               skb = xsk_build_skb(xs, &desc);
> +               if (IS_ERR(skb)) {
> +                       err = PTR_ERR(skb);
>                         goto out;
> +               }
>
> -               skb_put(skb, len);
> -               addr = desc.addr;
> -               buffer = xsk_buff_raw_get_data(xs->pool, addr);
> -               err = skb_store_bits(skb, 0, buffer, len);
>                 /* This is the backpressure mechanism for the Tx path.
>                  * Reserve space in the completion queue and only proceed
>                  * if there is space in it. This avoids having to implement
>                  * any buffering in the Tx path.
>                  */
>                 spin_lock_irqsave(&xs->pool->cq_lock, flags);
> -               if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
> +               if (xskq_prod_reserve(xs->pool->cq)) {
>                         spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
>                         kfree_skb(skb);
>                         goto out;
>                 }
>                 spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
>
> -               skb->dev = xs->dev;
> -               skb->priority = sk->sk_priority;
> -               skb->mark = sk->sk_mark;
> -               skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
> -               skb->destructor = xsk_destruct_skb;
> -
>                 err = __dev_direct_xmit(skb, xs->queue_id);
>                 if  (err == NETDEV_TX_BUSY) {
>                         /* Tell user-space to retry the send */
> --
> 1.8.3.1
>
Eric Dumazet Jan. 21, 2021, 3:41 p.m. UTC | #2
On 1/21/21 2:47 PM, Xuan Zhuo wrote:
> This patch is used to construct skb based on page to save memory copy
> overhead.
> 
> This function is implemented based on IFF_TX_SKB_NO_LINEAR. Only the
> network card priv_flags supports IFF_TX_SKB_NO_LINEAR will use page to
> directly construct skb. If this feature is not supported, it is still
> necessary to copy data to construct skb.
> 
> ---------------- Performance Testing ------------
> 
> The test environment is Aliyun ECS server.
> Test cmd:
> ```
> xdpsock -i eth0 -t  -S -s <msg size>
> ```
> 
> Test result data:
> 
> size    64      512     1024    1500
> copy    1916747 1775988 1600203 1440054
> page    1974058 1953655 1945463 1904478
> percent 3.0%    10.0%   21.58%  32.3%
> 
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> ---
>  net/xdp/xsk.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 86 insertions(+), 18 deletions(-)
> 
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 4a83117..38af7f1 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -430,6 +430,87 @@ static void xsk_destruct_skb(struct sk_buff *skb)
>  	sock_wfree(skb);
>  }
>  
> +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> +					      struct xdp_desc *desc)
> +{
> +	u32 len, offset, copy, copied;
> +	struct sk_buff *skb;
> +	struct page *page;
> +	void *buffer;
> +	int err, i;
> +	u64 addr;
> +
> +	skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
> +	if (unlikely(!skb))
> +		return ERR_PTR(err);
> +
> +	addr = desc->addr;
> +	len = desc->len;
> +
> +	buffer = xsk_buff_raw_get_data(xs->pool, addr);
> +	offset = offset_in_page(buffer);
> +	addr = buffer - xs->pool->addrs;
> +
> +	for (copied = 0, i = 0; copied < len; i++) {
> +		page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
> +
> +		get_page(page);
> +
> +		copy = min_t(u32, PAGE_SIZE - offset, len - copied);
> +
> +		skb_fill_page_desc(skb, i, page, offset, copy);
> +
> +		copied += copy;
> +		addr += copy;
> +		offset = 0;
> +	}
> +
> +	skb->len += len;
> +	skb->data_len += len;

> +	skb->truesize += len;

This is not the truesize, unfortunately.

We need to account for the number of pages, not number of bytes.

> +
> +	refcount_add(len, &xs->sk.sk_wmem_alloc);
> +
> +	return skb;
> +}
> +
Alexander Lobakin Jan. 22, 2021, 11:47 a.m. UTC | #3
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 21 Jan 2021 16:41:33 +0100

> On 1/21/21 2:47 PM, Xuan Zhuo wrote:
> > This patch is used to construct skb based on page to save memory copy
> > overhead.
> > 
> > This function is implemented based on IFF_TX_SKB_NO_LINEAR. Only the
> > network card priv_flags supports IFF_TX_SKB_NO_LINEAR will use page to
> > directly construct skb. If this feature is not supported, it is still
> > necessary to copy data to construct skb.
> > 
> > ---------------- Performance Testing ------------
> > 
> > The test environment is Aliyun ECS server.
> > Test cmd:
> > ```
> > xdpsock -i eth0 -t  -S -s <msg size>
> > ```
> > 
> > Test result data:
> > 
> > size    64      512     1024    1500
> > copy    1916747 1775988 1600203 1440054
> > page    1974058 1953655 1945463 1904478
> > percent 3.0%    10.0%   21.58%  32.3%
> > 
> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> > ---
> >  net/xdp/xsk.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++----------
> >  1 file changed, 86 insertions(+), 18 deletions(-)
> > 
> > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > index 4a83117..38af7f1 100644
> > --- a/net/xdp/xsk.c
> > +++ b/net/xdp/xsk.c
> > @@ -430,6 +430,87 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> >  	sock_wfree(skb);
> >  }
> >  
> > +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > +					      struct xdp_desc *desc)
> > +{
> > +	u32 len, offset, copy, copied;
> > +	struct sk_buff *skb;
> > +	struct page *page;
> > +	void *buffer;
> > +	int err, i;
> > +	u64 addr;
> > +
> > +	skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
> > +	if (unlikely(!skb))
> > +		return ERR_PTR(err);
> > +
> > +	addr = desc->addr;
> > +	len = desc->len;
> > +
> > +	buffer = xsk_buff_raw_get_data(xs->pool, addr);
> > +	offset = offset_in_page(buffer);
> > +	addr = buffer - xs->pool->addrs;
> > +
> > +	for (copied = 0, i = 0; copied < len; i++) {
> > +		page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
> > +
> > +		get_page(page);
> > +
> > +		copy = min_t(u32, PAGE_SIZE - offset, len - copied);
> > +
> > +		skb_fill_page_desc(skb, i, page, offset, copy);
> > +
> > +		copied += copy;
> > +		addr += copy;
> > +		offset = 0;
> > +	}
> > +
> > +	skb->len += len;
> > +	skb->data_len += len;
> 
> > +	skb->truesize += len;
> 
> This is not the truesize, unfortunately.
> 
> We need to account for the number of pages, not number of bytes.

The easiest solution is:

	skb->truesize += PAGE_SIZE * i;

i would be equal to skb_shinfo(skb)->nr_frags after exiting the loop.

> > +
> > +	refcount_add(len, &xs->sk.sk_wmem_alloc);
> > +
> > +	return skb;
> > +}
> > +

Al
Alexander Lobakin Jan. 22, 2021, 11:55 a.m. UTC | #4
From: Alexander Lobakin <alobakin@pm.me>
Date: Fri, 22 Jan 2021 11:47:45 +0000

> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Thu, 21 Jan 2021 16:41:33 +0100
> 
> > On 1/21/21 2:47 PM, Xuan Zhuo wrote:
> > > This patch is used to construct skb based on page to save memory copy
> > > overhead.
> > > 
> > > This function is implemented based on IFF_TX_SKB_NO_LINEAR. Only the
> > > network card priv_flags supports IFF_TX_SKB_NO_LINEAR will use page to
> > > directly construct skb. If this feature is not supported, it is still
> > > necessary to copy data to construct skb.
> > > 
> > > ---------------- Performance Testing ------------
> > > 
> > > The test environment is Aliyun ECS server.
> > > Test cmd:
> > > ```
> > > xdpsock -i eth0 -t  -S -s <msg size>
> > > ```
> > > 
> > > Test result data:
> > > 
> > > size    64      512     1024    1500
> > > copy    1916747 1775988 1600203 1440054
> > > page    1974058 1953655 1945463 1904478
> > > percent 3.0%    10.0%   21.58%  32.3%
> > > 
> > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> > > ---
> > >  net/xdp/xsk.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++----------
> > >  1 file changed, 86 insertions(+), 18 deletions(-)
> > > 
> > > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > > index 4a83117..38af7f1 100644
> > > --- a/net/xdp/xsk.c
> > > +++ b/net/xdp/xsk.c
> > > @@ -430,6 +430,87 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> > >  	sock_wfree(skb);
> > >  }
> > >  
> > > +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > > +					      struct xdp_desc *desc)
> > > +{
> > > +	u32 len, offset, copy, copied;
> > > +	struct sk_buff *skb;
> > > +	struct page *page;
> > > +	void *buffer;
> > > +	int err, i;
> > > +	u64 addr;
> > > +
> > > +	skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
> > > +	if (unlikely(!skb))
> > > +		return ERR_PTR(err);
> > > +
> > > +	addr = desc->addr;
> > > +	len = desc->len;
> > > +
> > > +	buffer = xsk_buff_raw_get_data(xs->pool, addr);
> > > +	offset = offset_in_page(buffer);
> > > +	addr = buffer - xs->pool->addrs;
> > > +
> > > +	for (copied = 0, i = 0; copied < len; i++) {
> > > +		page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
> > > +
> > > +		get_page(page);
> > > +
> > > +		copy = min_t(u32, PAGE_SIZE - offset, len - copied);
> > > +
> > > +		skb_fill_page_desc(skb, i, page, offset, copy);
> > > +
> > > +		copied += copy;
> > > +		addr += copy;
> > > +		offset = 0;
> > > +	}
> > > +
> > > +	skb->len += len;
> > > +	skb->data_len += len;
> > 
> > > +	skb->truesize += len;
> > 
> > This is not the truesize, unfortunately.
> > 
> > We need to account for the number of pages, not number of bytes.
> 
> The easiest solution is:
> 
> 	skb->truesize += PAGE_SIZE * i;
> 
> i would be equal to skb_shinfo(skb)->nr_frags after exiting the loop.

Oops, pls ignore this. I forgot that XSK buffers are not
"one per page".
We need to count the number of pages manually and then do

	skb->truesize += PAGE_SIZE * npages;

Right.

> > > +
> > > +	refcount_add(len, &xs->sk.sk_wmem_alloc);
> > > +
> > > +	return skb;
> > > +}
> > > +
> 
> Al

Thanks,
Al
Alexander Lobakin Jan. 22, 2021, 12:08 p.m. UTC | #5
From: Alexander Lobakin <alobakin@pm.me>
Date: Fri, 22 Jan 2021 11:55:35 +0000

> From: Alexander Lobakin <alobakin@pm.me>
> Date: Fri, 22 Jan 2021 11:47:45 +0000
> 
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Thu, 21 Jan 2021 16:41:33 +0100
> > 
> > > On 1/21/21 2:47 PM, Xuan Zhuo wrote:
> > > > This patch is used to construct skb based on page to save memory copy
> > > > overhead.
> > > > 
> > > > This function is implemented based on IFF_TX_SKB_NO_LINEAR. Only the
> > > > network card priv_flags supports IFF_TX_SKB_NO_LINEAR will use page to
> > > > directly construct skb. If this feature is not supported, it is still
> > > > necessary to copy data to construct skb.
> > > > 
> > > > ---------------- Performance Testing ------------
> > > > 
> > > > The test environment is Aliyun ECS server.
> > > > Test cmd:
> > > > ```
> > > > xdpsock -i eth0 -t  -S -s <msg size>
> > > > ```
> > > > 
> > > > Test result data:
> > > > 
> > > > size    64      512     1024    1500
> > > > copy    1916747 1775988 1600203 1440054
> > > > page    1974058 1953655 1945463 1904478
> > > > percent 3.0%    10.0%   21.58%  32.3%
> > > > 
> > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> > > > ---
> > > >  net/xdp/xsk.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++----------
> > > >  1 file changed, 86 insertions(+), 18 deletions(-)
> > > > 
> > > > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > > > index 4a83117..38af7f1 100644
> > > > --- a/net/xdp/xsk.c
> > > > +++ b/net/xdp/xsk.c
> > > > @@ -430,6 +430,87 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> > > >  	sock_wfree(skb);
> > > >  }
> > > >  
> > > > +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > > > +					      struct xdp_desc *desc)
> > > > +{
> > > > +	u32 len, offset, copy, copied;
> > > > +	struct sk_buff *skb;
> > > > +	struct page *page;
> > > > +	void *buffer;
> > > > +	int err, i;
> > > > +	u64 addr;
> > > > +
> > > > +	skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);

Also,
maybe we should allocate it with NET_SKB_PAD so NIC drivers could
use some reserved space?

		skb = sock_alloc_send_skb(&xs->sk, NET_SKB_PAD, 1, &err);
		...
		skb_reserve(skb, NET_SKB_PAD);

Eric, what do you think?

> > > > +	if (unlikely(!skb))
> > > > +		return ERR_PTR(err);
> > > > +
> > > > +	addr = desc->addr;
> > > > +	len = desc->len;
> > > > +
> > > > +	buffer = xsk_buff_raw_get_data(xs->pool, addr);
> > > > +	offset = offset_in_page(buffer);
> > > > +	addr = buffer - xs->pool->addrs;
> > > > +
> > > > +	for (copied = 0, i = 0; copied < len; i++) {
> > > > +		page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
> > > > +
> > > > +		get_page(page);
> > > > +
> > > > +		copy = min_t(u32, PAGE_SIZE - offset, len - copied);
> > > > +
> > > > +		skb_fill_page_desc(skb, i, page, offset, copy);
> > > > +
> > > > +		copied += copy;
> > > > +		addr += copy;
> > > > +		offset = 0;
> > > > +	}
> > > > +
> > > > +	skb->len += len;
> > > > +	skb->data_len += len;
> > > 
> > > > +	skb->truesize += len;
> > > 
> > > This is not the truesize, unfortunately.
> > > 
> > > We need to account for the number of pages, not number of bytes.
> > 
> > The easiest solution is:
> > 
> > 	skb->truesize += PAGE_SIZE * i;
> > 
> > i would be equal to skb_shinfo(skb)->nr_frags after exiting the loop.
> 
> Oops, pls ignore this. I forgot that XSK buffers are not
> "one per page".
> We need to count the number of pages manually and then do
> 
> 	skb->truesize += PAGE_SIZE * npages;
> 
> Right.
> 
> > > > +
> > > > +	refcount_add(len, &xs->sk.sk_wmem_alloc);
> > > > +
> > > > +	return skb;
> > > > +}
> > > > +
> > 
> > Al
> 
> Thanks,
> Al

Al
Magnus Karlsson Jan. 22, 2021, 12:18 p.m. UTC | #6
On Fri, Jan 22, 2021 at 12:57 PM Alexander Lobakin <alobakin@pm.me> wrote:
>
> From: Alexander Lobakin <alobakin@pm.me>
> Date: Fri, 22 Jan 2021 11:47:45 +0000
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Thu, 21 Jan 2021 16:41:33 +0100
> >
> > > On 1/21/21 2:47 PM, Xuan Zhuo wrote:
> > > > This patch is used to construct skb based on page to save memory copy
> > > > overhead.
> > > >
> > > > This function is implemented based on IFF_TX_SKB_NO_LINEAR. Only the
> > > > network card priv_flags supports IFF_TX_SKB_NO_LINEAR will use page to
> > > > directly construct skb. If this feature is not supported, it is still
> > > > necessary to copy data to construct skb.
> > > >
> > > > ---------------- Performance Testing ------------
> > > >
> > > > The test environment is Aliyun ECS server.
> > > > Test cmd:
> > > > ```
> > > > xdpsock -i eth0 -t  -S -s <msg size>
> > > > ```
> > > >
> > > > Test result data:
> > > >
> > > > size    64      512     1024    1500
> > > > copy    1916747 1775988 1600203 1440054
> > > > page    1974058 1953655 1945463 1904478
> > > > percent 3.0%    10.0%   21.58%  32.3%
> > > >
> > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> > > > ---
> > > >  net/xdp/xsk.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++----------
> > > >  1 file changed, 86 insertions(+), 18 deletions(-)
> > > >
> > > > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > > > index 4a83117..38af7f1 100644
> > > > --- a/net/xdp/xsk.c
> > > > +++ b/net/xdp/xsk.c
> > > > @@ -430,6 +430,87 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> > > >   sock_wfree(skb);
> > > >  }
> > > >
> > > > +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > > > +                                       struct xdp_desc *desc)
> > > > +{
> > > > + u32 len, offset, copy, copied;
> > > > + struct sk_buff *skb;
> > > > + struct page *page;
> > > > + void *buffer;
> > > > + int err, i;
> > > > + u64 addr;
> > > > +
> > > > + skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
> > > > + if (unlikely(!skb))
> > > > +         return ERR_PTR(err);
> > > > +
> > > > + addr = desc->addr;
> > > > + len = desc->len;
> > > > +
> > > > + buffer = xsk_buff_raw_get_data(xs->pool, addr);
> > > > + offset = offset_in_page(buffer);
> > > > + addr = buffer - xs->pool->addrs;
> > > > +
> > > > + for (copied = 0, i = 0; copied < len; i++) {
> > > > +         page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
> > > > +
> > > > +         get_page(page);
> > > > +
> > > > +         copy = min_t(u32, PAGE_SIZE - offset, len - copied);
> > > > +
> > > > +         skb_fill_page_desc(skb, i, page, offset, copy);
> > > > +
> > > > +         copied += copy;
> > > > +         addr += copy;
> > > > +         offset = 0;
> > > > + }
> > > > +
> > > > + skb->len += len;
> > > > + skb->data_len += len;
> > >
> > > > + skb->truesize += len;
> > >
> > > This is not the truesize, unfortunately.
> > >
> > > We need to account for the number of pages, not number of bytes.
> >
> > The easiest solution is:
> >
> >       skb->truesize += PAGE_SIZE * i;
> >
> > i would be equal to skb_shinfo(skb)->nr_frags after exiting the loop.
>
> Oops, pls ignore this. I forgot that XSK buffers are not
> "one per page".
> We need to count the number of pages manually and then do
>
>         skb->truesize += PAGE_SIZE * npages;
>
> Right.

There are two possible packet buffer (chunk) sizes in a umem, 2K and
4K on a system with a PAGE_SIZE of 4K. If I remember correctly, and
please correct me if wrong, truesize is used for memory accounting.
But in this code, no kernel memory has been allocated (apart from the
skb). The page is just a part of the umem that has already been
allocated beforehand, by user-space in this case. So what should
truesize be in this case? Do we add 0, chunk_size * i, or the
complicated case of counting exactly how many 4K pages are used
when the chunk_size is 2K, as two chunks could occupy the same page,
or just the upper bound of PAGE_SIZE * i, which is likely a good
approximation in most cases? Just note that there might be other uses
of truesize that I am unaware of that could impact this choice.

> > > > +
> > > > + refcount_add(len, &xs->sk.sk_wmem_alloc);
> > > > +
> > > > + return skb;
> > > > +}
> > > > +
> >
> > Al
>
> Thanks,
> Al
>
Alexander Lobakin Jan. 22, 2021, 12:39 p.m. UTC | #7
From: Magnus Karlsson <magnus.karlsson@gmail.com>
Date: Fri, 22 Jan 2021 13:18:47 +0100

> On Fri, Jan 22, 2021 at 12:57 PM Alexander Lobakin <alobakin@pm.me> wrote:
> >
> > From: Alexander Lobakin <alobakin@pm.me>
> > Date: Fri, 22 Jan 2021 11:47:45 +0000
> >
> > > From: Eric Dumazet <eric.dumazet@gmail.com>
> > > Date: Thu, 21 Jan 2021 16:41:33 +0100
> > >
> > > > On 1/21/21 2:47 PM, Xuan Zhuo wrote:
> > > > > This patch is used to construct skb based on page to save memory copy
> > > > > overhead.
> > > > >
> > > > > This function is implemented based on IFF_TX_SKB_NO_LINEAR. Only the
> > > > > network card priv_flags supports IFF_TX_SKB_NO_LINEAR will use page to
> > > > > directly construct skb. If this feature is not supported, it is still
> > > > > necessary to copy data to construct skb.
> > > > >
> > > > > ---------------- Performance Testing ------------
> > > > >
> > > > > The test environment is Aliyun ECS server.
> > > > > Test cmd:
> > > > > ```
> > > > > xdpsock -i eth0 -t  -S -s <msg size>
> > > > > ```
> > > > >
> > > > > Test result data:
> > > > >
> > > > > size    64      512     1024    1500
> > > > > copy    1916747 1775988 1600203 1440054
> > > > > page    1974058 1953655 1945463 1904478
> > > > > percent 3.0%    10.0%   21.58%  32.3%
> > > > >
> > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> > > > > ---
> > > > >  net/xdp/xsk.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++----------
> > > > >  1 file changed, 86 insertions(+), 18 deletions(-)
> > > > >
> > > > > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > > > > index 4a83117..38af7f1 100644
> > > > > --- a/net/xdp/xsk.c
> > > > > +++ b/net/xdp/xsk.c
> > > > > @@ -430,6 +430,87 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> > > > >   sock_wfree(skb);
> > > > >  }
> > > > >
> > > > > +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > > > > +                                       struct xdp_desc *desc)
> > > > > +{
> > > > > + u32 len, offset, copy, copied;
> > > > > + struct sk_buff *skb;
> > > > > + struct page *page;
> > > > > + void *buffer;
> > > > > + int err, i;
> > > > > + u64 addr;
> > > > > +
> > > > > + skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
> > > > > + if (unlikely(!skb))
> > > > > +         return ERR_PTR(err);
> > > > > +
> > > > > + addr = desc->addr;
> > > > > + len = desc->len;
> > > > > +
> > > > > + buffer = xsk_buff_raw_get_data(xs->pool, addr);
> > > > > + offset = offset_in_page(buffer);
> > > > > + addr = buffer - xs->pool->addrs;
> > > > > +
> > > > > + for (copied = 0, i = 0; copied < len; i++) {
> > > > > +         page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
> > > > > +
> > > > > +         get_page(page);
> > > > > +
> > > > > +         copy = min_t(u32, PAGE_SIZE - offset, len - copied);
> > > > > +
> > > > > +         skb_fill_page_desc(skb, i, page, offset, copy);
> > > > > +
> > > > > +         copied += copy;
> > > > > +         addr += copy;
> > > > > +         offset = 0;
> > > > > + }
> > > > > +
> > > > > + skb->len += len;
> > > > > + skb->data_len += len;
> > > >
> > > > > + skb->truesize += len;
> > > >
> > > > This is not the truesize, unfortunately.
> > > >
> > > > We need to account for the number of pages, not number of bytes.
> > >
> > > The easiest solution is:
> > >
> > >       skb->truesize += PAGE_SIZE * i;
> > >
> > > i would be equal to skb_shinfo(skb)->nr_frags after exiting the loop.
> >
> > Oops, pls ignore this. I forgot that XSK buffers are not
> > "one per page".
> > We need to count the number of pages manually and then do
> >
> >         skb->truesize += PAGE_SIZE * npages;
> >
> > Right.
> 
> There are two possible packet buffer (chunks) sizes in a umem, 2K and
> 4K on a system with a PAGE_SIZE of 4K. If I remember correctly, and
> please correct me if wrong, truesize is used for memory accounting.
> But in this code, no kernel memory has been allocated (apart from the
> skb). The page is just a part of the umem that has been already
> allocated beforehand and by user-space in this case. So what should
> truesize be in this case? Do we add 0, chunk_size * i, or the
> complicated case of counting exactly how many 4K pages that are used
> when the chunk_size is 2K, as two chunks could occupy the same page,
> or just the upper bound of PAGE_SIZE * i that is likely a good
> approximation in most cases? Just note that there might be other uses
> of truesize that I am unaware of that could impact this choice.

Truesize is "what amount of memory does this skb occupy with all its
fragments, linear space and struct sk_buff itself". The closer it
is to the actual value, the better.
In this case, I think adding chunk_size * i would be enough.

(PAGE_SIZE * i can be overwhelming when chunk_size is 2K, especially
for setups with PAGE_SIZE > SZ_4K)

> > > > > +
> > > > > + refcount_add(len, &xs->sk.sk_wmem_alloc);
> > > > > +
> > > > > + return skb;
> > > > > +}
> > > > > +
> > >
> > > Al
> >
> > Thanks,
> > Al

Al
Magnus Karlsson Jan. 22, 2021, 12:55 p.m. UTC | #8
On Fri, Jan 22, 2021 at 1:39 PM Alexander Lobakin <alobakin@pm.me> wrote:
>
> From: Magnus Karlsson <magnus.karlsson@gmail.com>
> Date: Fri, 22 Jan 2021 13:18:47 +0100
>
> > On Fri, Jan 22, 2021 at 12:57 PM Alexander Lobakin <alobakin@pm.me> wrote:
> > >
> > > From: Alexander Lobakin <alobakin@pm.me>
> > > Date: Fri, 22 Jan 2021 11:47:45 +0000
> > >
> > > > From: Eric Dumazet <eric.dumazet@gmail.com>
> > > > Date: Thu, 21 Jan 2021 16:41:33 +0100
> > > >
> > > > > On 1/21/21 2:47 PM, Xuan Zhuo wrote:
> > > > > > This patch is used to construct skb based on page to save memory copy
> > > > > > overhead.
> > > > > >
> > > > > > This function is implemented based on IFF_TX_SKB_NO_LINEAR. Only the
> > > > > > network card priv_flags supports IFF_TX_SKB_NO_LINEAR will use page to
> > > > > > directly construct skb. If this feature is not supported, it is still
> > > > > > necessary to copy data to construct skb.
> > > > > >
> > > > > > ---------------- Performance Testing ------------
> > > > > >
> > > > > > The test environment is Aliyun ECS server.
> > > > > > Test cmd:
> > > > > > ```
> > > > > > xdpsock -i eth0 -t  -S -s <msg size>
> > > > > > ```
> > > > > >
> > > > > > Test result data:
> > > > > >
> > > > > > size    64      512     1024    1500
> > > > > > copy    1916747 1775988 1600203 1440054
> > > > > > page    1974058 1953655 1945463 1904478
> > > > > > percent 3.0%    10.0%   21.58%  32.3%
> > > > > >
> > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> > > > > > ---
> > > > > >  net/xdp/xsk.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++----------
> > > > > >  1 file changed, 86 insertions(+), 18 deletions(-)
> > > > > >
> > > > > > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > > > > > index 4a83117..38af7f1 100644
> > > > > > --- a/net/xdp/xsk.c
> > > > > > +++ b/net/xdp/xsk.c
> > > > > > @@ -430,6 +430,87 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> > > > > >   sock_wfree(skb);
> > > > > >  }
> > > > > >
> > > > > > +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > > > > > +                                       struct xdp_desc *desc)
> > > > > > +{
> > > > > > + u32 len, offset, copy, copied;
> > > > > > + struct sk_buff *skb;
> > > > > > + struct page *page;
> > > > > > + void *buffer;
> > > > > > + int err, i;
> > > > > > + u64 addr;
> > > > > > +
> > > > > > + skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
> > > > > > + if (unlikely(!skb))
> > > > > > +         return ERR_PTR(err);
> > > > > > +
> > > > > > + addr = desc->addr;
> > > > > > + len = desc->len;
> > > > > > +
> > > > > > + buffer = xsk_buff_raw_get_data(xs->pool, addr);
> > > > > > + offset = offset_in_page(buffer);
> > > > > > + addr = buffer - xs->pool->addrs;
> > > > > > +
> > > > > > + for (copied = 0, i = 0; copied < len; i++) {
> > > > > > +         page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
> > > > > > +
> > > > > > +         get_page(page);
> > > > > > +
> > > > > > +         copy = min_t(u32, PAGE_SIZE - offset, len - copied);
> > > > > > +
> > > > > > +         skb_fill_page_desc(skb, i, page, offset, copy);
> > > > > > +
> > > > > > +         copied += copy;
> > > > > > +         addr += copy;
> > > > > > +         offset = 0;
> > > > > > + }
> > > > > > +
> > > > > > + skb->len += len;
> > > > > > + skb->data_len += len;
> > > > >
> > > > > > + skb->truesize += len;
> > > > >
> > > > > This is not the truesize, unfortunately.
> > > > >
> > > > > We need to account for the number of pages, not number of bytes.
> > > >
> > > > The easiest solution is:
> > > >
> > > >       skb->truesize += PAGE_SIZE * i;
> > > >
> > > > i would be equal to skb_shinfo(skb)->nr_frags after exiting the loop.
> > >
> > > Oops, pls ignore this. I forgot that XSK buffers are not
> > > "one per page".
> > > We need to count the number of pages manually and then do
> > >
> > >         skb->truesize += PAGE_SIZE * npages;
> > >
> > > Right.
> >
> > There are two possible packet buffer (chunks) sizes in a umem, 2K and
> > 4K on a system with a PAGE_SIZE of 4K. If I remember correctly, and
> > please correct me if wrong, truesize is used for memory accounting.
> > But in this code, no kernel memory has been allocated (apart from the
> > skb). The page is just a part of the umem that has been already
> > allocated beforehand and by user-space in this case. So what should
> > truesize be in this case? Do we add 0, chunk_size * i, or the
> > complicated case of counting exactly how many 4K pages that are used
> > when the chunk_size is 2K, as two chunks could occupy the same page,
> > or just the upper bound of PAGE_SIZE * i that is likely a good
> > approximation in most cases? Just note that there might be other uses
> > of truesize that I am unaware of that could impact this choice.
>
> Truesize is "what amount of memory does this skb occupy with all its
> fragments, linear space and struct sk_buff itself". The closest it
> will be to the actual value, the better.
> In this case, I think adding of chunk_size * i would be enough.

Sounds like a good approximation to me.

> (PAGE_SIZE * i can be overwhelming when chunk_size is 2K, especially
> for setups with PAGE_SIZE > SZ_4K)

You are right. That would be quite horrible on a system with a page size of 64K.

> > > > > > +
> > > > > > + refcount_add(len, &xs->sk.sk_wmem_alloc);
> > > > > > +
> > > > > > + return skb;
> > > > > > +}
> > > > > > +
> > > >
> > > > Al
> > >
> > > Thanks,
> > > Al
>
> Al
>
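
To summarize, the fix the thread converges on is to charge truesize
per umem chunk rather than per byte. A sketch against the loop in
xsk_build_skb_zerocopy() above (assuming the pool exposes its chunk
size as xs->pool->chunk_size):

```
	/* Account one umem chunk per frag attached in the loop;
	 * after the loop, i == skb_shinfo(skb)->nr_frags. This
	 * replaces the byte-based "skb->truesize += len;".
	 */
	skb->truesize += xs->pool->chunk_size * i;
```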

Patch

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 4a83117..38af7f1 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -430,6 +430,87 @@  static void xsk_destruct_skb(struct sk_buff *skb)
 	sock_wfree(skb);
 }
 
+static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
+					      struct xdp_desc *desc)
+{
+	u32 len, offset, copy, copied;
+	struct sk_buff *skb;
+	struct page *page;
+	void *buffer;
+	int err, i;
+	u64 addr;
+
+	skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
+	if (unlikely(!skb))
+		return ERR_PTR(err);
+
+	addr = desc->addr;
+	len = desc->len;
+
+	buffer = xsk_buff_raw_get_data(xs->pool, addr);
+	offset = offset_in_page(buffer);
+	addr = buffer - xs->pool->addrs;
+
+	for (copied = 0, i = 0; copied < len; i++) {
+		page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
+
+		get_page(page);
+
+		copy = min_t(u32, PAGE_SIZE - offset, len - copied);
+
+		skb_fill_page_desc(skb, i, page, offset, copy);
+
+		copied += copy;
+		addr += copy;
+		offset = 0;
+	}
+
+	skb->len += len;
+	skb->data_len += len;
+	skb->truesize += len;
+
+	refcount_add(len, &xs->sk.sk_wmem_alloc);
+
+	return skb;
+}
+
+static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
+				     struct xdp_desc *desc)
+{
+	struct sk_buff *skb;
+
+	if (xs->dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
+		skb = xsk_build_skb_zerocopy(xs, desc);
+		if (IS_ERR(skb))
+			return skb;
+	} else {
+		void *buffer;
+		u32 len;
+		int err;
+
+		len = desc->len;
+		skb = sock_alloc_send_skb(&xs->sk, len, 1, &err);
+		if (unlikely(!skb))
+			return ERR_PTR(err);
+
+		skb_put(skb, len);
+		buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
+		err = skb_store_bits(skb, 0, buffer, len);
+		if (unlikely(err)) {
+			kfree_skb(skb);
+			return ERR_PTR(err);
+		}
+	}
+
+	skb->dev = xs->dev;
+	skb->priority = xs->sk.sk_priority;
+	skb->mark = xs->sk.sk_mark;
+	skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
+	skb->destructor = xsk_destruct_skb;
+
+	return skb;
+}
+
 static int xsk_generic_xmit(struct sock *sk)
 {
 	struct xdp_sock *xs = xdp_sk(sk);
@@ -446,43 +527,30 @@  static int xsk_generic_xmit(struct sock *sk)
 		goto out;
 
 	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
-		char *buffer;
-		u64 addr;
-		u32 len;
-
 		if (max_batch-- == 0) {
 			err = -EAGAIN;
 			goto out;
 		}
 
-		len = desc.len;
-		skb = sock_alloc_send_skb(sk, len, 1, &err);
-		if (unlikely(!skb))
+		skb = xsk_build_skb(xs, &desc);
+		if (IS_ERR(skb)) {
+			err = PTR_ERR(skb);
 			goto out;
+		}
 
-		skb_put(skb, len);
-		addr = desc.addr;
-		buffer = xsk_buff_raw_get_data(xs->pool, addr);
-		err = skb_store_bits(skb, 0, buffer, len);
 		/* This is the backpressure mechanism for the Tx path.
 		 * Reserve space in the completion queue and only proceed
 		 * if there is space in it. This avoids having to implement
 		 * any buffering in the Tx path.
 		 */
 		spin_lock_irqsave(&xs->pool->cq_lock, flags);
-		if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
+		if (xskq_prod_reserve(xs->pool->cq)) {
 			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
 			kfree_skb(skb);
 			goto out;
 		}
 		spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
 
-		skb->dev = xs->dev;
-		skb->priority = sk->sk_priority;
-		skb->mark = sk->sk_mark;
-		skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
-		skb->destructor = xsk_destruct_skb;
-
 		err = __dev_direct_xmit(skb, xs->queue_id);
 		if  (err == NETDEV_TX_BUSY) {
 			/* Tell user-space to retry the send */